Change utf8::to_next/to_previous so that they are more symmetrical

The previous implementation could yield different positions when
iterating forward and backward, leading to confusion in boost regex.

This makes an existing problem a bit more visible: iterating with
to_next and with read_codepoint won't behave the same way, as
read_codepoint will put the iterator onto the byte following the
utf8 codepoint, whereas to_next will put it on the next utf8
character start byte, which might be different if the buffer content
is not valid utf8.

Fixes #1195
This commit is contained in:
Maxime Coste 2017-04-20 16:18:49 +01:00
parent 30e6387071
commit dbcddafbfd
5 changed files with 19 additions and 17 deletions

View File

@ -51,7 +51,7 @@ Codepoint read_codepoint(Iterator& it, const Iterator& end)
// According to rfc3629, UTF-8 allows only up to 4 bytes. // According to rfc3629, UTF-8 allows only up to 4 bytes.
// (21 bits codepoint) // (21 bits codepoint)
unsigned char byte = read(it); unsigned char byte = read(it);
if (not (byte & 0x80)) // 0xxxxxxx if ((byte & 0x80) == 0) // 0xxxxxxx
return byte; return byte;
if (it == end) if (it == end)
@ -91,7 +91,7 @@ Codepoint codepoint(Iterator it, const Iterator& end)
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass> template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
ByteCount codepoint_size(char byte) ByteCount codepoint_size(char byte)
{ {
if (not (byte & 0x80)) // 0xxxxxxx if ((byte & 0x80) == 0) // 0xxxxxxx
return 1; return 1;
else if ((byte & 0xE0) == 0xC0) // 110xxxxx else if ((byte & 0xE0) == 0xC0) // 110xxxxx
return 2; return 2;
@ -125,9 +125,10 @@ inline ByteCount codepoint_size(Codepoint cp)
template<typename Iterator> template<typename Iterator>
void to_next(Iterator& it, const Iterator& end) void to_next(Iterator& it, const Iterator& end)
{ {
if (it != end and read(it) & 0x80) if (it != end)
while (it != end and (*(it) & 0xC0) == 0x80) ++it;
++it; while (it != end and not is_character_start(*it))
++it;
} }
// returns an iterator to next character first byte // returns an iterator to next character first byte
@ -151,8 +152,10 @@ Iterator finish(Iterator it, const Iterator& end)
template<typename Iterator> template<typename Iterator>
void to_previous(Iterator& it, const Iterator& begin) void to_previous(Iterator& it, const Iterator& begin)
{ {
while (it != begin and (*(--it) & 0xC0) == 0x80) if (it != begin)
; --it;
while (not is_character_start(*it))
--it;
} }
// returns an iterator to the previous character first byte // returns an iterator to the previous character first byte
template<typename Iterator> template<typename Iterator>
@ -173,19 +176,13 @@ Iterator advance(Iterator it, const Iterator& end, CharCount d)
if (d < 0) if (d < 0)
{ {
while (it != end and d != 0) while (it != end and d++ != 0)
{ to_previous(it, end);
if (is_character_start(*--it))
++d;
}
} }
else if (d > 0) else if (d > 0)
{ {
while (it != end and d != 0) while (it != end and d-- != 0)
{ to_next(it, end);
if (is_character_start(*++it))
--d;
}
} }
return it; return it;
} }

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@
¦¥ô†èg ’¤`<60>)»kÈWÂ!lå–ÃöÇõ¿³)É×­ŸRáïújWWq/ÍÈé.h3ÑÓþbè îƒ‡Î¬÷ÚO¢´”ã<E2809D>a…=Þ~N´uù{G”„ÜqØTYF¯Òý¾f#<0E>èB-—îÿ Þw]±„<šâ º»cª7Jë¿Ù»ÄGËÇïsÐÁ¢ÐÃäåjImÅÂçÐt Ä7±q 2j3ã=  UÝè*S9§ô1ýÀ;Ê=ÄM¤²MJœ^îr>e·…UØt„¨Uâç,ÿqßÇsòwUàÆyÑ3µÍŠ]ÞA”3·ƒiå·ÏÙ’.ûâ„nDD€$ˆr8Çfé<03>¾ˆb;‡›‹zö¿ã…â«öÈ fìÎJ

View File

@ -0,0 +1 @@
¦¥ô†èg ’¤`<60>)»kÈWÂ!lå–ÃöÇõ¿³)É×­ŸRáïújWWq/ÍÈé.h3ÑÓþbè îƒ‡Î¬÷ÚO¢´”ã<E2809D>a…=Þ~N´uù{G”„ÜqØTYF¯Òý¾f#<0E>èB-—îÿ Þw]±„<šâ º»cª7Jë¿Ù»ÄGËÇïsÐÁ¢ÐÃäåjImÅÂçÐt Ä7±q 2j3ã=  UÝè*S9§ô1ýÀ;Ê=ÄM¤²MJœ^îr>e·…UØt„¨Uâç,ÿqßÇsòwUàÆyÑ3µÍŠ]ÞA”3·ƒiå·ÏÙ’.ûâ„nDD€$ˆr8Çfé<03>¾ˆb;‡›‹zö¿ã…â«öÈ fìÎJ

View File

@ -0,0 +1,2 @@
add-highlighter regions -default code -match-capture sh \
heredoc '<<-?(\w+)' '^\t*(\w+)$' ''