Change utf8::to_next/to_previous so that they are more symmetrical
The previous implementation could yield different positions when iterating forward and backward, leading to confusion in Boost.Regex. This makes an existing problem a bit more visible: iterating with to_next and with read_codepoint won't behave the same way, as read_codepoint will put the iterator onto the byte following the UTF-8 codepoint, whereas to_next will put it on the next UTF-8 character start byte, which might be a different position if the buffer content is not valid UTF-8. Fixes #1195
This commit is contained in:
parent
30e6387071
commit
dbcddafbfd
31
src/utf8.hh
31
src/utf8.hh
|
@ -51,7 +51,7 @@ Codepoint read_codepoint(Iterator& it, const Iterator& end)
|
||||||
// According to rfc3629, UTF-8 allows only up to 4 bytes.
|
// According to rfc3629, UTF-8 allows only up to 4 bytes.
|
||||||
// (21 bits codepoint)
|
// (21 bits codepoint)
|
||||||
unsigned char byte = read(it);
|
unsigned char byte = read(it);
|
||||||
if (not (byte & 0x80)) // 0xxxxxxx
|
if ((byte & 0x80) == 0) // 0xxxxxxx
|
||||||
return byte;
|
return byte;
|
||||||
|
|
||||||
if (it == end)
|
if (it == end)
|
||||||
|
@ -91,7 +91,7 @@ Codepoint codepoint(Iterator it, const Iterator& end)
|
||||||
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
|
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
|
||||||
ByteCount codepoint_size(char byte)
|
ByteCount codepoint_size(char byte)
|
||||||
{
|
{
|
||||||
if (not (byte & 0x80)) // 0xxxxxxx
|
if ((byte & 0x80) == 0) // 0xxxxxxx
|
||||||
return 1;
|
return 1;
|
||||||
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
|
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
|
||||||
return 2;
|
return 2;
|
||||||
|
@ -125,9 +125,10 @@ inline ByteCount codepoint_size(Codepoint cp)
|
||||||
template<typename Iterator>
|
template<typename Iterator>
|
||||||
void to_next(Iterator& it, const Iterator& end)
|
void to_next(Iterator& it, const Iterator& end)
|
||||||
{
|
{
|
||||||
if (it != end and read(it) & 0x80)
|
if (it != end)
|
||||||
while (it != end and (*(it) & 0xC0) == 0x80)
|
++it;
|
||||||
++it;
|
while (it != end and not is_character_start(*it))
|
||||||
|
++it;
|
||||||
}
|
}
|
||||||
|
|
||||||
// returns an iterator to next character first byte
|
// returns an iterator to next character first byte
|
||||||
|
@ -151,8 +152,10 @@ Iterator finish(Iterator it, const Iterator& end)
|
||||||
template<typename Iterator>
|
template<typename Iterator>
|
||||||
void to_previous(Iterator& it, const Iterator& begin)
|
void to_previous(Iterator& it, const Iterator& begin)
|
||||||
{
|
{
|
||||||
while (it != begin and (*(--it) & 0xC0) == 0x80)
|
if (it != begin)
|
||||||
;
|
--it;
|
||||||
|
while (not is_character_start(*it))
|
||||||
|
--it;
|
||||||
}
|
}
|
||||||
// returns an iterator to the previous character first byte
|
// returns an iterator to the previous character first byte
|
||||||
template<typename Iterator>
|
template<typename Iterator>
|
||||||
|
@ -173,19 +176,13 @@ Iterator advance(Iterator it, const Iterator& end, CharCount d)
|
||||||
|
|
||||||
if (d < 0)
|
if (d < 0)
|
||||||
{
|
{
|
||||||
while (it != end and d != 0)
|
while (it != end and d++ != 0)
|
||||||
{
|
to_previous(it, end);
|
||||||
if (is_character_start(*--it))
|
|
||||||
++d;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if (d > 0)
|
else if (d > 0)
|
||||||
{
|
{
|
||||||
while (it != end and d != 0)
|
while (it != end and d-- != 0)
|
||||||
{
|
to_next(it, end);
|
||||||
if (is_character_start(*++it))
|
|
||||||
--d;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return it;
|
return it;
|
||||||
}
|
}
|
||||||
|
|
1
test/regression/1195-infinite-loop-in-regex-matching/cmd
Normal file
1
test/regression/1195-infinite-loop-in-regex-matching/cmd
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
1
test/regression/1195-infinite-loop-in-regex-matching/in
Normal file
1
test/regression/1195-infinite-loop-in-regex-matching/in
Normal file
|
@ -0,0 +1 @@
|
||||||
|
¦¥ô†èg
’¤`<60>)»kÈWÂ!lå–ÃöÇõ¿³)É×vÔ ŸRáïújWWq/ÍÈé.h3ÑÓþbè –ά÷ÚO¢´”ã<E2809D>a‘…=Þ~N´uù{G”„ÜqØTYF¯Òý¾f#‡<0E>èB-—îÿ Þw]±„<šâ
º»cª7J‚ë¿Ù»ÄGËÇïsÐÁ¢ÐÃäåjImÅÂçÐt
Ä7±q
2j3ã= UÝè*S9§ô1ýÀ;Ê=ÄM¤²MJœ^îr>e·…UØt„¨Uâç,ÿqßÇsòwUàÆyÑ3µÍŠ–]ÞA”3·ƒiå·ÏÙ’.ûâ„nDD€‘$ˆ:þ‚›r›8Çfé<03>¾ˆb;‡›‹zö¿ã…â«öÈ
fìÎJ
|
1
test/regression/1195-infinite-loop-in-regex-matching/out
Normal file
1
test/regression/1195-infinite-loop-in-regex-matching/out
Normal file
|
@ -0,0 +1 @@
|
||||||
|
¦¥ô†èg
’¤`<60>)»kÈWÂ!lå–ÃöÇõ¿³)É×vÔ ŸRáïújWWq/ÍÈé.h3ÑÓþbè –ά÷ÚO¢´”ã<E2809D>a‘…=Þ~N´uù{G”„ÜqØTYF¯Òý¾f#‡<0E>èB-—îÿ Þw]±„<šâ
º»cª7J‚ë¿Ù»ÄGËÇïsÐÁ¢ÐÃäåjImÅÂçÐt
Ä7±q
2j3ã= UÝè*S9§ô1ýÀ;Ê=ÄM¤²MJœ^îr>e·…UØt„¨Uâç,ÿqßÇsòwUàÆyÑ3µÍŠ–]ÞA”3·ƒiå·ÏÙ’.ûâ„nDD€‘$ˆ:þ‚›r›8Çfé<03>¾ˆb;‡›‹zö¿ã…â«öÈ
fìÎJ
|
2
test/regression/1195-infinite-loop-in-regex-matching/rc
Normal file
2
test/regression/1195-infinite-loop-in-regex-matching/rc
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
add-highlighter regions -default code -match-capture sh \
|
||||||
|
heredoc '<<-?(\w+)' '^\t*(\w+)$' ''
|
Loading…
Reference in New Issue
Block a user