Change utf8::to_next/to_previous so that they are more symmetrical

The previous implementation could yield different positions when iterating forward and backward, leading to confusion in boost regex. This makes an existing problem a bit more visible: iterating with to_next and with read_codepoint won't behave the same way, as read_codepoint will put the iterator onto the byte following the utf8 codepoint, whereas to_next will put it on the next utf8 character start byte, which might be different if the buffer content is not valid utf8. Fixes #1195
2017-04-20 16:18:49 +01:00 · 2017-04-20 16:18:49 +01:00 · dbcddafbfd
commit dbcddafbfd
parent 30e6387071
5 changed files with 19 additions and 17 deletions
--- a/src/utf8.hh
+++ b/src/utf8.hh
@ -51,7 +51,7 @@ Codepoint read_codepoint(Iterator& it, const Iterator& end)
    // According to rfc3629, UTF-8 allows only up to 4 bytes.
    // (21 bits codepoint)
    unsigned char byte = read(it);
-    if (not (byte & 0x80)) // 0xxxxxxx
+    if ((byte & 0x80) == 0) // 0xxxxxxx
        return byte;
    if (it == end)
@ -91,7 +91,7 @@ Codepoint codepoint(Iterator it, const Iterator& end)
 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
 ByteCount codepoint_size(char byte)
 {
-    if (not (byte & 0x80)) // 0xxxxxxx
+    if ((byte & 0x80) == 0) // 0xxxxxxx
        return 1;
    else if ((byte & 0xE0) == 0xC0) // 110xxxxx
        return 2;
@ -125,9 +125,10 @@ inline ByteCount codepoint_size(Codepoint cp)
 template<typename Iterator>
 void to_next(Iterator& it, const Iterator& end)
 {
-    if (it != end and read(it) & 0x80)
+    if (it != end)
-        while (it != end and (*(it) & 0xC0) == 0x80)
+        ++it;
-            ++it;
+    while (it != end and not is_character_start(*it))
        ++it;
 }
 // returns an iterator to next character first byte
@ -151,8 +152,10 @@ Iterator finish(Iterator it, const Iterator& end)
 template<typename Iterator>
 void to_previous(Iterator& it, const Iterator& begin)
 {
-    while (it != begin and (*(--it) & 0xC0) == 0x80)
+    if (it != begin)
-           ;
+        --it;
    while (not is_character_start(*it))
        --it;
 }
 // returns an iterator to the previous character first byte
 template<typename Iterator>
@ -173,19 +176,13 @@ Iterator advance(Iterator it, const Iterator& end, CharCount d)
    if (d < 0)
    {
-        while (it != end and d != 0)
+        while (it != end and d++ != 0)
-        {
+            to_previous(it, end);
            if (is_character_start(*--it))
                ++d;
        }
    }
    else if (d > 0)
    {
-        while (it != end and d != 0)
+        while (it != end and d-- != 0)
-        {
+            to_next(it, end);
            if (is_character_start(*++it))
                --d;
        }
    }
    return it;
 }
--- a/test/regression/1195-infinite-loop-in-regex-matching/cmd
+++ b/test/regression/1195-infinite-loop-in-regex-matching/cmd
@ -0,0 +1 @@
--- a/test/regression/1195-infinite-loop-in-regex-matching/in
+++ b/test/regression/1195-infinite-loop-in-regex-matching/in
@ -0,0 +1 @@
 ¦¥ô†èg
’¤`<60>)»kÈWÂ!lå–ÃöÇõ¿³)É×vÔ	ŸRáïújWWq/ÍÈé.h3ÑÓþbè îƒ‡–Î¬÷ÚO¢´”ã<E2809D>a‘…=Þ~N´uÃ¹{G”„ÜqØTYF¯Òý¾f#‡<0E>èB-—îÿ	Þw]±„<šâ
º»cª7J‚ë¿Ù»ÄGËÇïsÐÁ¢ÐÃäåjImÅÂçÐt
Ä7±q
2j3ã= UÝè*S9§ô1ýÀ;Ê=ÄM¤²MJœ^îr>e·…UØt„¨Uâç,ÿqßÇsòwUàÆyÑ3µÍŠ–]ÞA”3·ƒiå·ÏÙ’.ûâ„nDD€‘$ˆ:þ‚›r›8Çfé<03>¾ˆb;‡›‹zö¿ã…â«öÈ
fìÎJ
--- a/test/regression/1195-infinite-loop-in-regex-matching/out
+++ b/test/regression/1195-infinite-loop-in-regex-matching/out
@ -0,0 +1 @@
 ¦¥ô†èg
’¤`<60>)»kÈWÂ!lå–ÃöÇõ¿³)É×vÔ	ŸRáïújWWq/ÍÈé.h3ÑÓþbè îƒ‡–Î¬÷ÚO¢´”ã<E2809D>a‘…=Þ~N´uÃ¹{G”„ÜqØTYF¯Òý¾f#‡<0E>èB-—îÿ	Þw]±„<šâ
º»cª7J‚ë¿Ù»ÄGËÇïsÐÁ¢ÐÃäåjImÅÂçÐt
Ä7±q
2j3ã= UÝè*S9§ô1ýÀ;Ê=ÄM¤²MJœ^îr>e·…UØt„¨Uâç,ÿqßÇsòwUàÆyÑ3µÍŠ–]ÞA”3·ƒiå·ÏÙ’.ûâ„nDD€‘$ˆ:þ‚›r›8Çfé<03>¾ˆb;‡›‹zö¿ã…â«öÈ
fìÎJ
--- a/test/regression/1195-infinite-loop-in-regex-matching/rc
+++ b/test/regression/1195-infinite-loop-in-regex-matching/rc
@ -0,0 +1,2 @@
 add-highlighter regions -default code -match-capture sh \
    heredoc '<<-?(\w+)' '^\t*(\w+)$' ''
		`@ -0,0 +1 @@`
							¦¥ô†èg ’¤`<60>)»kÈWÂ!lå–ÃöÇõ¿³)É×vÔ ŸRáïújWWq/ÍÈé.h3ÑÓþbè îƒ‡–Î¬÷ÚO¢´”ã<E2809D>a‘…=Þ~N´uÃ¹{G”„ÜqØTYF¯Òý¾f#‡<0E>èB-—îÿ Þw]±„<šâ º»cª7J‚ë¿Ù»ÄGËÇïsÐÁ¢ÐÃäåjImÅÂçÐt Ä7±q 2j3ã= UÝè*S9§ô1ýÀ;Ê=ÄM¤²MJœ^îr>e·…UØt„¨Uâç,ÿqßÇsòwUàÆyÑ3µÍŠ–]ÞA”3·ƒiå·ÏÙ’.ûâ„nDD€‘$ˆ:þ‚›r›8Çfé<03>¾ˆb;‡›‹zö¿ã…â«öÈ fìÎJ
		`@ -0,0 +1,2 @@`
							`add-highlighter regions -default code -match-capture sh \`
							`heredoc '<<-?(\w+)' '^\t*(\w+)$' ''`