From dbcddafbfdc8808e8823812b1a5c40d4aedcdf90 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Thu, 20 Apr 2017 16:18:49 +0100 Subject: [PATCH] Change utf8::to_next/to_previous so that they are more symmetrical The previous implementation could yield different positions when iterating forward and backward, leading to confusion in boost regex. This makes an existing problem a bit more visible: iterating with to_next and with read_codepoint won't behave the same way, as read_codepoint will put the iterator onto the byte following the utf8 codepoint, whereas to_next will put it on the next utf8 character start byte, which might be different if the buffer content is not valid utf8. Fixes #1195 --- src/utf8.hh | 31 +++++++++---------- .../1195-infinite-loop-in-regex-matching/cmd | 1 + .../1195-infinite-loop-in-regex-matching/in | 1 + .../1195-infinite-loop-in-regex-matching/out | 1 + .../1195-infinite-loop-in-regex-matching/rc | 2 ++ 5 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 test/regression/1195-infinite-loop-in-regex-matching/cmd create mode 100644 test/regression/1195-infinite-loop-in-regex-matching/in create mode 100644 test/regression/1195-infinite-loop-in-regex-matching/out create mode 100644 test/regression/1195-infinite-loop-in-regex-matching/rc diff --git a/src/utf8.hh b/src/utf8.hh index bbf34ae3..34cecc81 100644 --- a/src/utf8.hh +++ b/src/utf8.hh @@ -51,7 +51,7 @@ Codepoint read_codepoint(Iterator& it, const Iterator& end) // According to rfc3629, UTF-8 allows only up to 4 bytes. 
// (21 bits codepoint) unsigned char byte = read(it); - if (not (byte & 0x80)) // 0xxxxxxx + if ((byte & 0x80) == 0) // 0xxxxxxx return byte; if (it == end) @@ -91,7 +91,7 @@ Codepoint codepoint(Iterator it, const Iterator& end) template ByteCount codepoint_size(char byte) { - if (not (byte & 0x80)) // 0xxxxxxx + if ((byte & 0x80) == 0) // 0xxxxxxx return 1; else if ((byte & 0xE0) == 0xC0) // 110xxxxx return 2; @@ -125,9 +125,10 @@ inline ByteCount codepoint_size(Codepoint cp) template void to_next(Iterator& it, const Iterator& end) { - if (it != end and read(it) & 0x80) - while (it != end and (*(it) & 0xC0) == 0x80) - ++it; + if (it != end) + ++it; + while (it != end and not is_character_start(*it)) + ++it; } // returns an iterator to next character first byte @@ -151,8 +152,10 @@ Iterator finish(Iterator it, const Iterator& end) template void to_previous(Iterator& it, const Iterator& begin) { - while (it != begin and (*(--it) & 0xC0) == 0x80) - ; + if (it != begin) + --it; + while (not is_character_start(*it)) + --it; } // returns an iterator to the previous character first byte template @@ -173,19 +176,13 @@ Iterator advance(Iterator it, const Iterator& end, CharCount d) if (d < 0) { - while (it != end and d != 0) - { - if (is_character_start(*--it)) - ++d; - } + while (it != end and d++ != 0) + to_previous(it, end); } else if (d > 0) { - while (it != end and d != 0) - { - if (is_character_start(*++it)) - --d; - } + while (it != end and d-- != 0) + to_next(it, end); } return it; } diff --git a/test/regression/1195-infinite-loop-in-regex-matching/cmd b/test/regression/1195-infinite-loop-in-regex-matching/cmd new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/test/regression/1195-infinite-loop-in-regex-matching/cmd @@ -0,0 +1 @@ + diff --git a/test/regression/1195-infinite-loop-in-regex-matching/in b/test/regression/1195-infinite-loop-in-regex-matching/in new file mode 100644 index 00000000..21611d1a --- /dev/null +++ 
b/test/regression/1195-infinite-loop-in-regex-matching/in @@ -0,0 +1 @@ +g `)kW!l)v RjWWq/.h3bάOa=~Nuù{GqTYFf#B- w]< c7JٻGsjImt 7q 2j3= U*S91;=MMJ^r>eUtU,qswUy3͊]A3iْ.nDD$:r8fb;z fJ diff --git a/test/regression/1195-infinite-loop-in-regex-matching/out b/test/regression/1195-infinite-loop-in-regex-matching/out new file mode 100644 index 00000000..21611d1a --- /dev/null +++ b/test/regression/1195-infinite-loop-in-regex-matching/out @@ -0,0 +1 @@ +g `)kW!l)v RjWWq/.h3bάOa=~Nuù{GqTYFf#B- w]< c7JٻGsjImt 7q 2j3= U*S91;=MMJ^r>eUtU,qswUy3͊]A3iْ.nDD$:r8fb;z fJ diff --git a/test/regression/1195-infinite-loop-in-regex-matching/rc b/test/regression/1195-infinite-loop-in-regex-matching/rc new file mode 100644 index 00000000..d13cd29c --- /dev/null +++ b/test/regression/1195-infinite-loop-in-regex-matching/rc @@ -0,0 +1,2 @@ +add-highlighter regions -default code -match-capture sh \ + heredoc '<<-?(\w+)' '^\t*(\w+)$' ''