From d539e8fb896dff4ed1757eaf9a81d9385a709580 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Wed, 4 Dec 2019 22:00:31 +1100 Subject: [PATCH] Do not decode UTF-8 when looking for regex next start. There is no need to decode, as we know any non-ASCII character will be treated as Other in the StartDesc. --- src/regex_impl.hh | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/regex_impl.hh b/src/regex_impl.hh index e49c5186..13c48c09 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -235,8 +235,8 @@ public: } else if (start != config.end) { - const Codepoint cp = codepoint(start, config); - if (not start_desc->map[cp < StartDesc::count ? cp : StartDesc::other]) + const unsigned char c = forward ? *start : *utf8::previous(start, config.end); + if (not start_desc->map[(c < StartDesc::count) ? c : StartDesc::other]) return false; } } @@ -525,12 +525,21 @@ private: { while (start != config.end) { - const Codepoint cp = read_codepoint(start, config); - if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other]) + static_assert(StartDesc::count <= 128, "start desc should be ascii only"); + if constexpr (forward) { - forward ? utf8::to_previous(start, config.subject_begin) - : utf8::to_next(start, config.subject_end); - return; + const unsigned char c = *start; + if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other]) + return; + utf8::to_next(start, config.end); + } + else + { + auto prev = utf8::previous(start, config.end); + const unsigned char c = *prev; + if (start_desc.map[(c < StartDesc::count) ? 
c : StartDesc::other]) + return; + start = prev; } } } @@ -612,17 +621,6 @@ private: is_word(utf8::codepoint(pos, config.subject_end)); } - static Codepoint read_codepoint(Iterator& it, const ExecConfig& config) - { - if (forward) - return utf8::read_codepoint(it, config.subject_end); - else - { - utf8::to_previous(it, config.subject_begin); - return utf8::codepoint(it, config.subject_end); - } - } - static Codepoint codepoint(const Iterator& it, const ExecConfig& config) { return utf8::codepoint(forward ? it : utf8::previous(it, config.subject_begin),