Do not decode utf-8 when looking for regex next start

There is no need to decode: every non-ASCII character is treated as
Other in the StartDesc, so inspecting the raw byte is enough.
This commit is contained in:
Maxime Coste 2019-12-04 22:00:31 +11:00
parent ee2985739b
commit d539e8fb89

View File

@ -235,8 +235,8 @@ public:
}
else if (start != config.end)
{
const Codepoint cp = codepoint(start, config);
if (not start_desc->map[cp < StartDesc::count ? cp : StartDesc::other])
const unsigned char c = forward ? *start : *utf8::previous(start, config.end);
if (not start_desc->map[(c < StartDesc::count) ? c : StartDesc::other])
return false;
}
}
@ -525,12 +525,21 @@ private:
{
while (start != config.end)
{
const Codepoint cp = read_codepoint(start, config);
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
static_assert(StartDesc::count <= 128, "start desc should be ascii only");
if constexpr (forward)
{
forward ? utf8::to_previous(start, config.subject_begin)
: utf8::to_next(start, config.subject_end);
return;
const unsigned char c = *start;
if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other])
return;
utf8::to_next(start, config.end);
}
else
{
auto prev = utf8::previous(start, config.end);
const unsigned char c = *prev;
if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other])
return;
start = prev;
}
}
}
@ -612,17 +621,6 @@ private:
is_word(utf8::codepoint(pos, config.subject_end));
}
// Fetches a codepoint at `it` according to the matching direction.
// Backward: calls utf8::to_previous on `it` (limit: config.subject_begin),
// then returns the codepoint decoded at the resulting position.
// Forward: delegates to utf8::read_codepoint with config.subject_end as limit.
static Codepoint read_codepoint(Iterator& it, const ExecConfig& config)
{
    if (not forward)
    {
        utf8::to_previous(it, config.subject_begin);
        return utf8::codepoint(it, config.subject_end);
    }
    return utf8::read_codepoint(it, config.subject_end);
}
static Codepoint codepoint(const Iterator& it, const ExecConfig& config)
{
return utf8::codepoint(forward ? it : utf8::previous(it, config.subject_begin),