Only decode current codepoint once per step

Instead of potentially decoding for each thread, always decode as
its only slightly slower than finding next codepoint (which will
be necessary anyway) and pass the codepoint to each thread.
This commit is contained in:
Maxime Coste 2023-02-19 12:15:33 +11:00
parent 2b74cd4b59
commit be49f36205

View File

@ -352,7 +352,7 @@ private:
// Steps a thread until it consumes the current character, matches or fail // Steps a thread until it consumes the current character, matches or fail
[[gnu::always_inline]] [[gnu::always_inline]]
void step_thread(const Iterator& pos, uint16_t current_step, Thread thread, const ExecConfig& config) void step_thread(const Iterator& pos, Codepoint cp, uint16_t current_step, Thread thread, const ExecConfig& config)
{ {
auto failed = [this, &thread]() { auto failed = [this, &thread]() {
release_saves(thread.saves); release_saves(thread.saves);
@ -388,14 +388,13 @@ private:
return; return;
case CompiledRegex::Literal: case CompiledRegex::Literal:
if (pos != config.end and if (pos != config.end and
inst.param.literal.codepoint == (inst.param.literal.ignore_case ? to_lower(codepoint(pos, config)) inst.param.literal.codepoint == (inst.param.literal.ignore_case ? to_lower(cp) : cp))
: codepoint(pos, config)))
return consumed(); return consumed();
return failed(); return failed();
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
return consumed(); return consumed();
case CompiledRegex::AnyCharExceptNewLine: case CompiledRegex::AnyCharExceptNewLine:
if (pos != config.end and codepoint(pos, config) != '\n') if (pos != config.end and cp != '\n')
return consumed(); return consumed();
return failed(); return failed();
case CompiledRegex::Jump: case CompiledRegex::Jump:
@ -428,13 +427,11 @@ private:
case CompiledRegex::CharClass: case CompiledRegex::CharClass:
if (pos == config.end) if (pos == config.end)
return failed(); return failed();
return m_program.character_classes[inst.param.character_class_index].matches(codepoint(pos, config)) ? return m_program.character_classes[inst.param.character_class_index].matches(cp) ? consumed() : failed();
consumed() : failed();
case CompiledRegex::CharType: case CompiledRegex::CharType:
if (pos == config.end) if (pos == config.end)
return failed(); return failed();
return is_ctype(inst.param.character_type, codepoint(pos, config)) ? return is_ctype(inst.param.character_type, cp) ? consumed() : failed();
consumed() : failed();
case CompiledRegex::LineAssertion: case CompiledRegex::LineAssertion:
if (not (inst.param.line_start ? is_line_start(pos, config) : is_line_end(pos, config))) if (not (inst.param.line_start ? is_line_start(pos, config) : is_line_end(pos, config)))
return failed(); return failed();
@ -486,8 +483,11 @@ private:
current_step = 1; // step 0 is never valid current_step = 1; // step 0 is never valid
} }
auto next = pos;
Codepoint cp = pos != config.end ? codepoint(next, config) : -1;
while (not m_threads.current_is_empty()) while (not m_threads.current_is_empty())
step_thread(pos, current_step, m_threads.pop_current(), config); step_thread(pos, cp, current_step, m_threads.pop_current(), config);
if (pos == config.end or if (pos == config.end or
(m_threads.next_is_empty() and (not search or m_found_match)) or (m_threads.next_is_empty() and (not search or m_found_match)) or
@ -498,9 +498,7 @@ private:
return m_found_match; return m_found_match;
} }
forward ? utf8::to_next(pos, config.end) pos = next;
: utf8::to_previous(pos, config.end);
if (search and not m_found_match) if (search and not m_found_match)
{ {
if (start_desc and m_threads.next_is_empty()) if (start_desc and m_threads.next_is_empty())
@ -611,10 +609,17 @@ private:
is_word(utf8::codepoint(pos, config.subject_end)); is_word(utf8::codepoint(pos, config.subject_end));
} }
static Codepoint codepoint(const Iterator& it, const ExecConfig& config) static Codepoint codepoint(Iterator& it, const ExecConfig& config)
{ {
return utf8::codepoint(forward ? it : utf8::previous(it, config.subject_begin), if constexpr (forward)
config.subject_end); {
return utf8::read_codepoint(it, config.end);
}
else
{
utf8::to_previous(it, config.end);
return utf8::codepoint(it, config.begin);
}
} }
const CompiledRegex& m_program; const CompiledRegex& m_program;