Refactor ThreadedRegexVM::exec_program to avoid branching

Moving logic into step_thread instead of returning an enum to
select what to run avoids the switch logic and improves run time.
This commit is contained in:
Maxime Coste 2018-11-05 19:46:53 +11:00
parent 7463a0d449
commit 7959c7f731

View File

@ -302,12 +302,19 @@ private:
ConstArrayView<CompiledRegex::Instruction> instructions; ConstArrayView<CompiledRegex::Instruction> instructions;
}; };
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
// Steps a thread until it consumes the current character, matches or fail // Steps a thread until it consumes the current character, matches or fail
StepResult step_thread(const Iterator& pos, uint16_t current_step, Thread& thread, const ExecConfig& config) void step_thread(const Iterator& pos, uint16_t current_step, Thread thread, const ExecConfig& config)
{ {
const bool no_saves = (config.flags & RegexExecFlags::NoSaves); auto failed = [this](const Thread& thread) {
release_saves(thread.saves);
};
auto consumed = [this](const Thread& thread) {
if (m_program.instructions[thread.inst].scheduled)
return release_saves(thread.saves);
m_program.instructions[thread.inst].scheduled = true;
m_threads.push_next(thread);
};
auto* instructions = m_program.instructions.data(); auto* instructions = m_program.instructions.data();
while (true) while (true)
{ {
@ -315,25 +322,25 @@ private:
// if this instruction was already executed for this step in another thread, // if this instruction was already executed for this step in another thread,
// then this thread is redundant and can be dropped // then this thread is redundant and can be dropped
if (inst.last_step == current_step) if (inst.last_step == current_step)
return StepResult::Failed; return failed(thread);
inst.last_step = current_step; inst.last_step = current_step;
switch (inst.op) switch (inst.op)
{ {
case CompiledRegex::Literal: case CompiledRegex::Literal:
if (pos != config.end and inst.param == codepoint(pos, config)) if (pos != config.end and inst.param == codepoint(pos, config))
return StepResult::Consumed; return consumed(thread);
return StepResult::Failed; return failed(thread);
case CompiledRegex::Literal_IgnoreCase: case CompiledRegex::Literal_IgnoreCase:
if (pos != config.end and inst.param == to_lower(codepoint(pos, config))) if (pos != config.end and inst.param == to_lower(codepoint(pos, config)))
return StepResult::Consumed; return consumed(thread);
return StepResult::Failed; return failed(thread);
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
return StepResult::Consumed; return consumed(thread);
case CompiledRegex::AnyCharExceptNewLine: case CompiledRegex::AnyCharExceptNewLine:
if (pos != config.end and codepoint(pos, config) != '\n') if (pos != config.end and codepoint(pos, config) != '\n')
return StepResult::Consumed; return consumed(thread);
return StepResult::Failed; return failed(thread);
case CompiledRegex::Jump: case CompiledRegex::Jump:
thread.inst = static_cast<int16_t>(inst.param); thread.inst = static_cast<int16_t>(inst.param);
break; break;
@ -354,7 +361,7 @@ private:
} }
case CompiledRegex::Save: case CompiledRegex::Save:
{ {
if (no_saves) if (config.flags & RegexExecFlags::NoSaves)
break; break;
if (thread.saves < 0) if (thread.saves < 0)
thread.saves = new_saves<false>(nullptr); thread.saves = new_saves<false>(nullptr);
@ -368,72 +375,86 @@ private:
} }
case CompiledRegex::Class: case CompiledRegex::Class:
if (pos == config.end) if (pos == config.end)
return StepResult::Failed; return failed(thread);
return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ? return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ?
StepResult::Consumed : StepResult::Failed; consumed(thread) : failed(thread);
case CompiledRegex::CharacterType: case CompiledRegex::CharacterType:
if (pos == config.end) if (pos == config.end)
return StepResult::Failed; return failed(thread);
return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ? return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ?
StepResult::Consumed : StepResult::Failed; consumed(thread) : failed(thread);
case CompiledRegex::LineStart: case CompiledRegex::LineStart:
if (not is_line_start(pos, config)) if (not is_line_start(pos, config))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::LineEnd: case CompiledRegex::LineEnd:
if (not is_line_end(pos, config)) if (not is_line_end(pos, config))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::WordBoundary: case CompiledRegex::WordBoundary:
if (not is_word_boundary(pos, config)) if (not is_word_boundary(pos, config))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::NotWordBoundary: case CompiledRegex::NotWordBoundary:
if (is_word_boundary(pos, config)) if (is_word_boundary(pos, config))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::SubjectBegin: case CompiledRegex::SubjectBegin:
if (pos != config.subject_begin) if (pos != config.subject_begin)
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::SubjectEnd: case CompiledRegex::SubjectEnd:
if (pos != config.subject_end) if (pos != config.subject_end)
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::LookAhead: case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead: case CompiledRegex::NegativeLookAhead:
if (lookaround<MatchDirection::Forward, false>(inst.param, pos, config) != if (lookaround<MatchDirection::Forward, false>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookAhead)) (inst.op == CompiledRegex::LookAhead))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::LookAhead_IgnoreCase: case CompiledRegex::LookAhead_IgnoreCase:
case CompiledRegex::NegativeLookAhead_IgnoreCase: case CompiledRegex::NegativeLookAhead_IgnoreCase:
if (lookaround<MatchDirection::Forward, true>(inst.param, pos, config) != if (lookaround<MatchDirection::Forward, true>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookAhead_IgnoreCase)) (inst.op == CompiledRegex::LookAhead_IgnoreCase))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::LookBehind: case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind: case CompiledRegex::NegativeLookBehind:
if (lookaround<MatchDirection::Backward, false>(inst.param, pos, config) != if (lookaround<MatchDirection::Backward, false>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookBehind)) (inst.op == CompiledRegex::LookBehind))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::LookBehind_IgnoreCase: case CompiledRegex::LookBehind_IgnoreCase:
case CompiledRegex::NegativeLookBehind_IgnoreCase: case CompiledRegex::NegativeLookBehind_IgnoreCase:
if (lookaround<MatchDirection::Backward, true>(inst.param, pos, config) != if (lookaround<MatchDirection::Backward, true>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookBehind_IgnoreCase)) (inst.op == CompiledRegex::LookBehind_IgnoreCase))
return StepResult::Failed; return failed(thread);
break; break;
case CompiledRegex::FindNextStart: case CompiledRegex::FindNextStart:
kak_assert(m_threads.current_is_empty()); // search thread should by construction be the lower priority one // search thread should by construction be the lowest priority thread
if (m_threads.next_is_empty()) kak_assert(m_threads.current_is_empty());
return StepResult::FindNextStart; if (not m_threads.next_is_empty())
return StepResult::Consumed; return consumed(thread);
m_threads.push_next(thread);
m_find_next_start = true;
return;
case CompiledRegex::Match: case CompiledRegex::Match:
return StepResult::Matched; if ((pos != config.end and not (config.flags & RegexExecFlags::Search)) or
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
return failed(thread);
release_saves(m_captures);
m_captures = thread.saves;
m_found_match = true;
// remove lower priority threads
while (not m_threads.current_is_empty())
release_saves(m_threads.pop_current().saves);
return;
} }
} }
return StepResult::Failed; return failed(thread);
} }
bool exec_program(Iterator pos, const ExecConfig& config) bool exec_program(Iterator pos, const ExecConfig& config)
@ -446,7 +467,7 @@ private:
const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc; const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc;
uint16_t current_step = -1; uint16_t current_step = -1;
bool found_match = false; m_found_match = false;
while (true) // Iterate on all codepoints and once at the end while (true) // Iterate on all codepoints and once at the end
{ {
if (++current_step == 0) if (++current_step == 0)
@ -457,63 +478,27 @@ private:
current_step = 1; // step 0 is never valid current_step = 1; // step 0 is never valid
} }
bool find_next_start = false; m_find_next_start = false;
while (not m_threads.current_is_empty()) while (not m_threads.current_is_empty())
{ step_thread(pos, current_step, m_threads.pop_current(), config);
auto thread = m_threads.pop_current();
switch (step_thread(pos, current_step, thread, config))
{
case StepResult::Matched:
if ((pos != config.end and not (config.flags & RegexExecFlags::Search)) or
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
{
release_saves(thread.saves);
continue;
}
release_saves(m_captures);
m_captures = thread.saves;
found_match = true;
// remove this and lower priority threads
while (not m_threads.current_is_empty())
release_saves(m_threads.pop_current().saves);
break;
case StepResult::Failed:
release_saves(thread.saves);
break;
case StepResult::Consumed:
if (m_program.instructions[thread.inst].scheduled)
{
release_saves(thread.saves);
continue;
}
m_program.instructions[thread.inst].scheduled = true;
m_threads.push_next(thread);
break;
case StepResult::FindNextStart:
m_threads.push_next(thread);
find_next_start = true;
break;
}
}
for (auto& thread : m_threads.next_threads()) for (auto& thread : m_threads.next_threads())
m_program.instructions[thread.inst].scheduled = false; m_program.instructions[thread.inst].scheduled = false;
if (pos == config.end or m_threads.next_is_empty() or if (pos == config.end or m_threads.next_is_empty() or
(found_match and (config.flags & RegexExecFlags::AnyMatch))) (m_found_match and (config.flags & RegexExecFlags::AnyMatch)))
{ {
for (auto& t : m_threads.next_threads()) for (auto& t : m_threads.next_threads())
release_saves(t.saves); release_saves(t.saves);
m_threads.clear_next(); m_threads.clear_next();
return found_match; return m_found_match;
} }
m_threads.swap_next(); m_threads.swap_next();
forward ? utf8::to_next(pos, config.subject_end) forward ? utf8::to_next(pos, config.subject_end)
: utf8::to_previous(pos, config.subject_begin); : utf8::to_previous(pos, config.subject_begin);
if (find_next_start and start_desc) if (m_find_next_start and start_desc)
to_next_start(pos, config, *start_desc); to_next_start(pos, config, *start_desc);
} }
} }
@ -674,6 +659,8 @@ private:
Vector<Saves*, MemoryDomain::Regex> m_saves; Vector<Saves*, MemoryDomain::Regex> m_saves;
int16_t m_first_free = -1; int16_t m_first_free = -1;
int16_t m_captures = -1; int16_t m_captures = -1;
bool m_found_match = false;
bool m_find_next_start = false;
}; };
} }