Regex: Only reset processed and scheduled flags on relevant instructions

On big regexes, resetting all those flags on all instructions for each
character can become the dominant operation. Track the indices of the
instructions actually processed (the scheduled ones are already tracked
in the next_threads vector), and only reset these.
This commit is contained in:
Maxime Coste 2017-10-11 10:24:05 +08:00
parent 5bf4be645a
commit 3c999aba37

View File

@ -141,7 +141,6 @@ public:
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end) if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
return false; return false;
Vector<Thread> current_threads, next_threads;
const bool no_saves = (m_flags & RegexExecFlags::NoSaves); const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
Utf8It start{m_begin}; Utf8It start{m_begin};
@ -151,8 +150,9 @@ public:
if (flags & RegexExecFlags::Search) if (flags & RegexExecFlags::Search)
to_next_start(start, m_end, start_chars); to_next_start(start, m_end, start_chars);
ExecState state;
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr), if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads)) state))
return true; return true;
if (not (flags & RegexExecFlags::Search)) if (not (flags & RegexExecFlags::Search))
@ -162,7 +162,7 @@ public:
{ {
to_next_start(++start, m_end, start_chars); to_next_start(++start, m_end, start_chars);
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr), if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads)) state))
return true; return true;
} }
while (start != m_end); while (start != m_end);
@ -226,17 +226,25 @@ private:
utf8::iterator<Iterator>, utf8::iterator<Iterator>,
std::reverse_iterator<utf8::iterator<Iterator>>>; std::reverse_iterator<utf8::iterator<Iterator>>>;
// Scratch state shared between exec_from() and step(), created once by the
// caller and passed by reference to each exec_from() call.
struct ExecState
{
// Threads still to be stepped at the current position. exec_from() pops
// from the back, and step() pushes the alternative branch of a split here.
Vector<Thread> current_threads;
// Threads that consumed the current character; exec_from() swaps them into
// current_threads (and reverses them) before moving to the next position.
Vector<Thread> next_threads;
// Indices of the instructions whose `processed` flag was set by step() at
// the current position, so that only those flags get reset afterwards
// instead of walking every instruction of the program.
Vector<uint32_t> processed;
};
enum class StepResult { Consumed, Matched, Failed }; enum class StepResult { Consumed, Matched, Failed };
// Steps a thread until it consumes the current character, matches or fail // Steps a thread until it consumes the current character, matches or fail
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads) StepResult step(const Utf8It& pos, Thread& thread, ExecState& state)
{ {
while (true) while (true)
{ {
auto& inst = m_program.instructions[thread.inst++]; auto& inst = m_program.instructions[thread.inst];
if (inst.processed) if (inst.processed)
return StepResult::Failed; return StepResult::Failed;
inst.processed = true; inst.processed = true;
state.processed.push_back(thread.inst++);
switch (inst.op) switch (inst.op)
{ {
@ -257,14 +265,14 @@ private:
{ {
if (thread.saves) if (thread.saves)
++thread.saves->refcount; ++thread.saves->refcount;
threads.push_back({inst.param, thread.saves}); state.current_threads.push_back({inst.param, thread.saves});
break; break;
} }
case CompiledRegex::Split_PrioritizeChild: case CompiledRegex::Split_PrioritizeChild:
{ {
if (thread.saves) if (thread.saves)
++thread.saves->refcount; ++thread.saves->refcount;
threads.push_back({thread.inst, thread.saves}); state.current_threads.push_back({thread.inst, thread.saves});
thread.inst = inst.param; thread.inst = inst.param;
break; break;
} }
@ -340,25 +348,19 @@ private:
return StepResult::Failed; return StepResult::Failed;
} }
bool exec_from(Utf8It pos, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads) bool exec_from(Utf8It pos, Saves* initial_saves, ExecState& state)
{ {
current_threads.push_back({0, initial_saves}); state.current_threads.push_back({0, initial_saves});
next_threads.clear(); state.next_threads.clear();
bool found_match = false; bool found_match = false;
while (true) // Iterate on all codepoints and once at the end while (true) // Iterate on all codepoints and once at the end
{ {
for (auto& inst : m_program.instructions) while (not state.current_threads.empty())
{ {
inst.processed = false; auto thread = state.current_threads.back();
inst.scheduled = false; state.current_threads.pop_back();
} switch (step(pos, thread, state))
while (not current_threads.empty())
{
auto thread = current_threads.back();
current_threads.pop_back();
switch (step(pos, thread, current_threads))
{ {
case StepResult::Matched: case StepResult::Matched:
if ((pos != m_end and not (m_flags & RegexExecFlags::Search)) or if ((pos != m_end and not (m_flags & RegexExecFlags::Search)) or
@ -370,11 +372,8 @@ private:
release_saves(m_captures); release_saves(m_captures);
m_captures = thread.saves; m_captures = thread.saves;
if (pos == m_end or (m_flags & RegexExecFlags::AnyMatch))
return true;
found_match = true; found_match = true;
current_threads.clear(); // remove this and lower priority threads state.current_threads.clear(); // remove this and lower priority threads
break; break;
case StepResult::Failed: case StepResult::Failed:
release_saves(thread.saves); release_saves(thread.saves);
@ -386,15 +385,22 @@ private:
continue; continue;
} }
m_program.instructions[thread.inst].scheduled = true; m_program.instructions[thread.inst].scheduled = true;
next_threads.push_back(thread); state.next_threads.push_back(thread);
break; break;
} }
} }
if (pos == m_end or next_threads.empty()) for (auto& thread : state.next_threads)
m_program.instructions[thread.inst].scheduled = false;
for (auto inst : state.processed)
m_program.instructions[inst].processed = false;
state.processed.clear();
if (pos == m_end or state.next_threads.empty() or
(found_match and (m_flags & RegexExecFlags::AnyMatch)))
return found_match; return found_match;
std::swap(current_threads, next_threads); std::swap(state.current_threads, state.next_threads);
std::reverse(current_threads.begin(), current_threads.end()); std::reverse(state.current_threads.begin(), state.current_threads.end());
++pos; ++pos;
} }
} }