Regex: Go back to instruction based search of next start

The previous method, which was a bit faster in the general use case,
can hit some cases where we get quadratic behaviour and very slow
matching.

By using an instruction, we can guarantee a complexity of O(N*M),
as we will never have more than N threads (N being the instruction
count) and we run the threads once per codepoint in the subject
string (M being the number of codepoints).

That slows down the general case slightly, but ensures we don't have
pathological cases.

This new version is much faster than the previous instruction based
search because it does not use a plain `.*` searcher, but a specific,
smarter instruction specialized for finding the next start if we are
in the correct conditions.
This commit is contained in:
Maxime Coste 2017-10-20 15:17:02 +08:00
parent 3f627058b0
commit d9b4076e3c
2 changed files with 39 additions and 25 deletions

View File

@ -614,6 +614,7 @@ struct RegexCompiler
RegexCompiler(const ParsedRegex& parsed_regex, RegexCompileFlags flags, MatchDirection direction)
: m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward}
{
write_search_prefix();
compile_node(m_parsed_regex.ast);
push_inst(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
@ -788,6 +789,16 @@ private:
return pos;
}
// Emit the fixed set of prefix instructions used in the search use case.
//
// The prefix has to be written before any other instruction, as it occupies
// instruction indices [0, CompiledRegex::search_prefix_size). Both asserts
// below guard that layout.
void write_search_prefix()
{
    // Nothing may have been emitted yet: the prefix starts at index 0.
    kak_assert(m_program.instructions.size() == 0);

    // inst 0: split whose parameter is the index of the first instruction
    //         following the prefix.
    push_inst(CompiledRegex::Split_PrioritizeChild, CompiledRegex::search_prefix_size);
    // inst 1: the dedicated "find next start" instruction.
    push_inst(CompiledRegex::FindNextStart, 0);
    // inst 2: split whose parameter is 1, the index of FindNextStart above.
    push_inst(CompiledRegex::Split_PrioritizeParent, 1);

    // The declared prefix size must match what was actually emitted.
    kak_assert(CompiledRegex::search_prefix_size == m_program.instructions.size());
}
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
{
constexpr auto max_instructions = std::numeric_limits<uint16_t>::max();
@ -1003,6 +1014,8 @@ void dump_regex(const CompiledRegex& program)
printf("%s (%s)\n", name, str.c_str());
break;
}
case CompiledRegex::FindNextStart:
printf("find next start\n");
case CompiledRegex::Match:
printf("match\n");
}

View File

@ -28,6 +28,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
enum Op : char
{
Match,
FindNextStart,
Literal,
Literal_IgnoreCase,
AnyChar,
@ -62,6 +63,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
};
static_assert(sizeof(Instruction) == 8, "");
// Number of instructions in the search prefix emitted at the start of the
// program; it is also the index of the first instruction after that prefix.
static constexpr uint16_t search_prefix_size = 3;
explicit operator bool() const { return not instructions.empty(); }
Vector<Instruction, MemoryDomain::Regex> instructions;
@ -151,30 +154,15 @@ public:
const bool no_saves = (flags & RegexExecFlags::NoSaves);
Utf8It start{m_begin};
const bool search = (flags & RegexExecFlags::Search);
const CompiledRegex::StartChars* start_chars = m_program.start_chars.get();
if (flags & RegexExecFlags::Search)
to_next_start(start, m_end, start_chars);
Utf8It start{m_begin};
if (search)
to_next_start(start, m_end, m_program.start_chars.get());
ExecState state;
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
state))
return true;
if (not (flags & RegexExecFlags::Search))
return false;
do
{
to_next_start(++start, m_end, start_chars);
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
state))
return true;
}
while (start != m_end);
return false;
return exec_program(start, search ? 0 : CompiledRegex::search_prefix_size,
no_saves ? nullptr : new_saves<false>(nullptr), state);
}
ArrayView<const Iterator> captures() const
@ -247,10 +235,10 @@ private:
uint16_t step = -1;
};
enum class StepResult { Consumed, Matched, Failed };
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
// Steps a thread until it consumes the current character, matches or fail
StepResult step(const Utf8It& pos, Thread& thread, ExecState& state)
StepResult step(Utf8It& pos, Thread& thread, ExecState& state)
{
while (true)
{
@ -354,6 +342,11 @@ private:
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
return StepResult::Failed;
break;
case CompiledRegex::FindNextStart:
kak_assert(state.current_threads.empty()); // search thread should by construction be the lower priority one
if (state.next_threads.empty())
return StepResult::FindNextStart;
return StepResult::Consumed;
case CompiledRegex::Match:
return StepResult::Matched;
}
@ -361,9 +354,9 @@ private:
return StepResult::Failed;
}
bool exec_from(Utf8It pos, Saves* initial_saves, ExecState& state)
bool exec_program(Utf8It pos, uint16_t first_inst, Saves* initial_saves, ExecState& state)
{
state.current_threads.push_back({0, initial_saves});
state.current_threads.push_back({first_inst, initial_saves});
state.next_threads.clear();
bool found_match = false;
@ -377,6 +370,7 @@ private:
state.step = 1; // step 0 is never valid
}
bool find_next_start = false;
while (not state.current_threads.empty())
{
auto thread = state.current_threads.back();
@ -408,6 +402,10 @@ private:
m_program.instructions[thread.inst].scheduled = true;
state.next_threads.push_back(thread);
break;
case StepResult::FindNextStart:
state.next_threads.push_back(thread);
find_next_start = true;
break;
}
}
for (auto& thread : state.next_threads)
@ -420,6 +418,9 @@ private:
std::swap(state.current_threads, state.next_threads);
std::reverse(state.current_threads.begin(), state.current_threads.end());
++pos;
if (find_next_start)
to_next_start(pos, m_end, m_program.start_chars.get());
}
}