Regex: Go back to instruction based search of next start
The previous method, which was a bit faster in the general use case, can hit some cases where we get quadratic behaviour and very slow matching. By using an instruction, we can guarantee a complexity of O(N*M), as we will never have more than N threads (N being the instruction count) and we run the threads once per codepoint in the subject string (M being the number of codepoints). That slows down the general case slightly, but ensures we don't have pathological cases. This new version is much faster than the previous instruction-based search because it does not use a plain `.*` searcher, but a specific, smarter instruction specialized for finding the next start if we are in the correct conditions.
This commit is contained in:
parent
3f627058b0
commit
d9b4076e3c
|
@ -614,6 +614,7 @@ struct RegexCompiler
|
||||||
RegexCompiler(const ParsedRegex& parsed_regex, RegexCompileFlags flags, MatchDirection direction)
|
RegexCompiler(const ParsedRegex& parsed_regex, RegexCompileFlags flags, MatchDirection direction)
|
||||||
: m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward}
|
: m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward}
|
||||||
{
|
{
|
||||||
|
write_search_prefix();
|
||||||
compile_node(m_parsed_regex.ast);
|
compile_node(m_parsed_regex.ast);
|
||||||
push_inst(CompiledRegex::Match);
|
push_inst(CompiledRegex::Match);
|
||||||
m_program.matchers = m_parsed_regex.matchers;
|
m_program.matchers = m_parsed_regex.matchers;
|
||||||
|
@ -788,6 +789,16 @@ private:
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add the set of prefix instructions used in the search use case
|
||||||
|
void write_search_prefix()
|
||||||
|
{
|
||||||
|
kak_assert(m_program.instructions.empty());
|
||||||
|
push_inst(CompiledRegex::Split_PrioritizeChild, CompiledRegex::search_prefix_size);
|
||||||
|
push_inst(CompiledRegex::FindNextStart);
|
||||||
|
push_inst(CompiledRegex::Split_PrioritizeParent, 1);
|
||||||
|
kak_assert(m_program.instructions.size() == CompiledRegex::search_prefix_size);
|
||||||
|
}
|
||||||
|
|
||||||
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
|
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
|
||||||
{
|
{
|
||||||
constexpr auto max_instructions = std::numeric_limits<uint16_t>::max();
|
constexpr auto max_instructions = std::numeric_limits<uint16_t>::max();
|
||||||
|
@ -1003,6 +1014,8 @@ void dump_regex(const CompiledRegex& program)
|
||||||
printf("%s (%s)\n", name, str.c_str());
|
printf("%s (%s)\n", name, str.c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case CompiledRegex::FindNextStart:
|
||||||
|
printf("find next start\n");
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
printf("match\n");
|
printf("match\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
enum Op : char
|
enum Op : char
|
||||||
{
|
{
|
||||||
Match,
|
Match,
|
||||||
|
FindNextStart,
|
||||||
Literal,
|
Literal,
|
||||||
Literal_IgnoreCase,
|
Literal_IgnoreCase,
|
||||||
AnyChar,
|
AnyChar,
|
||||||
|
@ -62,6 +63,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
};
|
};
|
||||||
static_assert(sizeof(Instruction) == 8, "");
|
static_assert(sizeof(Instruction) == 8, "");
|
||||||
|
|
||||||
|
static constexpr uint16_t search_prefix_size = 3;
|
||||||
|
|
||||||
explicit operator bool() const { return not instructions.empty(); }
|
explicit operator bool() const { return not instructions.empty(); }
|
||||||
|
|
||||||
Vector<Instruction, MemoryDomain::Regex> instructions;
|
Vector<Instruction, MemoryDomain::Regex> instructions;
|
||||||
|
@ -151,30 +154,15 @@ public:
|
||||||
|
|
||||||
|
|
||||||
const bool no_saves = (flags & RegexExecFlags::NoSaves);
|
const bool no_saves = (flags & RegexExecFlags::NoSaves);
|
||||||
Utf8It start{m_begin};
|
const bool search = (flags & RegexExecFlags::Search);
|
||||||
|
|
||||||
const CompiledRegex::StartChars* start_chars = m_program.start_chars.get();
|
Utf8It start{m_begin};
|
||||||
if (flags & RegexExecFlags::Search)
|
if (search)
|
||||||
to_next_start(start, m_end, start_chars);
|
to_next_start(start, m_end, m_program.start_chars.get());
|
||||||
|
|
||||||
ExecState state;
|
ExecState state;
|
||||||
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
return exec_program(start, search ? 0 : CompiledRegex::search_prefix_size,
|
||||||
state))
|
no_saves ? nullptr : new_saves<false>(nullptr), state);
|
||||||
return true;
|
|
||||||
|
|
||||||
if (not (flags & RegexExecFlags::Search))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
to_next_start(++start, m_end, start_chars);
|
|
||||||
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
|
||||||
state))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
while (start != m_end);
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ArrayView<const Iterator> captures() const
|
ArrayView<const Iterator> captures() const
|
||||||
|
@ -247,10 +235,10 @@ private:
|
||||||
uint16_t step = -1;
|
uint16_t step = -1;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum class StepResult { Consumed, Matched, Failed };
|
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
|
||||||
|
|
||||||
// Steps a thread until it consumes the current character, matches or fails
|
// Steps a thread until it consumes the current character, matches or fails
|
||||||
StepResult step(const Utf8It& pos, Thread& thread, ExecState& state)
|
StepResult step(Utf8It& pos, Thread& thread, ExecState& state)
|
||||||
{
|
{
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
|
@ -354,6 +342,11 @@ private:
|
||||||
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
|
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
|
case CompiledRegex::FindNextStart:
|
||||||
|
kak_assert(state.current_threads.empty()); // search thread should by construction be the lower priority one
|
||||||
|
if (state.next_threads.empty())
|
||||||
|
return StepResult::FindNextStart;
|
||||||
|
return StepResult::Consumed;
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
return StepResult::Matched;
|
return StepResult::Matched;
|
||||||
}
|
}
|
||||||
|
@ -361,9 +354,9 @@ private:
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_from(Utf8It pos, Saves* initial_saves, ExecState& state)
|
bool exec_program(Utf8It pos, uint16_t first_inst, Saves* initial_saves, ExecState& state)
|
||||||
{
|
{
|
||||||
state.current_threads.push_back({0, initial_saves});
|
state.current_threads.push_back({first_inst, initial_saves});
|
||||||
state.next_threads.clear();
|
state.next_threads.clear();
|
||||||
|
|
||||||
bool found_match = false;
|
bool found_match = false;
|
||||||
|
@ -377,6 +370,7 @@ private:
|
||||||
state.step = 1; // step 0 is never valid
|
state.step = 1; // step 0 is never valid
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool find_next_start = false;
|
||||||
while (not state.current_threads.empty())
|
while (not state.current_threads.empty())
|
||||||
{
|
{
|
||||||
auto thread = state.current_threads.back();
|
auto thread = state.current_threads.back();
|
||||||
|
@ -408,6 +402,10 @@ private:
|
||||||
m_program.instructions[thread.inst].scheduled = true;
|
m_program.instructions[thread.inst].scheduled = true;
|
||||||
state.next_threads.push_back(thread);
|
state.next_threads.push_back(thread);
|
||||||
break;
|
break;
|
||||||
|
case StepResult::FindNextStart:
|
||||||
|
state.next_threads.push_back(thread);
|
||||||
|
find_next_start = true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto& thread : state.next_threads)
|
for (auto& thread : state.next_threads)
|
||||||
|
@ -420,6 +418,9 @@ private:
|
||||||
std::swap(state.current_threads, state.next_threads);
|
std::swap(state.current_threads, state.next_threads);
|
||||||
std::reverse(state.current_threads.begin(), state.current_threads.end());
|
std::reverse(state.current_threads.begin(), state.current_threads.end());
|
||||||
++pos;
|
++pos;
|
||||||
|
|
||||||
|
if (find_next_start)
|
||||||
|
to_next_start(pos, m_end, m_program.start_chars.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user