Refactor regex find next start not to be an instruction anymore

The same logic can be hard-coded in the matching loop, which saves one
thread and three instructions and improves regex matching speed.
This commit is contained in:
Maxime Coste 2019-01-05 18:28:56 +11:00
parent fd043435e5
commit 0364a99827
2 changed files with 15 additions and 36 deletions

View File

@ -676,14 +676,13 @@ struct RegexCompiler
{ {
kak_assert(not (flags & RegexCompileFlags::NoForward) or flags & RegexCompileFlags::Backward); kak_assert(not (flags & RegexCompileFlags::NoForward) or flags & RegexCompileFlags::Backward);
// Approximation of the number of instructions generated // Approximation of the number of instructions generated
m_program.instructions.reserve((CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1) m_program.instructions.reserve((parsed_regex.nodes.size() + 1)
* (((flags & RegexCompileFlags::Backward) and * (((flags & RegexCompileFlags::Backward) and
not (flags & RegexCompileFlags::NoForward)) ? 2 : 1)); not (flags & RegexCompileFlags::NoForward)) ? 2 : 1));
if (not (flags & RegexCompileFlags::NoForward)) if (not (flags & RegexCompileFlags::NoForward))
{ {
m_program.forward_start_desc = compute_start_desc<RegexMode::Forward>(); m_program.forward_start_desc = compute_start_desc<RegexMode::Forward>();
write_search_prefix();
compile_node<RegexMode::Forward>(0); compile_node<RegexMode::Forward>(0);
push_inst(CompiledRegex::Match); push_inst(CompiledRegex::Match);
} }
@ -692,7 +691,6 @@ struct RegexCompiler
{ {
m_program.first_backward_inst = m_program.instructions.size(); m_program.first_backward_inst = m_program.instructions.size();
m_program.backward_start_desc = compute_start_desc<RegexMode::Backward>(); m_program.backward_start_desc = compute_start_desc<RegexMode::Backward>();
write_search_prefix();
compile_node<RegexMode::Backward>(0); compile_node<RegexMode::Backward>(0);
push_inst(CompiledRegex::Match); push_inst(CompiledRegex::Match);
} }
@ -866,16 +864,6 @@ private:
return start_pos; return start_pos;
} }
// Add a sequence of instructions that enable searching for a match instead of checking for it
void write_search_prefix()
{
const uint32_t first_inst = m_program.instructions.size();
push_inst(CompiledRegex::Split_PrioritizeChild, first_inst + CompiledRegex::search_prefix_size);
push_inst(CompiledRegex::FindNextStart);
push_inst(CompiledRegex::Split_PrioritizeParent, first_inst + 1);
kak_assert(m_program.instructions.size() == first_inst + CompiledRegex::search_prefix_size);
}
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0) uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
{ {
constexpr auto max_instructions = std::numeric_limits<int16_t>::max(); constexpr auto max_instructions = std::numeric_limits<int16_t>::max();
@ -1137,9 +1125,6 @@ String dump_regex(const CompiledRegex& program)
res += format("{} ({})\n", name, str); res += format("{} ({})\n", name, str);
break; break;
} }
case CompiledRegex::FindNextStart:
res += "find next start\n";
break;
case CompiledRegex::Match: case CompiledRegex::Match:
res += "match\n"; res += "match\n";
} }

View File

@ -49,7 +49,6 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
enum Op : char enum Op : char
{ {
Match, Match,
FindNextStart,
Literal, Literal,
Literal_IgnoreCase, Literal_IgnoreCase,
AnyChar, AnyChar,
@ -97,8 +96,6 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
}; };
static_assert(sizeof(Instruction) == 8, ""); static_assert(sizeof(Instruction) == 8, "");
static constexpr uint16_t search_prefix_size = 3;
explicit operator bool() const { return not instructions.empty(); } explicit operator bool() const { return not instructions.empty(); }
struct NamedCapture struct NamedCapture
@ -217,8 +214,6 @@ public:
instructions = instructions.subrange(0, m_program.first_backward_inst); instructions = instructions.subrange(0, m_program.first_backward_inst);
else else
instructions = instructions.subrange(m_program.first_backward_inst); instructions = instructions.subrange(m_program.first_backward_inst);
if (not search)
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
const ExecConfig config{ const ExecConfig config{
Sentinel{forward ? begin : end}, Sentinel{forward ? begin : end},
@ -452,14 +447,6 @@ private:
(inst.op == CompiledRegex::LookBehind_IgnoreCase)) (inst.op == CompiledRegex::LookBehind_IgnoreCase))
return failed(); return failed();
break; break;
case CompiledRegex::FindNextStart:
// search thread should by construction be the lowest priority thread
kak_assert(m_threads.current_is_empty());
if (not m_threads.next_is_empty())
return consumed();
m_threads.push_next(thread);
m_find_next_start = true;
return;
case CompiledRegex::Match: case CompiledRegex::Match:
if ((pos != config.end and not (mode & RegexMode::Search)) or if ((pos != config.end and not (mode & RegexMode::Search)) or
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin)) (config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
@ -484,10 +471,12 @@ private:
release_saves(m_captures); release_saves(m_captures);
m_captures = -1; m_captures = -1;
m_threads.grow_ifn(); m_threads.grow_ifn();
m_threads.push_current({static_cast<int16_t>(&config.instructions[0] - &m_program.instructions[0]), -1}); const int16_t first_inst = &config.instructions[0] - &m_program.instructions[0];
m_threads.push_current({first_inst, -1});
const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc; const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc;
constexpr bool search = mode & RegexMode::Search;
constexpr bool any_match = mode & RegexMode::AnyMatch; constexpr bool any_match = mode & RegexMode::AnyMatch;
uint16_t current_step = -1; uint16_t current_step = -1;
m_found_match = false; m_found_match = false;
@ -501,14 +490,15 @@ private:
current_step = 1; // step 0 is never valid current_step = 1; // step 0 is never valid
} }
m_find_next_start = false;
while (not m_threads.current_is_empty()) while (not m_threads.current_is_empty())
step_thread(pos, current_step, m_threads.pop_current(), config); step_thread(pos, current_step, m_threads.pop_current(), config);
for (auto& thread : m_threads.next_threads()) for (auto& thread : m_threads.next_threads())
m_program.instructions[thread.inst].scheduled = false; m_program.instructions[thread.inst].scheduled = false;
if (pos == config.end or m_threads.next_is_empty() or (m_found_match and any_match)) if (pos == config.end or
(m_threads.next_is_empty() and (not search or m_found_match)) or
(m_found_match and any_match))
{ {
for (auto& t : m_threads.next_threads()) for (auto& t : m_threads.next_threads())
release_saves(t.saves); release_saves(t.saves);
@ -516,12 +506,17 @@ private:
return m_found_match; return m_found_match;
} }
m_threads.swap_next();
forward ? utf8::to_next(pos, config.subject_end) forward ? utf8::to_next(pos, config.subject_end)
: utf8::to_previous(pos, config.subject_begin); : utf8::to_previous(pos, config.subject_begin);
if (m_find_next_start and start_desc) if (search)
to_next_start(pos, config, *start_desc); {
if (start_desc and m_threads.next_is_empty())
to_next_start(pos, config, *start_desc);
m_threads.grow_ifn();
m_threads.push_next({first_inst, -1});
}
m_threads.swap_next();
} }
} }
@ -691,7 +686,6 @@ private:
int16_t m_first_free = -1; int16_t m_first_free = -1;
int16_t m_captures = -1; int16_t m_captures = -1;
bool m_found_match = false; bool m_found_match = false;
bool m_find_next_start = false;
}; };
} }