From 0364a998273cdf91904a6cd6d1cf921e44b41c84 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Sat, 5 Jan 2019 18:28:56 +1100 Subject: [PATCH] Refactor regex find next start not to be an instruction anymore The same logic can be hard coded, avoiding one thread and 3 instructions, improving the regex matching speed. --- src/regex_impl.cc | 17 +---------------- src/regex_impl.hh | 34 ++++++++++++++-------------------- 2 files changed, 15 insertions(+), 36 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 6e5bee51..28d201f5 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -676,14 +676,13 @@ struct RegexCompiler { kak_assert(not (flags & RegexCompileFlags::NoForward) or flags & RegexCompileFlags::Backward); // Approximation of the number of instructions generated - m_program.instructions.reserve((CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1) + m_program.instructions.reserve((parsed_regex.nodes.size() + 1) * (((flags & RegexCompileFlags::Backward) and not (flags & RegexCompileFlags::NoForward)) ? 2 : 1)); if (not (flags & RegexCompileFlags::NoForward)) { m_program.forward_start_desc = compute_start_desc(); - write_search_prefix(); compile_node(0); push_inst(CompiledRegex::Match); } @@ -692,7 +691,6 @@ struct RegexCompiler { m_program.first_backward_inst = m_program.instructions.size(); m_program.backward_start_desc = compute_start_desc(); - write_search_prefix(); compile_node(0); push_inst(CompiledRegex::Match); } @@ -866,16 +864,6 @@ private: return start_pos; } - // Add a sequence of instructions that enable searching for a match instead of checking for it - void write_search_prefix() - { - const uint32_t first_inst = m_program.instructions.size(); - push_inst(CompiledRegex::Split_PrioritizeChild, first_inst + CompiledRegex::search_prefix_size); - push_inst(CompiledRegex::FindNextStart); - push_inst(CompiledRegex::Split_PrioritizeParent, first_inst + 1); - kak_assert(m_program.instructions.size() == first_inst + CompiledRegex::search_prefix_size); - } - uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0) { constexpr auto max_instructions = std::numeric_limits::max(); @@ -1137,9 +1125,6 @@ String dump_regex(const CompiledRegex& program) res += format("{} ({})\n", name, str); break; } - case CompiledRegex::FindNextStart: - res += "find next start\n"; - break; case CompiledRegex::Match: res += "match\n"; } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 99ec2585..9ab89c71 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -49,7 +49,6 @@ struct CompiledRegex : RefCountable, UseMemoryDomain enum Op : char { Match, - FindNextStart, Literal, Literal_IgnoreCase, AnyChar, @@ -97,8 +96,6 @@ struct CompiledRegex : RefCountable, UseMemoryDomain }; static_assert(sizeof(Instruction) == 8, ""); - static constexpr uint16_t search_prefix_size = 3; - explicit operator bool() const { return not instructions.empty(); } struct NamedCapture @@ -217,8 +214,6 @@ public: instructions = instructions.subrange(0, m_program.first_backward_inst); else instructions = instructions.subrange(m_program.first_backward_inst); - if (not search) - instructions = instructions.subrange(CompiledRegex::search_prefix_size); const ExecConfig config{ Sentinel{forward ? begin : end}, @@ -452,14 +447,6 @@ private: (inst.op == CompiledRegex::LookBehind_IgnoreCase)) return failed(); break; - case CompiledRegex::FindNextStart: - // search thread should by construction be the lowest priority thread - kak_assert(m_threads.current_is_empty()); - if (not m_threads.next_is_empty()) - return consumed(); - m_threads.push_next(thread); - m_find_next_start = true; - return; case CompiledRegex::Match: if ((pos != config.end and not (mode & RegexMode::Search)) or (config.flags & RegexExecFlags::NotInitialNull and pos == config.begin)) @@ -484,10 +471,12 @@ private: release_saves(m_captures); m_captures = -1; m_threads.grow_ifn(); - m_threads.push_current({static_cast(&config.instructions[0] - &m_program.instructions[0]), -1}); + const int16_t first_inst = &config.instructions[0] - &m_program.instructions[0]; + m_threads.push_current({first_inst, -1}); const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc; + constexpr bool search = mode & RegexMode::Search; constexpr bool any_match = mode & RegexMode::AnyMatch; uint16_t current_step = -1; m_found_match = false; @@ -501,14 +490,15 @@ private: current_step = 1; // step 0 is never valid } - m_find_next_start = false; while (not m_threads.current_is_empty()) step_thread(pos, current_step, m_threads.pop_current(), config); for (auto& thread : m_threads.next_threads()) m_program.instructions[thread.inst].scheduled = false; - if (pos == config.end or m_threads.next_is_empty() or (m_found_match and any_match)) + if (pos == config.end or + (m_threads.next_is_empty() and (not search or m_found_match)) or + (m_found_match and any_match)) { for (auto& t : m_threads.next_threads()) release_saves(t.saves); @@ -516,12 +506,17 @@ private: return m_found_match; } - m_threads.swap_next(); forward ? utf8::to_next(pos, config.subject_end) : utf8::to_previous(pos, config.subject_begin); - if (m_find_next_start and start_desc) - to_next_start(pos, config, *start_desc); + if (search) + { + if (start_desc and m_threads.next_is_empty()) + to_next_start(pos, config, *start_desc); + m_threads.grow_ifn(); + m_threads.push_next({first_inst, -1}); + } + m_threads.swap_next(); } } @@ -691,7 +686,6 @@ private: int16_t m_first_free = -1; int16_t m_captures = -1; bool m_found_match = false; - bool m_find_next_start = false; }; }