From d9b4076e3ca0f4f58e6a9bdea741e919d4ee3f6c Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Fri, 20 Oct 2017 15:17:02 +0800 Subject: [PATCH] Regex: Go back to instruction based search of next start The previous method, which was a bit faster in the general use case, can hit some cases where we get quadratic behaviour and very slow matching. By using an instruction, we can guarantee our complexity of O(N*M) as we will never have more than N threads (N being the instruction count) and we run the threads once per codepoint in the subject string. That slows down the general case slightly, but ensures we don't have pathological cases. This new version is much faster than the previous instruction based search because it does not use a plain `.*` searcher, but a specific, smarter instruction specialized for finding the next start if we are in the correct conditions. --- src/regex_impl.cc | 13 ++++++++++++ src/regex_impl.hh | 51 ++++++++++++++++++++++++----------------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index c32bbbf1..f02e6c2a 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -614,6 +614,7 @@ struct RegexCompiler RegexCompiler(const ParsedRegex& parsed_regex, RegexCompileFlags flags, MatchDirection direction) : m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward} { + write_search_prefix(); compile_node(m_parsed_regex.ast); push_inst(CompiledRegex::Match); m_program.matchers = m_parsed_regex.matchers; @@ -788,6 +789,16 @@ private: return pos; } + // Add a set of prefix instructions used in the search use case + void write_search_prefix() + { + kak_assert(m_program.instructions.empty()); + push_inst(CompiledRegex::Split_PrioritizeChild, CompiledRegex::search_prefix_size); + push_inst(CompiledRegex::FindNextStart); + push_inst(CompiledRegex::Split_PrioritizeParent, 1); + kak_assert(m_program.instructions.size() == CompiledRegex::search_prefix_size); 
} + uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0) { constexpr auto max_instructions = std::numeric_limits::max(); @@ -1003,6 +1014,9 @@ void dump_regex(const CompiledRegex& program) printf("%s (%s)\n", name, str.c_str()); break; } + case CompiledRegex::FindNextStart: + printf("find next start\n"); + break; case CompiledRegex::Match: printf("match\n"); } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 108fe626..90411e94 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -28,6 +28,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain enum Op : char { Match, + FindNextStart, Literal, Literal_IgnoreCase, AnyChar, @@ -62,6 +63,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain }; static_assert(sizeof(Instruction) == 8, ""); + static constexpr uint16_t search_prefix_size = 3; + explicit operator bool() const { return not instructions.empty(); } Vector instructions; @@ -151,30 +154,15 @@ public: const bool no_saves = (flags & RegexExecFlags::NoSaves); - Utf8It start{m_begin}; + const bool search = (flags & RegexExecFlags::Search); - const CompiledRegex::StartChars* start_chars = m_program.start_chars.get(); - if (flags & RegexExecFlags::Search) - to_next_start(start, m_end, start_chars); + Utf8It start{m_begin}; + if (search) + to_next_start(start, m_end, m_program.start_chars.get()); ExecState state; - if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), - state)) - return true; - - if (not (flags & RegexExecFlags::Search)) - return false; - - do - { - to_next_start(++start, m_end, start_chars); - if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), - state)) - return true; - } - while (start != m_end); - - return false; + return exec_program(start, search ? 0 : CompiledRegex::search_prefix_size, + no_saves ? 
nullptr : new_saves(nullptr), state); } ArrayView captures() const @@ -247,10 +235,10 @@ private: uint16_t step = -1; }; - enum class StepResult { Consumed, Matched, Failed }; + enum class StepResult { Consumed, Matched, Failed, FindNextStart }; // Steps a thread until it consumes the current character, matches or fail - StepResult step(const Utf8It& pos, Thread& thread, ExecState& state) + StepResult step(Utf8It& pos, Thread& thread, ExecState& state) { while (true) { @@ -354,6 +342,11 @@ private: (inst.op == CompiledRegex::LookBehind_IgnoreCase)) return StepResult::Failed; break; + case CompiledRegex::FindNextStart: + kak_assert(state.current_threads.empty()); // search thread should by construction be the lower priority one + if (state.next_threads.empty()) + return StepResult::FindNextStart; + return StepResult::Consumed; case CompiledRegex::Match: return StepResult::Matched; } @@ -361,9 +354,9 @@ private: return StepResult::Failed; } - bool exec_from(Utf8It pos, Saves* initial_saves, ExecState& state) + bool exec_program(Utf8It pos, uint16_t first_inst, Saves* initial_saves, ExecState& state) { - state.current_threads.push_back({0, initial_saves}); + state.current_threads.push_back({first_inst, initial_saves}); state.next_threads.clear(); bool found_match = false; @@ -377,6 +370,7 @@ private: state.step = 1; // step 0 is never valid } + bool find_next_start = false; while (not state.current_threads.empty()) { auto thread = state.current_threads.back(); @@ -408,6 +402,10 @@ private: m_program.instructions[thread.inst].scheduled = true; state.next_threads.push_back(thread); break; + case StepResult::FindNextStart: + state.next_threads.push_back(thread); + find_next_start = true; + break; } } for (auto& thread : state.next_threads) @@ -420,6 +418,9 @@ private: std::swap(state.current_threads, state.next_threads); std::reverse(state.current_threads.begin(), state.current_threads.end()); ++pos; + + if (find_next_start) + to_next_start(pos, m_end, 
m_program.start_chars.get()); } }