From dbb175841b65ff4c1d8f7bad92fab9841c04e9b0 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Wed, 4 Oct 2017 12:16:52 +0800 Subject: [PATCH] Regex: do not write the search prefix inside the program bytecode It's faster to have specialized code in the VM directly --- src/regex_impl.cc | 12 -------- src/regex_impl.hh | 76 +++++++++++++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index c8c9d6b2..c7950bd8 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -502,7 +502,6 @@ struct RegexCompiler RegexCompiler(const ParsedRegex& parsed_regex) : m_parsed_regex{parsed_regex} { - write_search_prefix(); compile_node(m_parsed_regex.ast); push_op(CompiledRegex::Match); m_program.matchers = m_parsed_regex.matchers; @@ -659,17 +658,6 @@ private: return pos; } - // Add a '.*' as the first instructions for the search use case - void write_search_prefix() - { - kak_assert(m_program.bytecode.empty()); - push_op(CompiledRegex::Split_PrioritizeChild); - get_offset(alloc_offset()) = CompiledRegex::search_prefix_size; - push_op(CompiledRegex::AnyChar); - push_op(CompiledRegex::Split_PrioritizeParent); - get_offset(alloc_offset()) = 1 + sizeof(Offset); - } - Offset alloc_offset() { auto pos = m_program.bytecode.size(); diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 4d073473..0176b223 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -36,8 +36,6 @@ struct CompiledRegex }; using Offset = unsigned; - static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset); - explicit operator bool() const { return not bytecode.empty(); } Vector bytecode; @@ -103,18 +101,18 @@ struct ThreadedRegexVM } }; - Saves* clone_saves(Saves* saves) + Saves* clone_saves(Iterator* pos) { if (not m_free_saves.empty()) { Saves* res = m_free_saves.back(); m_free_saves.pop_back(); res->refcount = 1; - std::copy(saves->pos, saves->pos + m_program.save_count, res->pos); + std::copy(pos, pos + 
m_program.save_count, res->pos); return res; } - m_saves.push_back(Saves::allocate(m_program.save_count, saves->pos)); + m_saves.push_back(Saves::allocate(m_program.save_count, pos)); return m_saves.back(); } @@ -130,6 +128,8 @@ struct ThreadedRegexVM Saves* saves; }; + using Utf8It = utf8::iterator; + enum class StepResult { Consumed, Matched, Failed }; StepResult step(Thread& thread, Vector& threads) { @@ -181,7 +181,7 @@ struct ThreadedRegexVM if (thread.saves->refcount > 1) { --thread.saves->refcount; - thread.saves = clone_saves(thread.saves); + thread.saves = clone_saves(thread.saves->pos); } const size_t index = *thread.inst++; thread.saves->pos[index] = m_pos.base(); @@ -250,30 +250,13 @@ struct ThreadedRegexVM return StepResult::Failed; } - bool exec(Iterator begin, Iterator end, RegexExecFlags flags) + bool exec_from(Utf8It start, Saves* initial_saves, Vector& current_threads, Vector& next_threads) { - m_begin = begin; - m_end = end; - m_flags = flags; + current_threads.push_back({m_program.bytecode.data(), initial_saves}); + next_threads.clear(); bool found_match = false; - - if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end) - return false; - - Saves* initial_saves = nullptr; - if (not (m_flags & RegexExecFlags::NoSaves)) - { - m_saves.push_back(Saves::allocate(m_program.save_count)); - initial_saves = m_saves.back(); - } - - const bool search = (flags & RegexExecFlags::Search); - - const auto start_offset = search ? 
0 : CompiledRegex::search_prefix_size; - Vector current_threads{Thread{m_program.bytecode.data() + start_offset, initial_saves}}; - Vector next_threads; - for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos) + for (m_pos = start; m_pos != m_end; ++m_pos) { while (not current_threads.empty()) { @@ -282,15 +265,15 @@ struct ThreadedRegexVM switch (step(thread, current_threads)) { case StepResult::Matched: - if (not search or // We are not at end, this is not a full match - (flags & RegexExecFlags::NotInitialNull and m_pos == m_begin)) + if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match + (m_flags & RegexExecFlags::NotInitialNull and m_pos == m_begin)) { release_saves(thread.saves); continue; } m_captures = thread.saves; - if (flags & RegexExecFlags::AnyMatch) + if (m_flags & RegexExecFlags::AnyMatch) return true; found_match = true; @@ -330,6 +313,37 @@ struct ThreadedRegexVM return false; } + bool exec(Iterator begin, Iterator end, RegexExecFlags flags) + { + m_begin = begin; + m_end = end; + m_flags = flags; + + if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end) + return false; + + Vector current_threads, next_threads; + + const bool no_saves = (m_flags & RegexExecFlags::NoSaves); + Vector empty_saves(m_program.save_count, Iterator{}); + + Utf8It start{m_begin, m_begin, m_end}; + if (exec_from(start, no_saves ? nullptr : clone_saves(empty_saves.data()), + current_threads, next_threads)) + return true; + + if (not (flags & RegexExecFlags::Search)) + return false; + + while (start != end) + { + if (exec_from(++start, no_saves ? 
nullptr : clone_saves(empty_saves.data()), + current_threads, next_threads)) + return true; + } + return false; + } + bool is_line_start() const { return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or @@ -351,8 +365,6 @@ struct ThreadedRegexVM const CompiledRegex& m_program; - using Utf8It = utf8::iterator; - Iterator m_begin; Iterator m_end; Utf8It m_pos;