From 3b69dda04efb1e5ed9109d9fdebe2a73ee177b41 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Fri, 6 Oct 2017 13:40:27 +0800 Subject: [PATCH] Regex: Find potential start position using a map of valid start chars With this optimization we get close to performance parity with boost regex on the common use cases in Kakoune. --- src/regex.cc | 3 +- src/regex.hh | 12 +++--- src/regex_impl.cc | 93 +++++++++++++++++++++++++++++++++++++++++++++++ src/regex_impl.hh | 29 +++++++++++++-- 4 files changed, 127 insertions(+), 10 deletions(-) diff --git a/src/regex.cc b/src/regex.cc index 749dced8..a622269b 100644 --- a/src/regex.cc +++ b/src/regex.cc @@ -11,7 +11,8 @@ using Utf8It = RegexUtf8It; Regex::Regex(StringView re, flag_type flags) try : RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()} { - m_impl = compile_regex(re); + if (auto compiled_regex = compile_regex(re)) + m_impl = new CompiledRegex{std::move(compiled_regex)}; } catch (std::runtime_error& err) { throw regex_error(err.what()); } String option_to_string(const Regex& re) diff --git a/src/regex.hh b/src/regex.hh index 5e2f4b01..64e24304 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -36,11 +36,11 @@ public: static constexpr const char* option_type_name = "regex"; - const CompiledRegex& impl() const { return m_impl; } + const CompiledRegex* impl() const { return m_impl.get(); } private: String m_str; - CompiledRegex m_impl; + RefPtr m_impl; }; template @@ -143,7 +143,7 @@ bool regex_match(It begin, It end, const Regex& re) try { bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, re); - if (re.impl() and matched != regex_match(begin, end, re.impl())) + if (re.impl() and matched != regex_match(begin, end, *re.impl())) regex_mismatch(re); return matched; } @@ -160,7 +160,7 @@ bool regex_match(It begin, It end, MatchResults& res, const Regex& re) { bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, res, re); Vector captures; - if (re.impl() and matched != regex_match(begin, end, captures, re.impl())) + if (re.impl() and matched != regex_match(begin, end, captures, *re.impl())) regex_mismatch(re); if (re.impl() and matched) check_captures(re, res, captures); @@ -179,7 +179,7 @@ bool regex_search(It begin, It end, const Regex& re, try { bool matched = boost::regex_search>({begin, begin, end}, {end, begin, end}, re, flags); - if (re.impl() and matched != regex_search(begin, end, re.impl(), convert_flags(flags))) + if (re.impl() and matched != regex_search(begin, end, *re.impl(), convert_flags(flags))) regex_mismatch(re); return matched; } @@ -197,7 +197,7 @@ bool regex_search(It begin, It end, MatchResults& res, const Regex& re, { bool matched = boost::regex_search>({begin, begin, end}, {end, begin, end}, res, re, flags); Vector captures; - if (re.impl() and matched != regex_search(begin, end, captures, re.impl(), convert_flags(flags))) + if (re.impl() and matched != regex_search(begin, end, captures, *re.impl(), convert_flags(flags))) regex_mismatch(re); if (re.impl() and matched) check_captures(re, res, captures); diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 52c45d61..de00ce87 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -515,6 +515,7 @@ struct RegexCompiler push_op(CompiledRegex::Match); m_program.matchers = m_parsed_regex.matchers; m_program.save_count = m_parsed_regex.capture_count * 2; + m_program.start_chars = compute_start_chars(); } CompiledRegex get_compiled_regex() { return std::move(m_program); } @@ -708,6 +709,87 @@ private: push_codepoint(cp->value); } + // Fills accepted and rejected according to which chars can start the given node, + // returns true if the node did not consume the char, hence a following node in + // sequence would be still relevant for the parent node start chars computation. + bool compute_start_chars(const ParsedRegex::AstNodePtr& node, + bool (&accepted)[256], bool (&rejected)[256]) const + { + switch (node->op) + { + case ParsedRegex::Literal: + if (node->value < 256) + accepted[node->value] = true; + return node->quantifier.allows_none(); + case ParsedRegex::AnyChar: + for (auto& b : accepted) + b = true; + return node->quantifier.allows_none(); + case ParsedRegex::Matcher: + for (auto& b : accepted) // treat matcher as everything can match for now + b = true; + return node->quantifier.allows_none(); + case ParsedRegex::Sequence: + { + bool consumed = false; + for (auto& child : node->children) + { + if (not compute_start_chars(child, accepted, rejected)) + { + consumed = true; + break; + } + } + return not consumed or node->quantifier.allows_none(); + } + case ParsedRegex::Alternation: + { + bool all_consumed = not node->quantifier.allows_none(); + for (auto& child : node->children) + { + if (compute_start_chars(child, accepted, rejected)) + all_consumed = false; + } + return not all_consumed; + } + case ParsedRegex::LineStart: + case ParsedRegex::LineEnd: + case ParsedRegex::WordBoundary: + case ParsedRegex::NotWordBoundary: + case ParsedRegex::SubjectBegin: + case ParsedRegex::SubjectEnd: + case ParsedRegex::ResetStart: + return true; + case ParsedRegex::LookAhead: + if (node->children.empty()) + compute_start_chars(node->children.front(), accepted, rejected); + return true; + case ParsedRegex::NegativeLookAhead: + if (node->children.empty()) + compute_start_chars(node->children.front(), rejected, accepted); + return true; + case ParsedRegex::LookBehind: + return true; + case ParsedRegex::NegativeLookBehind: + return true; + } + return false; + } + + std::unique_ptr compute_start_chars() const + { + bool accepted[256] = {}; + bool rejected[256] = {}; + if (compute_start_chars(m_parsed_regex.ast, accepted, rejected)) + return nullptr; + + auto start_chars = std::make_unique(); + for (int i = 0; i < 256; ++i) + start_chars->map[i] = accepted[i] and not rejected[i]; + + return start_chars; + } + CompiledRegex m_program; const ParsedRegex& m_parsed_regex; }; @@ -1020,6 +1102,17 @@ auto test_regex = UnitTest{[]{ kak_assert(vm.exec("foofoofoo")); kak_assert(not vm.exec("barbarbar")); } + + { + TestVM vm{R"((?pos[0] == '\n'); + } }}; } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 0c15b57a..9e81a701 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -6,11 +6,12 @@ #include "utf8_iterator.hh" #include "vector.hh" #include "flags.hh" +#include "ref_ptr.hh" namespace Kakoune { -struct CompiledRegex +struct CompiledRegex : RefCountable { enum Op : char { @@ -41,6 +42,9 @@ struct CompiledRegex Vector bytecode; Vector> matchers; size_t save_count; + + struct StartChars { bool map[256]; }; + std::unique_ptr start_chars; }; CompiledRegex compile_regex(StringView re); @@ -311,6 +315,16 @@ struct ThreadedRegexVM return false; } + void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars) + { + if (not start_chars) + return; + + while (start != end and *start >= 0 and *start < 256 and + not start_chars[*start]) + ++start; + } + bool exec(Iterator begin, Iterator end, RegexExecFlags flags) { m_begin = begin; @@ -324,6 +338,12 @@ struct ThreadedRegexVM const bool no_saves = (m_flags & RegexExecFlags::NoSaves); Utf8It start{m_begin, m_begin, m_end}; + + const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr; + + if (flags & RegexExecFlags::Search) + to_next_start(start, end, start_chars); + if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), current_threads, next_threads)) return true; @@ -331,12 +351,15 @@ struct ThreadedRegexVM if (not (flags & RegexExecFlags::Search)) return false; - while (start != end) + do { - if (exec_from(++start, no_saves ? nullptr : new_saves(nullptr), + to_next_start(++start, end, start_chars); + if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), current_threads, next_threads)) return true; } + while (start != end); + return false; }