Regex: Find potential start position using a map of valid start chars

With this optimization we get close to performance parity with boost regex on the common use cases in Kakoune.
2017-10-06 13:40:27 +08:00 · 2017-10-06 13:40:27 +08:00 · 3b69dda04e
commit 3b69dda04e
parent 741772aef9
4 changed files with 127 additions and 10 deletions
--- a/src/regex.cc
+++ b/src/regex.cc
@ -11,7 +11,8 @@ using Utf8It = RegexUtf8It<const char*>;
 Regex::Regex(StringView re, flag_type flags) try
    : RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()}
 {
-    m_impl = compile_regex(re);
+    if (auto compiled_regex = compile_regex(re))
        m_impl = new CompiledRegex{std::move(compiled_regex)};
 } catch (std::runtime_error& err) { throw regex_error(err.what()); }
 String option_to_string(const Regex& re)
--- a/src/regex.hh
+++ b/src/regex.hh
@ -36,11 +36,11 @@ public:
    static constexpr const char* option_type_name = "regex";
-    const CompiledRegex& impl() const { return m_impl; }
+    const CompiledRegex* impl() const { return m_impl.get(); }
 private:
    String m_str;
-    CompiledRegex m_impl;
+    RefPtr<CompiledRegex> m_impl;
 };
 template<typename It>
@ -143,7 +143,7 @@ bool regex_match(It begin, It end, const Regex& re)
    try
    {
        bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
-        if (re.impl() and matched != regex_match(begin, end, re.impl()))
+        if (re.impl() and matched != regex_match(begin, end, *re.impl()))
            regex_mismatch(re);
        return matched;
    }
@ -160,7 +160,7 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
    {
        bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
        Vector<It> captures;
-        if (re.impl() and matched != regex_match(begin, end, captures, re.impl()))
+        if (re.impl() and matched != regex_match(begin, end, captures, *re.impl()))
            regex_mismatch(re);
        if (re.impl() and matched)
            check_captures(re, res, captures);
@ -179,7 +179,7 @@ bool regex_search(It begin, It end, const Regex& re,
    try
    {
        bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
-        if (re.impl() and matched != regex_search(begin, end, re.impl(), convert_flags(flags)))
+        if (re.impl() and matched != regex_search(begin, end, *re.impl(), convert_flags(flags)))
            regex_mismatch(re);
        return matched;
    }
@ -197,7 +197,7 @@ bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
    {
        bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
        Vector<It> captures;
-        if (re.impl() and matched != regex_search(begin, end, captures, re.impl(), convert_flags(flags)))
+        if (re.impl() and matched != regex_search(begin, end, captures, *re.impl(), convert_flags(flags)))
            regex_mismatch(re);
        if (re.impl() and matched)
            check_captures(re, res, captures);
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@ -515,6 +515,7 @@ struct RegexCompiler
        push_op(CompiledRegex::Match);
        m_program.matchers = m_parsed_regex.matchers;
        m_program.save_count = m_parsed_regex.capture_count * 2;
        m_program.start_chars = compute_start_chars();
    }
    CompiledRegex get_compiled_regex() { return std::move(m_program); }
@ -708,6 +709,87 @@ private:
                push_codepoint(cp->value);
    }
    // Computes which codepoints in the 0..255 range may begin a match of the given node.
    //
    // accepted[c] is set for every codepoint c < 256 that the node may consume first;
    // codepoints >= 256 are not tracked by these tables.
    // rejected is threaded through the recursion so the caller can mask out chars,
    // but no node kind currently writes to it (see the lookaround cases below).
    //
    // Returns true if the node did not necessarily consume a char, hence a following
    // node in sequence would still be relevant for the parent node start chars
    // computation; returns false once a char is necessarily consumed.
    bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
                             bool (&accepted)[256], bool (&rejected)[256]) const
    {
        switch (node->op)
        {
            case ParsedRegex::Literal:
                if (node->value < 256)
                    accepted[node->value] = true;
                return node->quantifier.allows_none();
            case ParsedRegex::AnyChar:
            case ParsedRegex::Matcher: // treat matcher as everything can match for now
                for (auto& b : accepted)
                    b = true;
                return node->quantifier.allows_none();
            case ParsedRegex::Sequence:
            {
                // Children contribute in order until one of them necessarily
                // consumes a char; nothing after that one can start the match.
                for (auto& child : node->children)
                {
                    if (not compute_start_chars(child, accepted, rejected))
                        return node->quantifier.allows_none();
                }
                return true;
            }
            case ParsedRegex::Alternation:
            {
                // Every branch must be visited (no short-circuit) since each one
                // contributes its own start chars.
                bool may_consume_nothing = node->quantifier.allows_none();
                for (auto& child : node->children)
                {
                    if (compute_start_chars(child, accepted, rejected))
                        may_consume_nothing = true;
                }
                return may_consume_nothing;
            }
            case ParsedRegex::LineStart:
            case ParsedRegex::LineEnd:
            case ParsedRegex::WordBoundary:
            case ParsedRegex::NotWordBoundary:
            case ParsedRegex::SubjectBegin:
            case ParsedRegex::SubjectEnd:
            case ParsedRegex::ResetStart:
                // Zero-width assertions: they consume nothing.
                return true;
            case ParsedRegex::LookAhead:
            case ParsedRegex::NegativeLookAhead:
            case ParsedRegex::LookBehind:
            case ParsedRegex::NegativeLookBehind:
                // Lookarounds consume nothing, so whatever follows them decides the
                // start chars. Their children are deliberately not inspected:
                //  - children may be empty, and front() on an empty container is
                //    undefined behaviour;
                //  - a negative lookahead constrains a whole sequence, so rejecting
                //    the first char of its child would drop valid matches
                //    (e.g. "(?!foo)fx" must still be able to start on 'f').
                // Ignoring them only makes the start chars set a superset, which is
                // always safe for a search pre-filter.
                return true;
        }
        return false;
    }
    // Builds the start chars map for the whole parsed regex.
    //
    // Returns nullptr when the per-node computation reports that the root node
    // did not necessarily consume a char, in which case no map is produced.
    // Otherwise returns a map where entry c is true when codepoint c (< 256)
    // was accepted and not rejected by the per-node computation.
    std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
    {
        bool can_start[256] = {};
        bool cannot_start[256] = {};

        const bool root_not_consuming =
            compute_start_chars(m_parsed_regex.ast, can_start, cannot_start);
        if (root_not_consuming)
            return nullptr;

        auto result = std::make_unique<CompiledRegex::StartChars>();
        bool* out = result->map;
        for (int cp = 0; cp != 256; ++cp)
            out[cp] = can_start[cp] and not cannot_start[cp];
        return result;
    }
    CompiledRegex m_program;
    const ParsedRegex& m_parsed_regex;
 };
@ -1020,6 +1102,17 @@ auto test_regex = UnitTest{[]{
        kak_assert(vm.exec("foofoofoo"));
        kak_assert(not vm.exec("barbarbar"));
    }
    {
        TestVM vm{R"((?<!\\)(?:\\\\)*")"};
        kak_assert(vm.exec("foo\"", false));
    }
    {
        TestVM vm{R"($)"};
        kak_assert(vm.exec("foo\n", false, true));
        kak_assert(*vm.m_captures->pos[0] == '\n');
    }
 }};
 }
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@ -6,11 +6,12 @@
 #include "utf8_iterator.hh"
 #include "vector.hh"
 #include "flags.hh"
 #include "ref_ptr.hh"
 namespace Kakoune
 {
-struct CompiledRegex
+struct CompiledRegex : RefCountable
 {
    enum Op : char
    {
@ -41,6 +42,9 @@ struct CompiledRegex
    Vector<char> bytecode;
    Vector<std::function<bool (Codepoint)>> matchers;
    size_t save_count;
    struct StartChars { bool map[256]; };
    std::unique_ptr<StartChars> start_chars;
 };
 CompiledRegex compile_regex(StringView re);
@ -311,6 +315,16 @@ struct ThreadedRegexVM
        return false;
    }
    // Advances start to the next position that could begin a match.
    //
    // start_chars is a 256 entry table indexed by codepoint; when it is null
    // start is left untouched. Scanning stops at end, at the first codepoint
    // outside the 0..255 range (those are not covered by the table), or at the
    // first codepoint whose table entry is true.
    void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars)
    {
        if (start_chars == nullptr)
            return;

        for (; start != end; ++start)
        {
            const auto cp = *start;
            if (cp < 0 or cp >= 256 or start_chars[cp])
                break;
        }
    }
    bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
    {
        m_begin = begin;
@ -324,6 +338,12 @@ struct ThreadedRegexVM
        const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
        Utf8It start{m_begin, m_begin, m_end};
        const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
        if (flags & RegexExecFlags::Search)
            to_next_start(start, end, start_chars);
        if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
                      current_threads, next_threads))
            return true;
@ -331,12 +351,15 @@ struct ThreadedRegexVM
        if (not (flags & RegexExecFlags::Search))
            return false;
-        while (start != end)
+        do
        {
-            if (exec_from(++start, no_saves ? nullptr : new_saves<false>(nullptr),
+            to_next_start(++start, end, start_chars);
            if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
                          current_threads, next_threads))
                return true;
        }
        while (start != end);
        return false;
    }