Regex: Refactor regex compilation to a regular RegexCompiler class

2017-09-26 23:04:47 +09:00 · 2017-09-26 23:04:47 +09:00 · f7468b576e
commit f7468b576e
parent d5717edc9d
1 changed files with 250 additions and 235 deletions
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@ -13,18 +13,15 @@
 namespace Kakoune
 {
-struct CompiledRegex
+struct ParsedRegex
 {
-    enum Op : char
+    enum Op
    {
        Match,
        Literal,
        AnyChar,
        Matcher,
-        Jump,
+        Sequence,
-        Split_PrioritizeParent,
+        Alternation,
        Split_PrioritizeChild,
        Save,
        LineStart,
        LineEnd,
        WordBoundary,
@ -33,84 +30,49 @@ struct CompiledRegex
        SubjectEnd,
    };
-    using Offset = unsigned;
+    struct Quantifier
    Vector<char> bytecode;
    Vector<std::function<bool (Codepoint)>> matchers;
    size_t save_count;
 };
 namespace RegexCompiler
 {
 struct Quantifier
 {
    enum Type
    {
-        One,
+        enum Type
-        Optional,
+        {
-        RepeatZeroOrMore,
+            One,
-        RepeatOneOrMore,
+            Optional,
-        RepeatMinMax,
+            RepeatZeroOrMore,
            RepeatOneOrMore,
            RepeatMinMax,
        };
        Type type = One;
        int min = -1, max = -1;
        bool allows_none() const
        {
            return type == Quantifier::Optional or
                   type == Quantifier::RepeatZeroOrMore or
                  (type == Quantifier::RepeatMinMax and min <= 0);
        }
        bool allows_infinite_repeat() const
        {
            return type == Quantifier::RepeatZeroOrMore or
                   type == Quantifier::RepeatOneOrMore or
                  (type == Quantifier::RepeatMinMax and max == -1);
        };
    };
    Type type = One;
    int min = -1, max = -1;
-    bool allows_none() const
+    struct AstNode
    {
-        return type == Quantifier::Optional or
+        Op op;
-               type == Quantifier::RepeatZeroOrMore or
+        Codepoint value;
-              (type == Quantifier::RepeatMinMax and min <= 0);
+        Quantifier quantifier;
-    }
+        Vector<std::unique_ptr<AstNode>> children;
    bool allows_infinite_repeat() const
    {
        return type == Quantifier::RepeatZeroOrMore or
               type == Quantifier::RepeatOneOrMore or
              (type == Quantifier::RepeatMinMax and max == -1);
    };
 };
-enum class Op
+    using AstNodePtr = std::unique_ptr<AstNode>;
 {
    Literal,
    AnyChar,
    Matcher,
    Sequence,
    Alternation,
    LineStart,
    LineEnd,
    WordBoundary,
    NotWordBoundary,
    SubjectBegin,
    SubjectEnd,
 };
 struct AstNode
 {
    Op op;
    Codepoint value;
    Quantifier quantifier;
    Vector<std::unique_ptr<AstNode>> children;
 };
 using AstNodePtr = std::unique_ptr<AstNode>;
 struct CharRange { Codepoint min, max; };
 struct ParsedRegex
 {
    AstNodePtr ast;
    size_t capture_count;
    Vector<std::function<bool (Codepoint)>> matchers;
 };
 AstNodePtr make_ast_node(Op op, Codepoint value = -1,
                         Quantifier quantifier = {Quantifier::One})
 {
    return AstNodePtr{new AstNode{op, value, quantifier, {}}};
 }
 // Recursive descent parser based on naming used in the ECMAScript
 // standard, although the syntax is not fully compatible.
 struct RegexParser
@ -124,6 +86,8 @@ struct RegexParser
    ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
    static ParsedRegex parse(StringView re) { return RegexParser{re}.get_parsed_regex(); }
 private:
    struct InvalidPolicy
    {
@ -131,6 +95,7 @@ private:
    };
    using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
    using AstNodePtr = ParsedRegex::AstNodePtr;
    AstNodePtr disjunction(unsigned capture = -1)
    {
@ -142,7 +107,7 @@ private:
        }
        ++m_pos;
-        AstNodePtr res = make_ast_node(Op::Alternation);
+        AstNodePtr res = new_node(ParsedRegex::Alternation);
        res->children.push_back(std::move(node));
        res->children.push_back(disjunction());
        res->value = capture;
@ -151,7 +116,7 @@ private:
    AstNodePtr alternative()
    {
-        AstNodePtr res = make_ast_node(Op::Sequence);
+        AstNodePtr res = new_node(ParsedRegex::Sequence);
        while (auto node = term())
            res->children.push_back(std::move(node));
        if (res->children.empty())
@ -178,17 +143,17 @@ private:
        switch (*m_pos)
        {
-            case '^': ++m_pos; return make_ast_node(Op::LineStart);
+            case '^': ++m_pos; return new_node(ParsedRegex::LineStart);
-            case '$': ++m_pos; return make_ast_node(Op::LineEnd);
+            case '$': ++m_pos; return new_node(ParsedRegex::LineEnd);
            case '\\':
                if (m_pos+1 == m_regex.end())
                    return nullptr;
                switch (*(m_pos+1))
                {
-                    case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary);
+                    case 'b': m_pos += 2; return new_node(ParsedRegex::WordBoundary);
-                    case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary);
+                    case 'B': m_pos += 2; return new_node(ParsedRegex::NotWordBoundary);
-                    case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin);
+                    case '`': m_pos += 2; return new_node(ParsedRegex::SubjectBegin);
-                    case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd);
+                    case '\'': m_pos += 2; return new_node(ParsedRegex::SubjectEnd);
                }
                break;
            /* TODO: look ahead, look behind */
@ -204,7 +169,7 @@ private:
        const Codepoint cp = *m_pos;
        switch (cp)
        {
-            case '.': ++m_pos; return make_ast_node(Op::AnyChar);
+            case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
            case '(':
            {
                ++m_pos;
@ -225,7 +190,7 @@ private:
                if (contains("^$.*+?()[]{}|", cp))
                    return nullptr;
                ++m_pos;
-                return make_ast_node(Op::Literal, cp);
+                return new_node(ParsedRegex::Literal, cp);
        }
    }
@ -244,7 +209,7 @@ private:
                     chars = character_class.additional_chars] (Codepoint cp) {
                        return iswctype(cp, ctype) or contains(chars, cp);
                    });
-                return make_ast_node(Op::Matcher, matcher_id);
+                return new_node(ParsedRegex::Matcher, matcher_id);
            }
        }
@ -255,13 +220,13 @@ private:
        for (auto& control : control_escapes)
        {
            if (control.name == cp)
-                return make_ast_node(Op::Literal, control.value);
+                return new_node(ParsedRegex::Literal, control.value);
        }
        // TOOD: \c..., \0..., '\0x...', \u...
        if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
-            return make_ast_node(Op::Literal, cp);
+            return new_node(ParsedRegex::Literal, cp);
        parse_error("unknown atom escape");
    }
@ -271,6 +236,7 @@ private:
        if (negative)
            ++m_pos;
        struct CharRange { Codepoint min, max; };
        Vector<CharRange> ranges;
        Vector<std::pair<wctype_t, bool>> ctypes;
        while (m_pos != m_regex.end() and *m_pos != ']')
@ -327,13 +293,13 @@ private:
        auto matcher_id = m_parsed_regex.matchers.size();
        m_parsed_regex.matchers.push_back(std::move(matcher));
-        return make_ast_node(Op::Matcher, matcher_id);
+        return new_node(ParsedRegex::Matcher, matcher_id);
    }
-    Quantifier quantifier()
+    ParsedRegex::Quantifier quantifier()
    {
        if (at_end())
-            return {Quantifier::One};
+            return {ParsedRegex::Quantifier::One};
        auto read_int = [](auto& pos, auto begin, auto end) {
            int res = 0;
@ -349,9 +315,9 @@ private:
        switch (*m_pos)
        {
-            case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore};
+            case '*': ++m_pos; return {ParsedRegex::Quantifier::RepeatZeroOrMore};
-            case '+': ++m_pos; return {Quantifier::RepeatOneOrMore};
+            case '+': ++m_pos; return {ParsedRegex::Quantifier::RepeatOneOrMore};
-            case '?': ++m_pos; return {Quantifier::Optional};
+            case '?': ++m_pos; return {ParsedRegex::Quantifier::Optional};
            case '{':
            {
                auto it = m_pos+1;
@ -365,12 +331,19 @@ private:
                if (*it++ != '}')
                   parse_error("expected closing bracket");
                m_pos = it;
-                return {Quantifier::RepeatMinMax, min, max};
+                return {ParsedRegex::Quantifier::RepeatMinMax, min, max};
            }
-            default: return {Quantifier::One};
+            default: return {ParsedRegex::Quantifier::One};
        }
    }
    static AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1,
                               ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One})
    {
        return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, {}}};
    }
    bool at_end() const { return m_pos == m_regex.end(); }
    [[gnu::noreturn]]
@ -405,168 +378,210 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6]
    { 's', "space", "", true },
 };
-CompiledRegex::Offset alloc_offset(CompiledRegex& program)
+struct CompiledRegex
 {
-    auto pos = program.bytecode.size();
+    enum Op : char
    program.bytecode.resize(pos + sizeof(CompiledRegex::Offset));
    return pos;
 }
 CompiledRegex::Offset& get_offset(CompiledRegex& program, CompiledRegex::Offset pos)
 {
    return *reinterpret_cast<CompiledRegex::Offset*>(&program.bytecode[pos]);
 }
 void push_codepoint(CompiledRegex& program, Codepoint cp)
 {
    utf8::dump(std::back_inserter(program.bytecode), cp);
 }
 CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node);
 CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node)
 {
    const auto start_pos = program.bytecode.size();
    const Codepoint capture = (node->op == Op::Alternation or node->op == Op::Sequence) ? node->value : -1;
    if (capture != -1)
    {
-        program.bytecode.push_back(CompiledRegex::Save);
+        Match,
-        program.bytecode.push_back(capture * 2);
+        Literal,
        AnyChar,
        Matcher,
        Jump,
        Split_PrioritizeParent,
        Split_PrioritizeChild,
        Save,
        LineStart,
        LineEnd,
        WordBoundary,
        NotWordBoundary,
        SubjectBegin,
        SubjectEnd,
    };
    using Offset = unsigned;
    Vector<char> bytecode;
    Vector<std::function<bool (Codepoint)>> matchers;
    size_t save_count;
 };
 struct RegexCompiler
 {
    RegexCompiler(const ParsedRegex& parsed_regex)
        : m_parsed_regex{parsed_regex}
    {
        write_search_prefix();
        compile_node(m_parsed_regex.ast);
        push_op(CompiledRegex::Match);
        m_program.matchers = m_parsed_regex.matchers;
        m_program.save_count = m_parsed_regex.capture_count * 2;
    }
-    Vector<CompiledRegex::Offset> goto_inner_end_offsets;
+    CompiledRegex get_compiled_regex() { return std::move(m_program); }
-    switch (node->op)
+
    using Offset = CompiledRegex::Offset;
    static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
    static CompiledRegex compile(StringView re)
    {
-        case Op::Literal:
+        return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex();
-            program.bytecode.push_back(CompiledRegex::Literal);
+    }
-            push_codepoint(program, node->value);
+
-            break;
+private:
-        case Op::AnyChar:
+    Offset compile_node_inner(const ParsedRegex::AstNodePtr& node)
-            program.bytecode.push_back(CompiledRegex::AnyChar);
+    {
-            break;
+        const auto start_pos = m_program.bytecode.size();
-        case Op::Matcher:
+
-            program.bytecode.push_back(CompiledRegex::Matcher);
+        const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
-            program.bytecode.push_back(node->value);
+        if (capture != -1)
        case Op::Sequence:
            for (auto& child : node->children)
                compile_node(program, parsed_regex, child);
            break;
        case Op::Alternation:
        {
-            auto& children = node->children;
+            push_op(CompiledRegex::Save);
-            kak_assert(children.size() == 2);
+            push_byte(capture * 2);
            program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
            auto offset = alloc_offset(program);
            compile_node(program, parsed_regex, children[0]);
            program.bytecode.push_back(CompiledRegex::Jump);
            goto_inner_end_offsets.push_back(alloc_offset(program));
            auto right_pos = compile_node(program, parsed_regex, children[1]);
            get_offset(program, offset) = right_pos;
            break;
        }
-        case Op::LineStart:
+
-            program.bytecode.push_back(CompiledRegex::LineStart);
+        Vector<Offset> goto_inner_end_offsets;
-            break;
+        switch (node->op)
-        case Op::LineEnd:
+        {
-            program.bytecode.push_back(CompiledRegex::LineEnd);
+            case ParsedRegex::Literal:
-            break;
+                push_op(CompiledRegex::Literal);
-        case Op::WordBoundary:
+                push_codepoint(node->value);
-            program.bytecode.push_back(CompiledRegex::WordBoundary);
+                break;
-            break;
+            case ParsedRegex::AnyChar:
-        case Op::NotWordBoundary:
+                push_op(CompiledRegex::AnyChar);
-            program.bytecode.push_back(CompiledRegex::NotWordBoundary);
+                break;
-            break;
+            case ParsedRegex::Matcher:
-        case Op::SubjectBegin:
+                push_op(CompiledRegex::Matcher);
-            program.bytecode.push_back(CompiledRegex::SubjectBegin);
+                push_byte(node->value);
-            break;
+            case ParsedRegex::Sequence:
-        case Op::SubjectEnd:
+                for (auto& child : node->children)
-            program.bytecode.push_back(CompiledRegex::SubjectEnd);
+                    compile_node(child);
-            break;
+                break;
            case ParsedRegex::Alternation:
            {
                auto& children = node->children;
                kak_assert(children.size() == 2);
                push_op(CompiledRegex::Split_PrioritizeParent);
                auto offset = alloc_offset();
                compile_node(children[0]);
                push_op(CompiledRegex::Jump);
                goto_inner_end_offsets.push_back(alloc_offset());
                auto right_pos = compile_node(children[1]);
                get_offset(offset) = right_pos;
                break;
            }
            case ParsedRegex::LineStart:
                push_op(CompiledRegex::LineStart);
                break;
            case ParsedRegex::LineEnd:
                push_op(CompiledRegex::LineEnd);
                break;
            case ParsedRegex::WordBoundary:
                push_op(CompiledRegex::WordBoundary);
                break;
            case ParsedRegex::NotWordBoundary:
                push_op(CompiledRegex::NotWordBoundary);
                break;
            case ParsedRegex::SubjectBegin:
                push_op(CompiledRegex::SubjectBegin);
                break;
            case ParsedRegex::SubjectEnd:
                push_op(CompiledRegex::SubjectEnd);
                break;
        }
        for (auto& offset : goto_inner_end_offsets)
            get_offset(offset) =  m_program.bytecode.size();
        if (capture != -1)
        {
            push_op(CompiledRegex::Save);
            push_byte(capture * 2 + 1);
        }
        return start_pos;
    }
-    for (auto& offset : goto_inner_end_offsets)
+    Offset compile_node(const ParsedRegex::AstNodePtr& node)
        get_offset(program, offset) =  program.bytecode.size();
    if (capture != -1)
    {
-        program.bytecode.push_back(CompiledRegex::Save);
+        Offset pos = m_program.bytecode.size();
-        program.bytecode.push_back(capture * 2 + 1);
+        Vector<Offset> goto_end_offsets;
        if (node->quantifier.allows_none())
        {
            push_op(CompiledRegex::Split_PrioritizeParent);
            goto_end_offsets.push_back(alloc_offset());
        }
        auto inner_pos = compile_node_inner(node);
        // Write the node multiple times when we have a min count quantifier
        for (int i = 1; i < node->quantifier.min; ++i)
            inner_pos = compile_node_inner(node);
        if (node->quantifier.allows_infinite_repeat())
        {
            push_op(CompiledRegex::Split_PrioritizeChild);
            get_offset(alloc_offset()) = inner_pos;
        }
        // Write the node as an optional match for the min -> max counts
        else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
                  i < node->quantifier.max; ++i)
        {
            push_op(CompiledRegex::Split_PrioritizeParent);
            goto_end_offsets.push_back(alloc_offset());
            compile_node_inner(node);
        }
        for (auto offset : goto_end_offsets)
            get_offset(offset) = m_program.bytecode.size();
        return pos;
    }
-    return start_pos;
+    // Add a '.*' as the first instructions for the search use case
-}
+    void write_search_prefix()
 CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node)
 {
    CompiledRegex::Offset pos = program.bytecode.size();
    Vector<CompiledRegex::Offset> goto_end_offsets;
    if (node->quantifier.allows_none())
    {
-        program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
+        kak_assert(m_program.bytecode.empty());
-        goto_end_offsets.push_back(alloc_offset(program));
+        push_op(CompiledRegex::Split_PrioritizeChild);
        get_offset(alloc_offset()) = search_prefix_size;
        push_op(CompiledRegex::AnyChar);
        push_op(CompiledRegex::Split_PrioritizeParent);
        get_offset(alloc_offset()) = 1 + sizeof(Offset);
    }
-    auto inner_pos = compile_node_inner(program, parsed_regex, node);
+    Offset alloc_offset()
    // Write the node multiple times when we have a min count quantifier
    for (int i = 1; i < node->quantifier.min; ++i)
        inner_pos = compile_node_inner(program, parsed_regex, node);
    if (node->quantifier.allows_infinite_repeat())
    {
-        program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild);
+        auto pos = m_program.bytecode.size();
-        get_offset(program, alloc_offset(program)) = inner_pos;
+        m_program.bytecode.resize(pos + sizeof(Offset));
        return pos;
    }
-    // Write the node as an optional match for the min -> max counts
+
-    else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
+    Offset& get_offset(Offset pos)
              i < node->quantifier.max; ++i)
    {
-        program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
+        return *reinterpret_cast<Offset*>(&m_program.bytecode[pos]);
        goto_end_offsets.push_back(alloc_offset(program));
        compile_node_inner(program, parsed_regex, node);
    }
-    for (auto offset : goto_end_offsets)
+    void push_op(CompiledRegex::Op op)
-        get_offset(program, offset) = program.bytecode.size();
+    {
        m_program.bytecode.push_back(op);
    }
-    return pos;
+    void push_byte(char byte)
-}
+    {
        m_program.bytecode.push_back(byte);
    }
-constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offset);
+    void push_codepoint(Codepoint cp)
    {
        utf8::dump(std::back_inserter(m_program.bytecode), cp);
    }
-// Add a '.*' as the first instructions for the search use case
+    CompiledRegex m_program;
-void write_search_prefix(CompiledRegex& program)
+    const ParsedRegex& m_parsed_regex;
-{
+};
    kak_assert(program.bytecode.empty());
    program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild);
    get_offset(program, alloc_offset(program)) = prefix_size;
    program.bytecode.push_back(CompiledRegex::AnyChar);
    program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
    get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset);
 }
 CompiledRegex compile(const ParsedRegex& parsed_regex)
 {
    CompiledRegex res;
    write_search_prefix(res);
    compile_node(res, parsed_regex, parsed_regex.ast);
    res.bytecode.push_back(CompiledRegex::Match);
    res.matchers = parsed_regex.matchers;
    res.save_count = parsed_regex.capture_count * 2;
    return res;
 }
 CompiledRegex compile(StringView re)
 {
    return compile(RegexParser{re}.get_parsed_regex());
 }
 }
 void dump(const CompiledRegex& program)
 {
@ -728,7 +743,7 @@ struct ThreadedRegexVM
    {
        bool found_match = false;
        m_threads.clear();
-        add_thread(0, match ? RegexCompiler::prefix_size : 0,
+        add_thread(0, match ? RegexCompiler::search_prefix_size : 0,
                   Vector<const char*>(m_program.save_count, nullptr));
        m_begin = data.begin();
@ -814,7 +829,7 @@ void validate_regex(StringView re)
 {
    try
    {
-        RegexCompiler::RegexParser{re};
+        RegexParser{re};
    }
    catch (runtime_error& err)
    {