Regex: improve regex parse error reporting

Display the place where parsing failed, refactor code to make RegexParser a regular object.
2017-09-26 22:38:56 +09:00 · 2017-09-26 22:38:56 +09:00 · d5717edc9d
commit d5717edc9d
parent 080160553c
1 changed files with 96 additions and 81 deletions
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@ -113,17 +113,17 @@ AstNodePtr make_ast_node(Op op, Codepoint value = -1,

 // Recursive descent parser based on naming used in the ECMAScript
 // standard, although the syntax is not fully compatible.
-struct Parser
+struct RegexParser
 {
-    static ParsedRegex parse(StringView re)
+    RegexParser(StringView re)
+        : m_regex{re}, m_pos{re.begin(), re}
    {
-        ParsedRegex res;
-        res.capture_count = 1;
-        Iterator pos{re.begin(), re}, end{re.end(), re};
-        res.ast = disjunction(res, pos, end, 0);
-        return res;
+        m_parsed_regex.capture_count = 1;
+        m_parsed_regex.ast = disjunction(0);
    }

+    ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
+
 private:
    struct InvalidPolicy
    {
@ -132,62 +132,63 @@ private:

    using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;

-    static AstNodePtr disjunction(ParsedRegex& parsed_regex, Iterator& pos, Iterator end, unsigned capture = -1)
+    AstNodePtr disjunction(unsigned capture = -1)
    {
-        AstNodePtr node = alternative(parsed_regex, pos, end);
-        if (pos == end or *pos != '|')
+        AstNodePtr node = alternative();
+        if (at_end() or *m_pos != '|')
        {
            node->value = capture;
            return node;
        }

+        ++m_pos;
        AstNodePtr res = make_ast_node(Op::Alternation);
        res->children.push_back(std::move(node));
-        res->children.push_back(disjunction(parsed_regex, ++pos, end));
+        res->children.push_back(disjunction());
        res->value = capture;
        return res;
    }

-    static AstNodePtr alternative(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
+    AstNodePtr alternative()
    {
        AstNodePtr res = make_ast_node(Op::Sequence);
-        while (auto node = term(parsed_regex, pos, end))
+        while (auto node = term())
            res->children.push_back(std::move(node));
        if (res->children.empty())
-            throw runtime_error{"Parse error in alternative"};
+            parse_error("empty alternative");
        return res;
    }

-    static AstNodePtr term(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
+    AstNodePtr term()
    {
-        if (auto node = assertion(parsed_regex, pos, end))
+        if (auto node = assertion())
            return node;
-        if (auto node = atom(parsed_regex, pos, end))
+        if (auto node = atom())
        {
-            node->quantifier = quantifier(parsed_regex, pos, end);
+            node->quantifier = quantifier();
            return node;
        }
        return nullptr;
    }

-    static AstNodePtr assertion(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
+    AstNodePtr assertion()
    {
-        if (pos == end)
+        if (at_end())
            return nullptr;

-        switch (*pos)
+        switch (*m_pos)
        {
-            case '^': ++pos; return make_ast_node(Op::LineStart);
-            case '$': ++pos; return make_ast_node(Op::LineEnd);
+            case '^': ++m_pos; return make_ast_node(Op::LineStart);
+            case '$': ++m_pos; return make_ast_node(Op::LineEnd);
            case '\\':
-                if (pos+1 == end)
+                if (m_pos+1 == m_regex.end())
                    return nullptr;
-                switch (*(pos+1))
+                switch (*(m_pos+1))
                {
-                    case 'b': pos += 2; return make_ast_node(Op::WordBoundary);
-                    case 'B': pos += 2; return make_ast_node(Op::NotWordBoundary);
-                    case '`': pos += 2; return make_ast_node(Op::SubjectBegin);
-                    case '\'': pos += 2; return make_ast_node(Op::SubjectEnd);
+                    case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary);
+                    case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary);
+                    case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin);
+                    case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd);
                }
                break;
            /* TODO: look ahead, look behind */
@ -195,50 +196,50 @@ private:
        return nullptr;
    }

-    static AstNodePtr atom(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
+    AstNodePtr atom()
    {
-        if (pos == end)
+        if (at_end())
            return nullptr;

-        const Codepoint cp = *pos;
+        const Codepoint cp = *m_pos;
        switch (cp)
        {
-            case '.': ++pos; return make_ast_node(Op::AnyChar);
+            case '.': ++m_pos; return make_ast_node(Op::AnyChar);
            case '(':
            {
-                ++pos;
-                auto content = disjunction(parsed_regex, pos, end, parsed_regex.capture_count++);
+                ++m_pos;
+                auto content = disjunction(m_parsed_regex.capture_count++);

-                if (pos == end or *pos != ')')
-                    throw runtime_error{"Unclosed parenthesis"};
-                ++pos;
+                if (at_end() or *m_pos != ')')
+                    parse_error("unclosed parenthesis");
+                ++m_pos;
                return content;
            }
            case '\\':
-                ++pos;
-                return atom_escape(parsed_regex, pos, end);
+                ++m_pos;
+                return atom_escape();
            case '[':
-                ++pos;
-                return character_class(parsed_regex, pos, end);
+                ++m_pos;
+                return character_class();
            default:
                if (contains("^$.*+?()[]{}|", cp))
                    return nullptr;
-                ++pos;
+                ++m_pos;
                return make_ast_node(Op::Literal, cp);
        }
    }

-    static AstNodePtr atom_escape(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
+    AstNodePtr atom_escape()
    {
-        const Codepoint cp = *pos++;
+        const Codepoint cp = *m_pos++;

        // CharacterClassEscape
        for (auto& character_class : character_class_escapes)
        {
            if (character_class.cp == cp)
            {
-                auto matcher_id = parsed_regex.matchers.size();
-                parsed_regex.matchers.push_back(
+                auto matcher_id = m_parsed_regex.matchers.size();
+                m_parsed_regex.matchers.push_back(
                    [ctype = wctype(character_class.ctype),
                     chars = character_class.additional_chars] (Codepoint cp) {
                        return iswctype(cp, ctype) or contains(chars, cp);
@ -261,57 +262,57 @@ private:

        if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
            return make_ast_node(Op::Literal, cp);
-        throw runtime_error{"Unknown atom escape"};
+        parse_error("unknown atom escape");
    }

-    static AstNodePtr character_class(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
+    AstNodePtr character_class()
    {
-        const bool negative = pos != end and *pos == '^';
+        const bool negative = m_pos != m_regex.end() and *m_pos == '^';
        if (negative)
-            ++pos;
+            ++m_pos;

        Vector<CharRange> ranges;
        Vector<std::pair<wctype_t, bool>> ctypes;
-        while (pos != end and *pos != ']')
+        while (m_pos != m_regex.end() and *m_pos != ']')
        {
-            const auto cp = *pos++;
+            const auto cp = *m_pos++;
            if (cp == '-')
            {
                ranges.push_back({ '-', '-' });
                continue;
            }

-            if (pos == end)
+            if (at_end())
                break;

            if (cp == '\\')
            {
                auto it = find_if(character_class_escapes,
-                                  [cp = *pos](auto& t) { return t.cp == cp; });
+                                  [cp = *m_pos](auto& t) { return t.cp == cp; });
                if (it != std::end(character_class_escapes))
                {
                    ctypes.push_back({wctype(it->ctype), not it->neg});
                    for (auto& c : it->additional_chars)
                        ranges.push_back({(Codepoint)c, (Codepoint)c});
-                    ++pos;
+                    ++m_pos;
                    continue;
                }
            }

            CharRange range = { cp, cp };
-            if (*pos == '-')
+            if (*m_pos == '-')
            {
-                if (++pos == end)
+                if (++m_pos == m_regex.end())
                    break;
-                range.max = *pos++;
+                range.max = *m_pos++;
                if (range.min > range.max)
-                    throw runtime_error{"Invalid range specified"};
+                    parse_error("invalid range specified");
            }
            ranges.push_back(range);
        }
-        if (pos == end)
-            throw runtime_error{"Unclosed character class"};
-        ++pos;
+        if (at_end())
+            parse_error("unclosed character class");
+        ++m_pos;

        auto matcher = [ranges = std::move(ranges),
                        ctypes = std::move(ctypes), negative] (Codepoint cp) {
@ -323,18 +324,18 @@ private:
            return negative ? not found : found;
        };

-        auto matcher_id = parsed_regex.matchers.size();
-        parsed_regex.matchers.push_back(std::move(matcher));
+        auto matcher_id = m_parsed_regex.matchers.size();
+        m_parsed_regex.matchers.push_back(std::move(matcher));

        return make_ast_node(Op::Matcher, matcher_id);
    }

-    static Quantifier quantifier(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
+    Quantifier quantifier()
    {
-        if (pos == end)
+        if (at_end())
            return {Quantifier::One};

-        auto read_int = [](Iterator& pos, Iterator begin, Iterator end) {
+        auto read_int = [](auto& pos, auto begin, auto end) {
            int res = 0;
            for (; pos != end; ++pos)
            {
@ -346,30 +347,44 @@ private:
            return res;
        };

-        switch (*pos)
+        switch (*m_pos)
        {
-            case '*': ++pos; return {Quantifier::RepeatZeroOrMore};
-            case '+': ++pos; return {Quantifier::RepeatOneOrMore};
-            case '?': ++pos; return {Quantifier::Optional};
+            case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore};
+            case '+': ++m_pos; return {Quantifier::RepeatOneOrMore};
+            case '?': ++m_pos; return {Quantifier::Optional};
            case '{':
            {
-                auto it = pos+1;
-                const int min = read_int(it, it, end);
+                auto it = m_pos+1;
+                const int min = read_int(it, it, m_regex.end());
                int max = min;
                if (*it == ',')
                {
                    ++it;
-                    max = read_int(it, it, end);
+                    max = read_int(it, it, m_regex.end());
                }
                if (*it++ != '}')
-                   throw runtime_error{"expected closing bracket"};
-                pos = it;
+                   parse_error("expected closing bracket");
+                m_pos = it;
                return {Quantifier::RepeatMinMax, min, max};
            }
            default: return {Quantifier::One};
        }
    }

+    bool at_end() const { return m_pos == m_regex.end(); }
+
+    [[gnu::noreturn]]
+    void parse_error(StringView error)
+    {
+        throw runtime_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error,
+                                   StringView{m_regex.begin(), m_pos.base()},
+                                   StringView{m_pos.base(), m_regex.end()}));
+    }
+
+    ParsedRegex m_parsed_regex;
+    StringView m_regex;
+    Iterator m_pos;
+
    struct CharacterClassEscape {
        Codepoint cp;
        const char* ctype;
@ -381,7 +396,7 @@ private:
 };

 // For some reason Gcc fails to link if this is constexpr
-const Parser::CharacterClassEscape Parser::character_class_escapes[6] = {
+const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6] = {
    { 'd', "digit", "", false },
    { 'D', "digit", "", true },
    { 'w', "alnum", "_", false },
@ -548,7 +563,7 @@ CompiledRegex compile(const ParsedRegex& parsed_regex)

 CompiledRegex compile(StringView re)
 {
-    return compile(Parser::parse(re));
+    return compile(RegexParser{re}.get_parsed_regex());
 }

 }
@ -799,11 +814,11 @@ void validate_regex(StringView re)
 {
    try
    {
-        RegexCompiler::Parser::parse(re);
+        RegexCompiler::RegexParser{re};
    }
    catch (runtime_error& err)
    {
-        write_to_debug_buffer(format("regex-impl: <<{}>> failed to parse: {}", re, err.what()));
+        write_to_debug_buffer(err.what());
    }
 }