Regex: improve regex parse error reporting

Display the place where parsing failed, and refactor the code so that
RegexParser is a regular object that holds its own parse state rather
than a collection of static functions.
This commit is contained in:
Maxime Coste 2017-09-26 22:38:56 +09:00
parent 080160553c
commit d5717edc9d

View File

@ -113,17 +113,17 @@ AstNodePtr make_ast_node(Op op, Codepoint value = -1,
// Recursive descent parser based on naming used in the ECMAScript // Recursive descent parser based on naming used in the ECMAScript
// standard, although the syntax is not fully compatible. // standard, although the syntax is not fully compatible.
struct Parser struct RegexParser
{ {
static ParsedRegex parse(StringView re) RegexParser(StringView re)
: m_regex{re}, m_pos{re.begin(), re}
{ {
ParsedRegex res; m_parsed_regex.capture_count = 1;
res.capture_count = 1; m_parsed_regex.ast = disjunction(0);
Iterator pos{re.begin(), re}, end{re.end(), re};
res.ast = disjunction(res, pos, end, 0);
return res;
} }
ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
private: private:
struct InvalidPolicy struct InvalidPolicy
{ {
@ -132,62 +132,63 @@ private:
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>; using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
static AstNodePtr disjunction(ParsedRegex& parsed_regex, Iterator& pos, Iterator end, unsigned capture = -1) AstNodePtr disjunction(unsigned capture = -1)
{ {
AstNodePtr node = alternative(parsed_regex, pos, end); AstNodePtr node = alternative();
if (pos == end or *pos != '|') if (at_end() or *m_pos != '|')
{ {
node->value = capture; node->value = capture;
return node; return node;
} }
++m_pos;
AstNodePtr res = make_ast_node(Op::Alternation); AstNodePtr res = make_ast_node(Op::Alternation);
res->children.push_back(std::move(node)); res->children.push_back(std::move(node));
res->children.push_back(disjunction(parsed_regex, ++pos, end)); res->children.push_back(disjunction());
res->value = capture; res->value = capture;
return res; return res;
} }
static AstNodePtr alternative(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) AstNodePtr alternative()
{ {
AstNodePtr res = make_ast_node(Op::Sequence); AstNodePtr res = make_ast_node(Op::Sequence);
while (auto node = term(parsed_regex, pos, end)) while (auto node = term())
res->children.push_back(std::move(node)); res->children.push_back(std::move(node));
if (res->children.empty()) if (res->children.empty())
throw runtime_error{"Parse error in alternative"}; parse_error("empty alternative");
return res; return res;
} }
static AstNodePtr term(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) AstNodePtr term()
{ {
if (auto node = assertion(parsed_regex, pos, end)) if (auto node = assertion())
return node; return node;
if (auto node = atom(parsed_regex, pos, end)) if (auto node = atom())
{ {
node->quantifier = quantifier(parsed_regex, pos, end); node->quantifier = quantifier();
return node; return node;
} }
return nullptr; return nullptr;
} }
static AstNodePtr assertion(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) AstNodePtr assertion()
{ {
if (pos == end) if (at_end())
return nullptr; return nullptr;
switch (*pos) switch (*m_pos)
{ {
case '^': ++pos; return make_ast_node(Op::LineStart); case '^': ++m_pos; return make_ast_node(Op::LineStart);
case '$': ++pos; return make_ast_node(Op::LineEnd); case '$': ++m_pos; return make_ast_node(Op::LineEnd);
case '\\': case '\\':
if (pos+1 == end) if (m_pos+1 == m_regex.end())
return nullptr; return nullptr;
switch (*(pos+1)) switch (*(m_pos+1))
{ {
case 'b': pos += 2; return make_ast_node(Op::WordBoundary); case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary);
case 'B': pos += 2; return make_ast_node(Op::NotWordBoundary); case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary);
case '`': pos += 2; return make_ast_node(Op::SubjectBegin); case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin);
case '\'': pos += 2; return make_ast_node(Op::SubjectEnd); case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd);
} }
break; break;
/* TODO: look ahead, look behind */ /* TODO: look ahead, look behind */
@ -195,50 +196,50 @@ private:
return nullptr; return nullptr;
} }
static AstNodePtr atom(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) AstNodePtr atom()
{ {
if (pos == end) if (at_end())
return nullptr; return nullptr;
const Codepoint cp = *pos; const Codepoint cp = *m_pos;
switch (cp) switch (cp)
{ {
case '.': ++pos; return make_ast_node(Op::AnyChar); case '.': ++m_pos; return make_ast_node(Op::AnyChar);
case '(': case '(':
{ {
++pos; ++m_pos;
auto content = disjunction(parsed_regex, pos, end, parsed_regex.capture_count++); auto content = disjunction(m_parsed_regex.capture_count++);
if (pos == end or *pos != ')') if (at_end() or *m_pos != ')')
throw runtime_error{"Unclosed parenthesis"}; parse_error("unclosed parenthesis");
++pos; ++m_pos;
return content; return content;
} }
case '\\': case '\\':
++pos; ++m_pos;
return atom_escape(parsed_regex, pos, end); return atom_escape();
case '[': case '[':
++pos; ++m_pos;
return character_class(parsed_regex, pos, end); return character_class();
default: default:
if (contains("^$.*+?()[]{}|", cp)) if (contains("^$.*+?()[]{}|", cp))
return nullptr; return nullptr;
++pos; ++m_pos;
return make_ast_node(Op::Literal, cp); return make_ast_node(Op::Literal, cp);
} }
} }
static AstNodePtr atom_escape(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) AstNodePtr atom_escape()
{ {
const Codepoint cp = *pos++; const Codepoint cp = *m_pos++;
// CharacterClassEscape // CharacterClassEscape
for (auto& character_class : character_class_escapes) for (auto& character_class : character_class_escapes)
{ {
if (character_class.cp == cp) if (character_class.cp == cp)
{ {
auto matcher_id = parsed_regex.matchers.size(); auto matcher_id = m_parsed_regex.matchers.size();
parsed_regex.matchers.push_back( m_parsed_regex.matchers.push_back(
[ctype = wctype(character_class.ctype), [ctype = wctype(character_class.ctype),
chars = character_class.additional_chars] (Codepoint cp) { chars = character_class.additional_chars] (Codepoint cp) {
return iswctype(cp, ctype) or contains(chars, cp); return iswctype(cp, ctype) or contains(chars, cp);
@ -261,57 +262,57 @@ private:
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return make_ast_node(Op::Literal, cp); return make_ast_node(Op::Literal, cp);
throw runtime_error{"Unknown atom escape"}; parse_error("unknown atom escape");
} }
static AstNodePtr character_class(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) AstNodePtr character_class()
{ {
const bool negative = pos != end and *pos == '^'; const bool negative = m_pos != m_regex.end() and *m_pos == '^';
if (negative) if (negative)
++pos; ++m_pos;
Vector<CharRange> ranges; Vector<CharRange> ranges;
Vector<std::pair<wctype_t, bool>> ctypes; Vector<std::pair<wctype_t, bool>> ctypes;
while (pos != end and *pos != ']') while (m_pos != m_regex.end() and *m_pos != ']')
{ {
const auto cp = *pos++; const auto cp = *m_pos++;
if (cp == '-') if (cp == '-')
{ {
ranges.push_back({ '-', '-' }); ranges.push_back({ '-', '-' });
continue; continue;
} }
if (pos == end) if (at_end())
break; break;
if (cp == '\\') if (cp == '\\')
{ {
auto it = find_if(character_class_escapes, auto it = find_if(character_class_escapes,
[cp = *pos](auto& t) { return t.cp == cp; }); [cp = *m_pos](auto& t) { return t.cp == cp; });
if (it != std::end(character_class_escapes)) if (it != std::end(character_class_escapes))
{ {
ctypes.push_back({wctype(it->ctype), not it->neg}); ctypes.push_back({wctype(it->ctype), not it->neg});
for (auto& c : it->additional_chars) for (auto& c : it->additional_chars)
ranges.push_back({(Codepoint)c, (Codepoint)c}); ranges.push_back({(Codepoint)c, (Codepoint)c});
++pos; ++m_pos;
continue; continue;
} }
} }
CharRange range = { cp, cp }; CharRange range = { cp, cp };
if (*pos == '-') if (*m_pos == '-')
{ {
if (++pos == end) if (++m_pos == m_regex.end())
break; break;
range.max = *pos++; range.max = *m_pos++;
if (range.min > range.max) if (range.min > range.max)
throw runtime_error{"Invalid range specified"}; parse_error("invalid range specified");
} }
ranges.push_back(range); ranges.push_back(range);
} }
if (pos == end) if (at_end())
throw runtime_error{"Unclosed character class"}; parse_error("unclosed character class");
++pos; ++m_pos;
auto matcher = [ranges = std::move(ranges), auto matcher = [ranges = std::move(ranges),
ctypes = std::move(ctypes), negative] (Codepoint cp) { ctypes = std::move(ctypes), negative] (Codepoint cp) {
@ -323,18 +324,18 @@ private:
return negative ? not found : found; return negative ? not found : found;
}; };
auto matcher_id = parsed_regex.matchers.size(); auto matcher_id = m_parsed_regex.matchers.size();
parsed_regex.matchers.push_back(std::move(matcher)); m_parsed_regex.matchers.push_back(std::move(matcher));
return make_ast_node(Op::Matcher, matcher_id); return make_ast_node(Op::Matcher, matcher_id);
} }
static Quantifier quantifier(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) Quantifier quantifier()
{ {
if (pos == end) if (at_end())
return {Quantifier::One}; return {Quantifier::One};
auto read_int = [](Iterator& pos, Iterator begin, Iterator end) { auto read_int = [](auto& pos, auto begin, auto end) {
int res = 0; int res = 0;
for (; pos != end; ++pos) for (; pos != end; ++pos)
{ {
@ -346,30 +347,44 @@ private:
return res; return res;
}; };
switch (*pos) switch (*m_pos)
{ {
case '*': ++pos; return {Quantifier::RepeatZeroOrMore}; case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore};
case '+': ++pos; return {Quantifier::RepeatOneOrMore}; case '+': ++m_pos; return {Quantifier::RepeatOneOrMore};
case '?': ++pos; return {Quantifier::Optional}; case '?': ++m_pos; return {Quantifier::Optional};
case '{': case '{':
{ {
auto it = pos+1; auto it = m_pos+1;
const int min = read_int(it, it, end); const int min = read_int(it, it, m_regex.end());
int max = min; int max = min;
if (*it == ',') if (*it == ',')
{ {
++it; ++it;
max = read_int(it, it, end); max = read_int(it, it, m_regex.end());
} }
if (*it++ != '}') if (*it++ != '}')
throw runtime_error{"expected closing bracket"}; parse_error("expected closing bracket");
pos = it; m_pos = it;
return {Quantifier::RepeatMinMax, min, max}; return {Quantifier::RepeatMinMax, min, max};
} }
default: return {Quantifier::One}; default: return {Quantifier::One};
} }
} }
bool at_end() const { return m_pos == m_regex.end(); }
[[gnu::noreturn]]
void parse_error(StringView error)
{
throw runtime_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error,
StringView{m_regex.begin(), m_pos.base()},
StringView{m_pos.base(), m_regex.end()}));
}
ParsedRegex m_parsed_regex;
StringView m_regex;
Iterator m_pos;
struct CharacterClassEscape { struct CharacterClassEscape {
Codepoint cp; Codepoint cp;
const char* ctype; const char* ctype;
@ -381,7 +396,7 @@ private:
}; };
// For some reason Gcc fails to link if this is constexpr // For some reason Gcc fails to link if this is constexpr
const Parser::CharacterClassEscape Parser::character_class_escapes[6] = { const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6] = {
{ 'd', "digit", "", false }, { 'd', "digit", "", false },
{ 'D', "digit", "", true }, { 'D', "digit", "", true },
{ 'w', "alnum", "_", false }, { 'w', "alnum", "_", false },
@ -548,7 +563,7 @@ CompiledRegex compile(const ParsedRegex& parsed_regex)
CompiledRegex compile(StringView re) CompiledRegex compile(StringView re)
{ {
return compile(Parser::parse(re)); return compile(RegexParser{re}.get_parsed_regex());
} }
} }
@ -799,11 +814,11 @@ void validate_regex(StringView re)
{ {
try try
{ {
RegexCompiler::Parser::parse(re); RegexCompiler::RegexParser{re};
} }
catch (runtime_error& err) catch (runtime_error& err)
{ {
write_to_debug_buffer(format("regex-impl: <<{}>> failed to parse: {}", re, err.what())); write_to_debug_buffer(err.what());
} }
} }