Regex: improve regex parse error reporting
Display the place where parsing failed, refactor code to make RegexParser a regular object.
This commit is contained in:
parent
080160553c
commit
d5717edc9d
|
@ -113,17 +113,17 @@ AstNodePtr make_ast_node(Op op, Codepoint value = -1,
|
|||
|
||||
// Recursive descent parser based on naming used in the ECMAScript
|
||||
// standard, although the syntax is not fully compatible.
|
||||
struct Parser
|
||||
struct RegexParser
|
||||
{
|
||||
static ParsedRegex parse(StringView re)
|
||||
RegexParser(StringView re)
|
||||
: m_regex{re}, m_pos{re.begin(), re}
|
||||
{
|
||||
ParsedRegex res;
|
||||
res.capture_count = 1;
|
||||
Iterator pos{re.begin(), re}, end{re.end(), re};
|
||||
res.ast = disjunction(res, pos, end, 0);
|
||||
return res;
|
||||
m_parsed_regex.capture_count = 1;
|
||||
m_parsed_regex.ast = disjunction(0);
|
||||
}
|
||||
|
||||
ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
|
||||
|
||||
private:
|
||||
struct InvalidPolicy
|
||||
{
|
||||
|
@ -132,62 +132,63 @@ private:
|
|||
|
||||
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
|
||||
|
||||
static AstNodePtr disjunction(ParsedRegex& parsed_regex, Iterator& pos, Iterator end, unsigned capture = -1)
|
||||
AstNodePtr disjunction(unsigned capture = -1)
|
||||
{
|
||||
AstNodePtr node = alternative(parsed_regex, pos, end);
|
||||
if (pos == end or *pos != '|')
|
||||
AstNodePtr node = alternative();
|
||||
if (at_end() or *m_pos != '|')
|
||||
{
|
||||
node->value = capture;
|
||||
return node;
|
||||
}
|
||||
|
||||
++m_pos;
|
||||
AstNodePtr res = make_ast_node(Op::Alternation);
|
||||
res->children.push_back(std::move(node));
|
||||
res->children.push_back(disjunction(parsed_regex, ++pos, end));
|
||||
res->children.push_back(disjunction());
|
||||
res->value = capture;
|
||||
return res;
|
||||
}
|
||||
|
||||
static AstNodePtr alternative(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
||||
AstNodePtr alternative()
|
||||
{
|
||||
AstNodePtr res = make_ast_node(Op::Sequence);
|
||||
while (auto node = term(parsed_regex, pos, end))
|
||||
while (auto node = term())
|
||||
res->children.push_back(std::move(node));
|
||||
if (res->children.empty())
|
||||
throw runtime_error{"Parse error in alternative"};
|
||||
parse_error("empty alternative");
|
||||
return res;
|
||||
}
|
||||
|
||||
static AstNodePtr term(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
||||
AstNodePtr term()
|
||||
{
|
||||
if (auto node = assertion(parsed_regex, pos, end))
|
||||
if (auto node = assertion())
|
||||
return node;
|
||||
if (auto node = atom(parsed_regex, pos, end))
|
||||
if (auto node = atom())
|
||||
{
|
||||
node->quantifier = quantifier(parsed_regex, pos, end);
|
||||
node->quantifier = quantifier();
|
||||
return node;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static AstNodePtr assertion(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
||||
AstNodePtr assertion()
|
||||
{
|
||||
if (pos == end)
|
||||
if (at_end())
|
||||
return nullptr;
|
||||
|
||||
switch (*pos)
|
||||
switch (*m_pos)
|
||||
{
|
||||
case '^': ++pos; return make_ast_node(Op::LineStart);
|
||||
case '$': ++pos; return make_ast_node(Op::LineEnd);
|
||||
case '^': ++m_pos; return make_ast_node(Op::LineStart);
|
||||
case '$': ++m_pos; return make_ast_node(Op::LineEnd);
|
||||
case '\\':
|
||||
if (pos+1 == end)
|
||||
if (m_pos+1 == m_regex.end())
|
||||
return nullptr;
|
||||
switch (*(pos+1))
|
||||
switch (*(m_pos+1))
|
||||
{
|
||||
case 'b': pos += 2; return make_ast_node(Op::WordBoundary);
|
||||
case 'B': pos += 2; return make_ast_node(Op::NotWordBoundary);
|
||||
case '`': pos += 2; return make_ast_node(Op::SubjectBegin);
|
||||
case '\'': pos += 2; return make_ast_node(Op::SubjectEnd);
|
||||
case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary);
|
||||
case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary);
|
||||
case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin);
|
||||
case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd);
|
||||
}
|
||||
break;
|
||||
/* TODO: look ahead, look behind */
|
||||
|
@ -195,50 +196,50 @@ private:
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
static AstNodePtr atom(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
||||
AstNodePtr atom()
|
||||
{
|
||||
if (pos == end)
|
||||
if (at_end())
|
||||
return nullptr;
|
||||
|
||||
const Codepoint cp = *pos;
|
||||
const Codepoint cp = *m_pos;
|
||||
switch (cp)
|
||||
{
|
||||
case '.': ++pos; return make_ast_node(Op::AnyChar);
|
||||
case '.': ++m_pos; return make_ast_node(Op::AnyChar);
|
||||
case '(':
|
||||
{
|
||||
++pos;
|
||||
auto content = disjunction(parsed_regex, pos, end, parsed_regex.capture_count++);
|
||||
++m_pos;
|
||||
auto content = disjunction(m_parsed_regex.capture_count++);
|
||||
|
||||
if (pos == end or *pos != ')')
|
||||
throw runtime_error{"Unclosed parenthesis"};
|
||||
++pos;
|
||||
if (at_end() or *m_pos != ')')
|
||||
parse_error("unclosed parenthesis");
|
||||
++m_pos;
|
||||
return content;
|
||||
}
|
||||
case '\\':
|
||||
++pos;
|
||||
return atom_escape(parsed_regex, pos, end);
|
||||
++m_pos;
|
||||
return atom_escape();
|
||||
case '[':
|
||||
++pos;
|
||||
return character_class(parsed_regex, pos, end);
|
||||
++m_pos;
|
||||
return character_class();
|
||||
default:
|
||||
if (contains("^$.*+?()[]{}|", cp))
|
||||
return nullptr;
|
||||
++pos;
|
||||
++m_pos;
|
||||
return make_ast_node(Op::Literal, cp);
|
||||
}
|
||||
}
|
||||
|
||||
static AstNodePtr atom_escape(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
||||
AstNodePtr atom_escape()
|
||||
{
|
||||
const Codepoint cp = *pos++;
|
||||
const Codepoint cp = *m_pos++;
|
||||
|
||||
// CharacterClassEscape
|
||||
for (auto& character_class : character_class_escapes)
|
||||
{
|
||||
if (character_class.cp == cp)
|
||||
{
|
||||
auto matcher_id = parsed_regex.matchers.size();
|
||||
parsed_regex.matchers.push_back(
|
||||
auto matcher_id = m_parsed_regex.matchers.size();
|
||||
m_parsed_regex.matchers.push_back(
|
||||
[ctype = wctype(character_class.ctype),
|
||||
chars = character_class.additional_chars] (Codepoint cp) {
|
||||
return iswctype(cp, ctype) or contains(chars, cp);
|
||||
|
@ -261,57 +262,57 @@ private:
|
|||
|
||||
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
||||
return make_ast_node(Op::Literal, cp);
|
||||
throw runtime_error{"Unknown atom escape"};
|
||||
parse_error("unknown atom escape");
|
||||
}
|
||||
|
||||
static AstNodePtr character_class(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
||||
AstNodePtr character_class()
|
||||
{
|
||||
const bool negative = pos != end and *pos == '^';
|
||||
const bool negative = m_pos != m_regex.end() and *m_pos == '^';
|
||||
if (negative)
|
||||
++pos;
|
||||
++m_pos;
|
||||
|
||||
Vector<CharRange> ranges;
|
||||
Vector<std::pair<wctype_t, bool>> ctypes;
|
||||
while (pos != end and *pos != ']')
|
||||
while (m_pos != m_regex.end() and *m_pos != ']')
|
||||
{
|
||||
const auto cp = *pos++;
|
||||
const auto cp = *m_pos++;
|
||||
if (cp == '-')
|
||||
{
|
||||
ranges.push_back({ '-', '-' });
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pos == end)
|
||||
if (at_end())
|
||||
break;
|
||||
|
||||
if (cp == '\\')
|
||||
{
|
||||
auto it = find_if(character_class_escapes,
|
||||
[cp = *pos](auto& t) { return t.cp == cp; });
|
||||
[cp = *m_pos](auto& t) { return t.cp == cp; });
|
||||
if (it != std::end(character_class_escapes))
|
||||
{
|
||||
ctypes.push_back({wctype(it->ctype), not it->neg});
|
||||
for (auto& c : it->additional_chars)
|
||||
ranges.push_back({(Codepoint)c, (Codepoint)c});
|
||||
++pos;
|
||||
++m_pos;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
CharRange range = { cp, cp };
|
||||
if (*pos == '-')
|
||||
if (*m_pos == '-')
|
||||
{
|
||||
if (++pos == end)
|
||||
if (++m_pos == m_regex.end())
|
||||
break;
|
||||
range.max = *pos++;
|
||||
range.max = *m_pos++;
|
||||
if (range.min > range.max)
|
||||
throw runtime_error{"Invalid range specified"};
|
||||
parse_error("invalid range specified");
|
||||
}
|
||||
ranges.push_back(range);
|
||||
}
|
||||
if (pos == end)
|
||||
throw runtime_error{"Unclosed character class"};
|
||||
++pos;
|
||||
if (at_end())
|
||||
parse_error("unclosed character class");
|
||||
++m_pos;
|
||||
|
||||
auto matcher = [ranges = std::move(ranges),
|
||||
ctypes = std::move(ctypes), negative] (Codepoint cp) {
|
||||
|
@ -323,18 +324,18 @@ private:
|
|||
return negative ? not found : found;
|
||||
};
|
||||
|
||||
auto matcher_id = parsed_regex.matchers.size();
|
||||
parsed_regex.matchers.push_back(std::move(matcher));
|
||||
auto matcher_id = m_parsed_regex.matchers.size();
|
||||
m_parsed_regex.matchers.push_back(std::move(matcher));
|
||||
|
||||
return make_ast_node(Op::Matcher, matcher_id);
|
||||
}
|
||||
|
||||
static Quantifier quantifier(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
||||
Quantifier quantifier()
|
||||
{
|
||||
if (pos == end)
|
||||
if (at_end())
|
||||
return {Quantifier::One};
|
||||
|
||||
auto read_int = [](Iterator& pos, Iterator begin, Iterator end) {
|
||||
auto read_int = [](auto& pos, auto begin, auto end) {
|
||||
int res = 0;
|
||||
for (; pos != end; ++pos)
|
||||
{
|
||||
|
@ -346,30 +347,44 @@ private:
|
|||
return res;
|
||||
};
|
||||
|
||||
switch (*pos)
|
||||
switch (*m_pos)
|
||||
{
|
||||
case '*': ++pos; return {Quantifier::RepeatZeroOrMore};
|
||||
case '+': ++pos; return {Quantifier::RepeatOneOrMore};
|
||||
case '?': ++pos; return {Quantifier::Optional};
|
||||
case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore};
|
||||
case '+': ++m_pos; return {Quantifier::RepeatOneOrMore};
|
||||
case '?': ++m_pos; return {Quantifier::Optional};
|
||||
case '{':
|
||||
{
|
||||
auto it = pos+1;
|
||||
const int min = read_int(it, it, end);
|
||||
auto it = m_pos+1;
|
||||
const int min = read_int(it, it, m_regex.end());
|
||||
int max = min;
|
||||
if (*it == ',')
|
||||
{
|
||||
++it;
|
||||
max = read_int(it, it, end);
|
||||
max = read_int(it, it, m_regex.end());
|
||||
}
|
||||
if (*it++ != '}')
|
||||
throw runtime_error{"expected closing bracket"};
|
||||
pos = it;
|
||||
parse_error("expected closing bracket");
|
||||
m_pos = it;
|
||||
return {Quantifier::RepeatMinMax, min, max};
|
||||
}
|
||||
default: return {Quantifier::One};
|
||||
}
|
||||
}
|
||||
|
||||
bool at_end() const { return m_pos == m_regex.end(); }
|
||||
|
||||
[[gnu::noreturn]]
|
||||
void parse_error(StringView error)
|
||||
{
|
||||
throw runtime_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error,
|
||||
StringView{m_regex.begin(), m_pos.base()},
|
||||
StringView{m_pos.base(), m_regex.end()}));
|
||||
}
|
||||
|
||||
ParsedRegex m_parsed_regex;
|
||||
StringView m_regex;
|
||||
Iterator m_pos;
|
||||
|
||||
struct CharacterClassEscape {
|
||||
Codepoint cp;
|
||||
const char* ctype;
|
||||
|
@ -381,7 +396,7 @@ private:
|
|||
};
|
||||
|
||||
// For some reason Gcc fails to link if this is constexpr
|
||||
const Parser::CharacterClassEscape Parser::character_class_escapes[6] = {
|
||||
const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6] = {
|
||||
{ 'd', "digit", "", false },
|
||||
{ 'D', "digit", "", true },
|
||||
{ 'w', "alnum", "_", false },
|
||||
|
@ -548,7 +563,7 @@ CompiledRegex compile(const ParsedRegex& parsed_regex)
|
|||
|
||||
CompiledRegex compile(StringView re)
|
||||
{
|
||||
return compile(Parser::parse(re));
|
||||
return compile(RegexParser{re}.get_parsed_regex());
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -799,11 +814,11 @@ void validate_regex(StringView re)
|
|||
{
|
||||
try
|
||||
{
|
||||
RegexCompiler::Parser::parse(re);
|
||||
RegexCompiler::RegexParser{re};
|
||||
}
|
||||
catch (runtime_error& err)
|
||||
{
|
||||
write_to_debug_buffer(format("regex-impl: <<{}>> failed to parse: {}", re, err.what()));
|
||||
write_to_debug_buffer(err.what());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user