Regex: improve regex parse error reporting
Display the position where parsing failed, and refactor the code to make RegexParser a regular object.
This commit is contained in:
parent
080160553c
commit
d5717edc9d
|
@ -113,17 +113,17 @@ AstNodePtr make_ast_node(Op op, Codepoint value = -1,
|
||||||
|
|
||||||
// Recursive descent parser based on naming used in the ECMAScript
|
// Recursive descent parser based on naming used in the ECMAScript
|
||||||
// standard, although the syntax is not fully compatible.
|
// standard, although the syntax is not fully compatible.
|
||||||
struct Parser
|
struct RegexParser
|
||||||
{
|
{
|
||||||
static ParsedRegex parse(StringView re)
|
RegexParser(StringView re)
|
||||||
|
: m_regex{re}, m_pos{re.begin(), re}
|
||||||
{
|
{
|
||||||
ParsedRegex res;
|
m_parsed_regex.capture_count = 1;
|
||||||
res.capture_count = 1;
|
m_parsed_regex.ast = disjunction(0);
|
||||||
Iterator pos{re.begin(), re}, end{re.end(), re};
|
|
||||||
res.ast = disjunction(res, pos, end, 0);
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct InvalidPolicy
|
struct InvalidPolicy
|
||||||
{
|
{
|
||||||
|
@ -132,62 +132,63 @@ private:
|
||||||
|
|
||||||
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
|
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
|
||||||
|
|
||||||
static AstNodePtr disjunction(ParsedRegex& parsed_regex, Iterator& pos, Iterator end, unsigned capture = -1)
|
AstNodePtr disjunction(unsigned capture = -1)
|
||||||
{
|
{
|
||||||
AstNodePtr node = alternative(parsed_regex, pos, end);
|
AstNodePtr node = alternative();
|
||||||
if (pos == end or *pos != '|')
|
if (at_end() or *m_pos != '|')
|
||||||
{
|
{
|
||||||
node->value = capture;
|
node->value = capture;
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
++m_pos;
|
||||||
AstNodePtr res = make_ast_node(Op::Alternation);
|
AstNodePtr res = make_ast_node(Op::Alternation);
|
||||||
res->children.push_back(std::move(node));
|
res->children.push_back(std::move(node));
|
||||||
res->children.push_back(disjunction(parsed_regex, ++pos, end));
|
res->children.push_back(disjunction());
|
||||||
res->value = capture;
|
res->value = capture;
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr alternative(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
AstNodePtr alternative()
|
||||||
{
|
{
|
||||||
AstNodePtr res = make_ast_node(Op::Sequence);
|
AstNodePtr res = make_ast_node(Op::Sequence);
|
||||||
while (auto node = term(parsed_regex, pos, end))
|
while (auto node = term())
|
||||||
res->children.push_back(std::move(node));
|
res->children.push_back(std::move(node));
|
||||||
if (res->children.empty())
|
if (res->children.empty())
|
||||||
throw runtime_error{"Parse error in alternative"};
|
parse_error("empty alternative");
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr term(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
AstNodePtr term()
|
||||||
{
|
{
|
||||||
if (auto node = assertion(parsed_regex, pos, end))
|
if (auto node = assertion())
|
||||||
return node;
|
return node;
|
||||||
if (auto node = atom(parsed_regex, pos, end))
|
if (auto node = atom())
|
||||||
{
|
{
|
||||||
node->quantifier = quantifier(parsed_regex, pos, end);
|
node->quantifier = quantifier();
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr assertion(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
AstNodePtr assertion()
|
||||||
{
|
{
|
||||||
if (pos == end)
|
if (at_end())
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
switch (*pos)
|
switch (*m_pos)
|
||||||
{
|
{
|
||||||
case '^': ++pos; return make_ast_node(Op::LineStart);
|
case '^': ++m_pos; return make_ast_node(Op::LineStart);
|
||||||
case '$': ++pos; return make_ast_node(Op::LineEnd);
|
case '$': ++m_pos; return make_ast_node(Op::LineEnd);
|
||||||
case '\\':
|
case '\\':
|
||||||
if (pos+1 == end)
|
if (m_pos+1 == m_regex.end())
|
||||||
return nullptr;
|
return nullptr;
|
||||||
switch (*(pos+1))
|
switch (*(m_pos+1))
|
||||||
{
|
{
|
||||||
case 'b': pos += 2; return make_ast_node(Op::WordBoundary);
|
case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary);
|
||||||
case 'B': pos += 2; return make_ast_node(Op::NotWordBoundary);
|
case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary);
|
||||||
case '`': pos += 2; return make_ast_node(Op::SubjectBegin);
|
case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin);
|
||||||
case '\'': pos += 2; return make_ast_node(Op::SubjectEnd);
|
case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
/* TODO: look ahead, look behind */
|
/* TODO: look ahead, look behind */
|
||||||
|
@ -195,50 +196,50 @@ private:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr atom(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
AstNodePtr atom()
|
||||||
{
|
{
|
||||||
if (pos == end)
|
if (at_end())
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
const Codepoint cp = *pos;
|
const Codepoint cp = *m_pos;
|
||||||
switch (cp)
|
switch (cp)
|
||||||
{
|
{
|
||||||
case '.': ++pos; return make_ast_node(Op::AnyChar);
|
case '.': ++m_pos; return make_ast_node(Op::AnyChar);
|
||||||
case '(':
|
case '(':
|
||||||
{
|
{
|
||||||
++pos;
|
++m_pos;
|
||||||
auto content = disjunction(parsed_regex, pos, end, parsed_regex.capture_count++);
|
auto content = disjunction(m_parsed_regex.capture_count++);
|
||||||
|
|
||||||
if (pos == end or *pos != ')')
|
if (at_end() or *m_pos != ')')
|
||||||
throw runtime_error{"Unclosed parenthesis"};
|
parse_error("unclosed parenthesis");
|
||||||
++pos;
|
++m_pos;
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
case '\\':
|
case '\\':
|
||||||
++pos;
|
++m_pos;
|
||||||
return atom_escape(parsed_regex, pos, end);
|
return atom_escape();
|
||||||
case '[':
|
case '[':
|
||||||
++pos;
|
++m_pos;
|
||||||
return character_class(parsed_regex, pos, end);
|
return character_class();
|
||||||
default:
|
default:
|
||||||
if (contains("^$.*+?()[]{}|", cp))
|
if (contains("^$.*+?()[]{}|", cp))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
++pos;
|
++m_pos;
|
||||||
return make_ast_node(Op::Literal, cp);
|
return make_ast_node(Op::Literal, cp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr atom_escape(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
AstNodePtr atom_escape()
|
||||||
{
|
{
|
||||||
const Codepoint cp = *pos++;
|
const Codepoint cp = *m_pos++;
|
||||||
|
|
||||||
// CharacterClassEscape
|
// CharacterClassEscape
|
||||||
for (auto& character_class : character_class_escapes)
|
for (auto& character_class : character_class_escapes)
|
||||||
{
|
{
|
||||||
if (character_class.cp == cp)
|
if (character_class.cp == cp)
|
||||||
{
|
{
|
||||||
auto matcher_id = parsed_regex.matchers.size();
|
auto matcher_id = m_parsed_regex.matchers.size();
|
||||||
parsed_regex.matchers.push_back(
|
m_parsed_regex.matchers.push_back(
|
||||||
[ctype = wctype(character_class.ctype),
|
[ctype = wctype(character_class.ctype),
|
||||||
chars = character_class.additional_chars] (Codepoint cp) {
|
chars = character_class.additional_chars] (Codepoint cp) {
|
||||||
return iswctype(cp, ctype) or contains(chars, cp);
|
return iswctype(cp, ctype) or contains(chars, cp);
|
||||||
|
@ -261,57 +262,57 @@ private:
|
||||||
|
|
||||||
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
||||||
return make_ast_node(Op::Literal, cp);
|
return make_ast_node(Op::Literal, cp);
|
||||||
throw runtime_error{"Unknown atom escape"};
|
parse_error("unknown atom escape");
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr character_class(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
AstNodePtr character_class()
|
||||||
{
|
{
|
||||||
const bool negative = pos != end and *pos == '^';
|
const bool negative = m_pos != m_regex.end() and *m_pos == '^';
|
||||||
if (negative)
|
if (negative)
|
||||||
++pos;
|
++m_pos;
|
||||||
|
|
||||||
Vector<CharRange> ranges;
|
Vector<CharRange> ranges;
|
||||||
Vector<std::pair<wctype_t, bool>> ctypes;
|
Vector<std::pair<wctype_t, bool>> ctypes;
|
||||||
while (pos != end and *pos != ']')
|
while (m_pos != m_regex.end() and *m_pos != ']')
|
||||||
{
|
{
|
||||||
const auto cp = *pos++;
|
const auto cp = *m_pos++;
|
||||||
if (cp == '-')
|
if (cp == '-')
|
||||||
{
|
{
|
||||||
ranges.push_back({ '-', '-' });
|
ranges.push_back({ '-', '-' });
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pos == end)
|
if (at_end())
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if (cp == '\\')
|
if (cp == '\\')
|
||||||
{
|
{
|
||||||
auto it = find_if(character_class_escapes,
|
auto it = find_if(character_class_escapes,
|
||||||
[cp = *pos](auto& t) { return t.cp == cp; });
|
[cp = *m_pos](auto& t) { return t.cp == cp; });
|
||||||
if (it != std::end(character_class_escapes))
|
if (it != std::end(character_class_escapes))
|
||||||
{
|
{
|
||||||
ctypes.push_back({wctype(it->ctype), not it->neg});
|
ctypes.push_back({wctype(it->ctype), not it->neg});
|
||||||
for (auto& c : it->additional_chars)
|
for (auto& c : it->additional_chars)
|
||||||
ranges.push_back({(Codepoint)c, (Codepoint)c});
|
ranges.push_back({(Codepoint)c, (Codepoint)c});
|
||||||
++pos;
|
++m_pos;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
CharRange range = { cp, cp };
|
CharRange range = { cp, cp };
|
||||||
if (*pos == '-')
|
if (*m_pos == '-')
|
||||||
{
|
{
|
||||||
if (++pos == end)
|
if (++m_pos == m_regex.end())
|
||||||
break;
|
break;
|
||||||
range.max = *pos++;
|
range.max = *m_pos++;
|
||||||
if (range.min > range.max)
|
if (range.min > range.max)
|
||||||
throw runtime_error{"Invalid range specified"};
|
parse_error("invalid range specified");
|
||||||
}
|
}
|
||||||
ranges.push_back(range);
|
ranges.push_back(range);
|
||||||
}
|
}
|
||||||
if (pos == end)
|
if (at_end())
|
||||||
throw runtime_error{"Unclosed character class"};
|
parse_error("unclosed character class");
|
||||||
++pos;
|
++m_pos;
|
||||||
|
|
||||||
auto matcher = [ranges = std::move(ranges),
|
auto matcher = [ranges = std::move(ranges),
|
||||||
ctypes = std::move(ctypes), negative] (Codepoint cp) {
|
ctypes = std::move(ctypes), negative] (Codepoint cp) {
|
||||||
|
@ -323,18 +324,18 @@ private:
|
||||||
return negative ? not found : found;
|
return negative ? not found : found;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto matcher_id = parsed_regex.matchers.size();
|
auto matcher_id = m_parsed_regex.matchers.size();
|
||||||
parsed_regex.matchers.push_back(std::move(matcher));
|
m_parsed_regex.matchers.push_back(std::move(matcher));
|
||||||
|
|
||||||
return make_ast_node(Op::Matcher, matcher_id);
|
return make_ast_node(Op::Matcher, matcher_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Quantifier quantifier(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
|
Quantifier quantifier()
|
||||||
{
|
{
|
||||||
if (pos == end)
|
if (at_end())
|
||||||
return {Quantifier::One};
|
return {Quantifier::One};
|
||||||
|
|
||||||
auto read_int = [](Iterator& pos, Iterator begin, Iterator end) {
|
auto read_int = [](auto& pos, auto begin, auto end) {
|
||||||
int res = 0;
|
int res = 0;
|
||||||
for (; pos != end; ++pos)
|
for (; pos != end; ++pos)
|
||||||
{
|
{
|
||||||
|
@ -346,30 +347,44 @@ private:
|
||||||
return res;
|
return res;
|
||||||
};
|
};
|
||||||
|
|
||||||
switch (*pos)
|
switch (*m_pos)
|
||||||
{
|
{
|
||||||
case '*': ++pos; return {Quantifier::RepeatZeroOrMore};
|
case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore};
|
||||||
case '+': ++pos; return {Quantifier::RepeatOneOrMore};
|
case '+': ++m_pos; return {Quantifier::RepeatOneOrMore};
|
||||||
case '?': ++pos; return {Quantifier::Optional};
|
case '?': ++m_pos; return {Quantifier::Optional};
|
||||||
case '{':
|
case '{':
|
||||||
{
|
{
|
||||||
auto it = pos+1;
|
auto it = m_pos+1;
|
||||||
const int min = read_int(it, it, end);
|
const int min = read_int(it, it, m_regex.end());
|
||||||
int max = min;
|
int max = min;
|
||||||
if (*it == ',')
|
if (*it == ',')
|
||||||
{
|
{
|
||||||
++it;
|
++it;
|
||||||
max = read_int(it, it, end);
|
max = read_int(it, it, m_regex.end());
|
||||||
}
|
}
|
||||||
if (*it++ != '}')
|
if (*it++ != '}')
|
||||||
throw runtime_error{"expected closing bracket"};
|
parse_error("expected closing bracket");
|
||||||
pos = it;
|
m_pos = it;
|
||||||
return {Quantifier::RepeatMinMax, min, max};
|
return {Quantifier::RepeatMinMax, min, max};
|
||||||
}
|
}
|
||||||
default: return {Quantifier::One};
|
default: return {Quantifier::One};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool at_end() const { return m_pos == m_regex.end(); }
|
||||||
|
|
||||||
|
[[gnu::noreturn]]
|
||||||
|
void parse_error(StringView error)
|
||||||
|
{
|
||||||
|
throw runtime_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error,
|
||||||
|
StringView{m_regex.begin(), m_pos.base()},
|
||||||
|
StringView{m_pos.base(), m_regex.end()}));
|
||||||
|
}
|
||||||
|
|
||||||
|
ParsedRegex m_parsed_regex;
|
||||||
|
StringView m_regex;
|
||||||
|
Iterator m_pos;
|
||||||
|
|
||||||
struct CharacterClassEscape {
|
struct CharacterClassEscape {
|
||||||
Codepoint cp;
|
Codepoint cp;
|
||||||
const char* ctype;
|
const char* ctype;
|
||||||
|
@ -381,7 +396,7 @@ private:
|
||||||
};
|
};
|
||||||
|
|
||||||
// For some reason Gcc fails to link if this is constexpr
|
// For some reason Gcc fails to link if this is constexpr
|
||||||
const Parser::CharacterClassEscape Parser::character_class_escapes[6] = {
|
const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6] = {
|
||||||
{ 'd', "digit", "", false },
|
{ 'd', "digit", "", false },
|
||||||
{ 'D', "digit", "", true },
|
{ 'D', "digit", "", true },
|
||||||
{ 'w', "alnum", "_", false },
|
{ 'w', "alnum", "_", false },
|
||||||
|
@ -548,7 +563,7 @@ CompiledRegex compile(const ParsedRegex& parsed_regex)
|
||||||
|
|
||||||
CompiledRegex compile(StringView re)
|
CompiledRegex compile(StringView re)
|
||||||
{
|
{
|
||||||
return compile(Parser::parse(re));
|
return compile(RegexParser{re}.get_parsed_regex());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -799,11 +814,11 @@ void validate_regex(StringView re)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
RegexCompiler::Parser::parse(re);
|
RegexCompiler::RegexParser{re};
|
||||||
}
|
}
|
||||||
catch (runtime_error& err)
|
catch (runtime_error& err)
|
||||||
{
|
{
|
||||||
write_to_debug_buffer(format("regex-impl: <<{}>> failed to parse: {}", re, err.what()));
|
write_to_debug_buffer(err.what());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user