Regex: Rework parsing, treat lookarounds as assertions, and flags separately

This commit is contained in:
Maxime Coste 2017-10-13 10:44:24 +08:00
parent b0233262b8
commit b8495f0953

View File

@ -135,6 +135,8 @@ private:
AstNodePtr term() AstNodePtr term()
{ {
while (flag()) // read all flags
{}
if (auto node = assertion()) if (auto node = assertion())
return node; return node;
if (auto node = atom()) if (auto node = atom())
@ -145,6 +147,34 @@ private:
return nullptr; return nullptr;
} }
// Returns true when the input starting at the current parse position
// (m_pos) matches every element of `expected`, without consuming anything.
// Returns false on the first mismatch or if the regex text ends before
// `expected` has been fully compared. An empty `expected` yields true.
bool peek(StringView expected) const
{
    // Work on a copy so the parser's own position is left untouched.
    auto cursor = m_pos;
    Iterator wanted{expected.begin(), expected};
    while (wanted != expected.end())
    {
        if (cursor == m_regex.end())
            return false; // ran out of input before the end of `expected`
        if (*cursor != *wanted)
            return false;
        ++cursor;
        ++wanted;
    }
    return true;
}
// Tries to consume one inline flag at the current position.
// "(?i)" sets m_ignore_case to true and "(?I)" sets it to false; in both
// cases the position advances past the flag and true is returned.
// Returns false, leaving the parser state untouched, when neither is present.
bool flag()
{
    const bool enable = peek("(?i)");
    if (not enable and not peek("(?I)"))
        return false;

    m_ignore_case = enable;
    m_pos += 4; // both flag literals are four characters long
    return true;
}
AstNodePtr assertion() AstNodePtr assertion()
{ {
if (at_end()) if (at_end())
@ -166,6 +196,34 @@ private:
case 'K': m_pos += 2; return new_node(ParsedRegex::ResetStart); case 'K': m_pos += 2; return new_node(ParsedRegex::ResetStart);
} }
break; break;
case '(':
{
Optional<ParsedRegex::Op> lookaround_op;
constexpr struct { StringView prefix; ParsedRegex::Op op; } lookarounds[] = {
{ "(?=", ParsedRegex::LookAhead },
{ "(?!", ParsedRegex::NegativeLookAhead },
{ "(?<=", ParsedRegex::LookBehind },
{ "(?<!", ParsedRegex::NegativeLookBehind }
};
for (auto& lookaround : lookarounds)
{
if (peek(lookaround.prefix))
{
lookaround_op = lookaround.op;
m_pos += (int)lookaround.prefix.char_length();
break;
}
}
if (not lookaround_op)
return nullptr;
AstNodePtr lookaround = alternative(*lookaround_op);
if (at_end() or *m_pos++ != ')')
parse_error("unclosed parenthesis");
validate_lookaround(lookaround);
return lookaround;
}
} }
return nullptr; return nullptr;
} }
@ -180,60 +238,18 @@ private:
{ {
case '.': ++m_pos; return new_node(ParsedRegex::AnyChar); case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
case '(': case '(':
{
auto advance = [&]() {
if (++m_pos == m_regex.end())
parse_error("unclosed parenthesis");
return *m_pos;
};
AstNodePtr content;
if (advance() == '?')
{
auto c = advance();
if (c == ':')
{ {
++m_pos; ++m_pos;
content = disjunction(-1); bool capture = true;
} if (peek("?:"))
else if (contains("=!<", c))
{ {
bool behind = false; capture = false;
if (c == '<') m_pos += 2;
{
advance();
behind = true;
} }
auto type = *m_pos++; AstNodePtr content = disjunction(capture ? m_parsed_regex.capture_count++ : -1);
if (type == '=') if (at_end() or *m_pos++ != ')')
content = alternative(behind ? ParsedRegex::LookBehind
: ParsedRegex::LookAhead);
else if (type == '!')
content = alternative(behind ? ParsedRegex::NegativeLookBehind
: ParsedRegex::NegativeLookAhead);
else
parse_error("invalid disjunction");
validate_lookaround(content);
}
else if (c == 'i' or c == 'I')
{
m_ignore_case = c == 'i';
if (advance() != ')')
parse_error("unclosed parenthesis"); parse_error("unclosed parenthesis");
++m_pos;
return atom(); // get next atom
}
else
parse_error("invalid disjunction");
}
else
content = disjunction(m_parsed_regex.capture_count++);
if (at_end() or *m_pos != ')')
parse_error("unclosed parenthesis");
++m_pos;
return content; return content;
} }
case '\\': case '\\':
@ -473,7 +489,7 @@ private:
bool at_end() const { return m_pos == m_regex.end(); } bool at_end() const { return m_pos == m_regex.end(); }
[[gnu::noreturn]] [[gnu::noreturn]]
void parse_error(StringView error) void parse_error(StringView error) const
{ {
throw regex_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error, throw regex_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error,
StringView{m_regex.begin(), m_pos.base()}, StringView{m_regex.begin(), m_pos.base()},