small regex impl code style tweak

This commit is contained in:
Maxime Coste 2021-11-25 08:56:52 +11:00
parent 08d2a8ba63
commit 16493a99bb

View File

@ -179,11 +179,10 @@ private:
NodeIndex disjunction(uint32_t capture = -1)
{
NodeIndex index = new_node(ParsedRegex::Alternation);
get_node(index).value = capture;
NodeIndex index = add_node(ParsedRegex::Alternation, capture);
while (true)
{
alternative();
alternative(ParsedRegex::Sequence);
if (at_end() or *m_pos != '|')
break;
++m_pos;
@ -193,10 +192,10 @@ private:
return index;
}
NodeIndex alternative(ParsedRegex::Op op = ParsedRegex::Sequence)
NodeIndex alternative(ParsedRegex::Op op)
{
NodeIndex index = new_node(op);
while (auto t = term())
NodeIndex index = add_node(op);
while (term())
{}
get_node(index).children_end = m_parsed_regex.nodes.size();
@ -248,18 +247,18 @@ private:
switch (*m_pos)
{
case '^': ++m_pos; return new_node(ParsedRegex::LineStart);
case '$': ++m_pos; return new_node(ParsedRegex::LineEnd);
case '^': ++m_pos; return add_node(ParsedRegex::LineStart);
case '$': ++m_pos; return add_node(ParsedRegex::LineEnd);
case '\\':
if (m_pos+1 == m_regex.end())
return {};
switch (*(m_pos+1))
{
case 'b': m_pos += 2; return new_node(ParsedRegex::WordBoundary);
case 'B': m_pos += 2; return new_node(ParsedRegex::NotWordBoundary);
case 'A': m_pos += 2; return new_node(ParsedRegex::SubjectBegin);
case 'z': m_pos += 2; return new_node(ParsedRegex::SubjectEnd);
case 'K': m_pos += 2; return new_node(ParsedRegex::ResetStart);
case 'b': m_pos += 2; return add_node(ParsedRegex::WordBoundary);
case 'B': m_pos += 2; return add_node(ParsedRegex::NotWordBoundary);
case 'A': m_pos += 2; return add_node(ParsedRegex::SubjectBegin);
case 'z': m_pos += 2; return add_node(ParsedRegex::SubjectEnd);
case 'K': m_pos += 2; return add_node(ParsedRegex::ResetStart);
}
break;
case '(':
@ -306,10 +305,7 @@ private:
{
case '.':
++m_pos;
if (m_flags & Flags::DotMatchesNewLine)
return new_node(ParsedRegex::AnyChar);
else
return new_node(ParsedRegex::AnyCharExceptNewLine);
return add_node((m_flags & Flags::DotMatchesNewLine) ? ParsedRegex::AnyChar : ParsedRegex::AnyCharExceptNewLine);
case '(':
{
uint32_t capture_group = -1;
@ -347,7 +343,7 @@ private:
if (contains("^$.*+?[]{}", cp) or (cp >= 0xF0000 and cp <= 0xFFFFF))
parse_error(format("unexpected '{}'", cp));
++m_pos;
return new_node(ParsedRegex::Literal, cp);
return add_node(ParsedRegex::Literal, cp);
}
}
@ -380,12 +376,12 @@ private:
if (cp == 'Q')
{
auto escaped_sequence = new_node(ParsedRegex::Sequence);
auto escaped_sequence = add_node(ParsedRegex::Sequence);
constexpr StringView end_mark{"\\E"};
auto quote_end = std::search(m_pos.base(), m_regex.end(), end_mark.begin(), end_mark.end());
while (m_pos != quote_end)
new_node(ParsedRegex::Literal, *m_pos++);
add_node(ParsedRegex::Literal, *m_pos++);
get_node(escaped_sequence).children_end = m_parsed_regex.nodes.size();
if (quote_end != m_regex.end())
@ -397,33 +393,33 @@ private:
// CharacterClassEscape
auto class_it = find_if(character_class_escapes, [cp](auto& c) { return c.cp == cp; });
if (class_it != std::end(character_class_escapes))
return new_node(ParsedRegex::CharType, (Codepoint)class_it->ctype);
return add_node(ParsedRegex::CharType, (Codepoint)class_it->ctype);
// CharacterEscape
for (auto& control : control_escapes)
{
if (control.name == cp)
return new_node(ParsedRegex::Literal, control.value);
return add_node(ParsedRegex::Literal, control.value);
}
if (cp == '0')
return new_node(ParsedRegex::Literal, '\0');
return add_node(ParsedRegex::Literal, '\0');
else if (cp == 'c')
{
if (at_end())
parse_error("unterminated control escape");
Codepoint ctrl = *m_pos++;
if (('a' <= ctrl and ctrl <= 'z') or ('A' <= ctrl and ctrl <= 'Z'))
return new_node(ParsedRegex::Literal, ctrl % 32);
return add_node(ParsedRegex::Literal, ctrl % 32);
parse_error(format("Invalid control escape character '{}'", ctrl));
}
else if (cp == 'x')
return new_node(ParsedRegex::Literal, read_hex(2));
return add_node(ParsedRegex::Literal, read_hex(2));
else if (cp == 'u')
return new_node(ParsedRegex::Literal, read_hex(6));
return add_node(ParsedRegex::Literal, read_hex(6));
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return new_node(ParsedRegex::Literal, cp);
return add_node(ParsedRegex::Literal, cp);
parse_error(format("unknown atom escape '{}'", cp));
}
@ -542,16 +538,16 @@ private:
if (character_class.ctypes == CharacterType::None and not character_class.negative and
character_class.ranges.size() == 1 and
character_class.ranges.front().min == character_class.ranges.front().max)
return new_node(ParsedRegex::Literal, character_class.ranges.front().min);
return add_node(ParsedRegex::Literal, character_class.ranges.front().min);
if (character_class.ctypes != CharacterType::None and not character_class.negative and
character_class.ranges.empty())
return new_node(ParsedRegex::CharType, (Codepoint)character_class.ctypes);
return add_node(ParsedRegex::CharType, (Codepoint)character_class.ctypes);
auto class_id = m_parsed_regex.character_classes.size();
m_parsed_regex.character_classes.push_back(std::move(character_class));
return new_node(ParsedRegex::CharClass, class_id);
return add_node(ParsedRegex::CharClass, class_id);
}
ParsedRegex::Quantifier quantifier()
@ -604,8 +600,7 @@ private:
}
}
NodeIndex new_node(ParsedRegex::Op op, Codepoint value = -1,
ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One})
NodeIndex add_node(ParsedRegex::Op op, Codepoint value = -1, ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One})
{
constexpr auto max_nodes = std::numeric_limits<int16_t>::max();
const NodeIndex res = m_parsed_regex.nodes.size();
@ -616,13 +611,12 @@ private:
return res;
}
bool at_end() const { return m_pos == m_regex.end(); }
ParsedRegex::Node& get_node(NodeIndex index)
{
return m_parsed_regex.nodes[index];
}
bool at_end() const { return m_pos == m_regex.end(); }
[[gnu::noreturn]]
void parse_error(StringView error) const