Use a dedicated VM op for dot (`.`) when the DotMatchesNewLine flag is not set

This commit is contained in:
Olivier Perret 2018-06-24 12:13:35 +02:00
parent b5ee1db1c4
commit 67655de947
2 changed files with 29 additions and 9 deletions

View File

@ -23,6 +23,7 @@ struct ParsedRegex
{ {
Literal, Literal,
AnyChar, AnyChar,
AnyCharExceptNewLine,
Class, Class,
CharacterType, CharacterType,
Sequence, Sequence,
@ -296,14 +297,7 @@ private:
if (m_flags & Flags::DotMatchesNewLine) if (m_flags & Flags::DotMatchesNewLine)
return new_node(ParsedRegex::AnyChar); return new_node(ParsedRegex::AnyChar);
else else
{ return new_node(ParsedRegex::AnyCharExceptNewLine);
CharacterClass c;
c.negative = true;
c.ranges.push_back({ '\n', '\n' });
auto class_id = m_parsed_regex.character_classes.size();
m_parsed_regex.character_classes.push_back(std::move(c));
return new_node(ParsedRegex::Class, class_id);
}
case '(': case '(':
{ {
auto captures = [this, it = (++m_pos).base()]() mutable { auto captures = [this, it = (++m_pos).base()]() mutable {
@ -611,7 +605,8 @@ private:
ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) { ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) {
auto& child = get_node(child_index); auto& child = get_node(child_index);
if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and
child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar) child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and
child.op != ParsedRegex::AnyCharExceptNewLine)
parse_error("Lookaround can only contain literals, any chars or character classes"); parse_error("Lookaround can only contain literals, any chars or character classes");
if (child.quantifier.type != ParsedRegex::Quantifier::One) if (child.quantifier.type != ParsedRegex::Quantifier::One)
parse_error("Quantifiers cannot be used in lookarounds"); parse_error("Quantifiers cannot be used in lookarounds");
@ -714,6 +709,9 @@ private:
case ParsedRegex::AnyChar: case ParsedRegex::AnyChar:
push_inst(CompiledRegex::AnyChar); push_inst(CompiledRegex::AnyChar);
break; break;
case ParsedRegex::AnyCharExceptNewLine:
push_inst(CompiledRegex::AnyCharExceptNewLine);
break;
case ParsedRegex::Class: case ParsedRegex::Class:
push_inst(CompiledRegex::Class, node.value); push_inst(CompiledRegex::Class, node.value);
break; break;
@ -887,6 +885,8 @@ private:
: character.value); : character.value);
else if (character.op == ParsedRegex::AnyChar) else if (character.op == ParsedRegex::AnyChar)
m_program.lookarounds.push_back(0xF000); m_program.lookarounds.push_back(0xF000);
else if (character.op == ParsedRegex::AnyCharExceptNewLine)
m_program.lookarounds.push_back(0xF001);
else if (character.op == ParsedRegex::Class) else if (character.op == ParsedRegex::Class)
m_program.lookarounds.push_back(0xF0001 + character.value); m_program.lookarounds.push_back(0xF0001 + character.value);
else if (character.op == ParsedRegex::CharacterType) else if (character.op == ParsedRegex::CharacterType)
@ -930,6 +930,13 @@ private:
for (auto& b : start_desc.map) for (auto& b : start_desc.map)
b = true; b = true;
return node.quantifier.allows_none(); return node.quantifier.allows_none();
case ParsedRegex::AnyCharExceptNewLine:
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
{
if (cp != '\n')
start_desc.map[cp] = true;
}
return node.quantifier.allows_none();
case ParsedRegex::Class: case ParsedRegex::Class:
{ {
auto& character_class = m_parsed_regex.character_classes[node.value]; auto& character_class = m_parsed_regex.character_classes[node.value];
@ -1044,6 +1051,9 @@ String dump_regex(const CompiledRegex& program)
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
res += "any char\n"; res += "any char\n";
break; break;
case CompiledRegex::AnyCharExceptNewLine:
res += "anything but newline\n";
break;
case CompiledRegex::Jump: case CompiledRegex::Jump:
res += format("jump {}\n", inst.param); res += format("jump {}\n", inst.param);
break; break;

View File

@ -59,6 +59,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
Literal, Literal,
Literal_IgnoreCase, Literal_IgnoreCase,
AnyChar, AnyChar,
AnyCharExceptNewLine,
Class, Class,
CharacterType, CharacterType,
Jump, Jump,
@ -322,6 +323,10 @@ private:
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
return StepResult::Consumed; return StepResult::Consumed;
case CompiledRegex::AnyCharExceptNewLine:
if (pos != config.end and *pos != '\n')
return StepResult::Consumed;
return StepResult::Failed;
case CompiledRegex::Jump: case CompiledRegex::Jump:
thread.inst = static_cast<int16_t>(inst.param); thread.inst = static_cast<int16_t>(inst.param);
break; break;
@ -529,6 +534,11 @@ private:
const Codepoint ref = *it; const Codepoint ref = *it;
if (ref == 0xF000) if (ref == 0xF000)
{} // any character matches {} // any character matches
else if (ref == 0xF001)
{
if (cp == '\n')
return false;
}
else if (ref > 0xF0000 and ref < 0xF8000) else if (ref > 0xF0000 and ref < 0xF8000)
{ {
if (not is_character_class(m_program.character_classes[ref - 0xF0001], cp)) if (not is_character_class(m_program.character_classes[ref - 0xF0001], cp))