diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 605e9d36..1f8b492d 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -23,6 +23,7 @@ struct ParsedRegex { Literal, AnyChar, + AnyCharExceptNewLine, Class, CharacterType, Sequence, @@ -296,14 +297,7 @@ private: if (m_flags & Flags::DotMatchesNewLine) return new_node(ParsedRegex::AnyChar); else - { - CharacterClass c; - c.negative = true; - c.ranges.push_back({ '\n', '\n' }); - auto class_id = m_parsed_regex.character_classes.size(); - m_parsed_regex.character_classes.push_back(std::move(c)); - return new_node(ParsedRegex::Class, class_id); - } + return new_node(ParsedRegex::AnyCharExceptNewLine); case '(': { auto captures = [this, it = (++m_pos).base()]() mutable { @@ -611,7 +605,8 @@ private: ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) { auto& child = get_node(child_index); if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and - child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar) + child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and + child.op != ParsedRegex::AnyCharExceptNewLine) parse_error("Lookaround can only contain literals, any chars or character classes"); if (child.quantifier.type != ParsedRegex::Quantifier::One) parse_error("Quantifiers cannot be used in lookarounds"); @@ -714,6 +709,9 @@ private: case ParsedRegex::AnyChar: push_inst(CompiledRegex::AnyChar); break; + case ParsedRegex::AnyCharExceptNewLine: + push_inst(CompiledRegex::AnyCharExceptNewLine); + break; case ParsedRegex::Class: push_inst(CompiledRegex::Class, node.value); break; @@ -887,6 +885,8 @@ private: : character.value); else if (character.op == ParsedRegex::AnyChar) m_program.lookarounds.push_back(0xF000); + else if (character.op == ParsedRegex::AnyCharExceptNewLine) + m_program.lookarounds.push_back(0xF001); else if (character.op == ParsedRegex::Class) m_program.lookarounds.push_back(0xF0001 + character.value); else if (character.op == ParsedRegex::CharacterType) @@ -930,6 +930,13 @@ private: for (auto& b : start_desc.map) b = true; return node.quantifier.allows_none(); + case ParsedRegex::AnyCharExceptNewLine: + for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp) + { + if (cp != '\n') + start_desc.map[cp] = true; + } + return node.quantifier.allows_none(); case ParsedRegex::Class: { auto& character_class = m_parsed_regex.character_classes[node.value]; @@ -1044,6 +1051,9 @@ String dump_regex(const CompiledRegex& program) case CompiledRegex::AnyChar: res += "any char\n"; break; + case CompiledRegex::AnyCharExceptNewLine: + res += "anything but newline\n"; + break; case CompiledRegex::Jump: res += format("jump {}\n", inst.param); break; diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 14d5bbfd..155eb7ac 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -59,6 +59,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain Literal, Literal_IgnoreCase, AnyChar, + AnyCharExceptNewLine, Class, CharacterType, Jump, @@ -322,6 +323,10 @@ private: return StepResult::Failed; case CompiledRegex::AnyChar: return StepResult::Consumed; + case CompiledRegex::AnyCharExceptNewLine: + if (pos != config.end and *pos != '\n') + return StepResult::Consumed; + return StepResult::Failed; case CompiledRegex::Jump: thread.inst = static_cast(inst.param); break; @@ -529,6 +534,11 @@ private: const Codepoint ref = *it; if (ref == 0xF000) {} // any character matches + else if (ref == 0xF001) + { + if (cp == '\n') + return false; + } else if (ref > 0xF0000 and ref < 0xF8000) { if (not is_character_class(m_program.character_classes[ref - 0xF0001], cp))