diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 09c8cc02..2e3b9ccf 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -474,8 +474,13 @@ private:
     void validate_lookaround(const AstNodePtr& node)
     {
         for (auto& child : node->children)
-            if (child->op != ParsedRegex::Literal)
-                parse_error("Lookaround can only contain literals");
+        {
+            if (child->op != ParsedRegex::Literal and child->op != ParsedRegex::Matcher and
+                child->op != ParsedRegex::AnyChar)
+                parse_error("Lookaround can only contain literals, any chars or character classes");
+            if (child->quantifier.type != ParsedRegex::Quantifier::One)
+                parse_error("Quantifiers cannot be used in lookarounds");
+        }
     }
 
     ParsedRegex m_parsed_regex;
@@ -679,15 +684,29 @@ private:
         return res;
     }
 
-    uint32_t push_lookaround(const Vector<AstNodePtr>& literals, bool reversed = false)
+    uint32_t push_lookaround(const Vector<AstNodePtr>& characters, bool reversed = false)
     {
         uint32_t res = m_program.lookarounds.size();
+        // Entries are encoded as: a literal's codepoint as-is, 0xF000 for any
+        // char, and 0xF0001 + matcher index for a character class.
+        auto write_lookaround = [this](auto&& characters) {
+            for (auto& character : characters)
+            {
+                if (character->op == ParsedRegex::Literal)
+                    m_program.lookarounds.push_back(character->value);
+                else if (character->op == ParsedRegex::AnyChar)
+                    m_program.lookarounds.push_back(0xF000);
+                else if (character->op == ParsedRegex::Matcher)
+                    m_program.lookarounds.push_back(0xF0001 + character->value);
+                else
+                    kak_assert(false);
+            }
+        };
+
         if (reversed)
-            for (auto& literal : literals | reverse())
-                m_program.lookarounds.push_back(literal->value);
+            write_lookaround(characters | reverse());
         else
-            for (auto& literal : literals)
-                m_program.lookarounds.push_back(literal->value);
+            write_lookaround(characters);
 
         m_program.lookarounds.push_back((Codepoint)-1);
         return res;
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 3addb2b2..cb5a53be 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -294,28 +294,14 @@ private:
                     break;
                 case CompiledRegex::LookAhead:
                 case CompiledRegex::NegativeLookAhead:
-                {
-                    auto ref = m_program.lookarounds.begin() + inst.param;
-                    for (auto it = pos; *ref != -1 and it != m_end; ++it, ++ref)
-                        if (*it != *ref)
-                            break;
-                    if ((inst.op == CompiledRegex::LookAhead and *ref != -1) or
-                        (inst.op == CompiledRegex::NegativeLookAhead and *ref == -1))
+                    if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead))
                         return StepResult::Failed;
                     break;
-                }
                 case CompiledRegex::LookBehind:
                 case CompiledRegex::NegativeLookBehind:
-                {
-                    auto ref = m_program.lookarounds.begin() + inst.param;
-                    for (auto it = pos; *ref != -1 and it > m_begin; --it, ++ref)
-                        if (*(it-1) != *ref)
-                            break;
-                    if ((inst.op == CompiledRegex::LookBehind and *ref != -1) or
-                        (inst.op == CompiledRegex::NegativeLookBehind and *ref == -1))
+                    if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind))
                         return StepResult::Failed;
                     break;
-                }
                 case CompiledRegex::Match:
                     return StepResult::Matched;
             }
@@ -392,6 +378,38 @@ private:
             ++start;
     }
 
+    // Check whether the lookaround sequence stored at `index` in
+    // m_program.lookarounds matches the subject around `pos`, reading
+    // towards m_end when look_direction is Forward and towards m_begin
+    // otherwise. Returns false if the subject runs out before the
+    // (Codepoint)-1 terminator is reached.
+    // NOTE(review): literal codepoints equal to 0xF000 or inside
+    // 0xF0001..0xFFFFD are indistinguishable from these markers.
+    template<MatchDirection look_direction>
+    bool lookaround(uint32_t index, Utf8It pos) const
+    {
+        for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
+        {
+            if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
+                return false;
+            auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
+            if (ref == 0xF000)
+            {} // any character matches
+            else if (ref > 0xF0000 and ref <= 0xFFFFD)
+            {
+                // Character class entry: the matcher alone decides, it must
+                // not fall through to the literal comparison below.
+                if (not m_program.matchers[ref - 0xF0001](cp))
+                    return false;
+            }
+            else if (ref != cp)
+                return false;
+
+            (look_direction == MatchDirection::Forward) ? ++pos : --pos;
+        }
+        return true;
+    }
+
     bool is_line_start(const Utf8It& pos) const
     {
         if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)