diff --git a/src/regex_impl.cc b/src/regex_impl.cc index b1630a1f..266f920e 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -886,8 +886,8 @@ private: } // Mutate start_desc with informations on which Codepoint could start a match. - // Returns true if the node possibly does not consume the char, in which case - // the next node would still be relevant for the parent node start chars computation. + // Returns true if the subsequent nodes are still relevant for computing the + // start desc template bool compute_start_desc(ParsedRegex::NodeIndex index, CompiledRegex::StartDesc& start_desc) const @@ -916,10 +916,20 @@ private: add_multi_byte_utf8(); return node.quantifier.allows_none(); case ParsedRegex::AnyChar: + if (start_desc.offset + node.quantifier.max <= CompiledRegex::StartDesc::OffsetLimits::max()) + { + start_desc.offset += node.quantifier.max; + return true; + } for (auto& b : start_desc.map) b = true; return node.quantifier.allows_none(); case ParsedRegex::AnyCharExceptNewLine: + if (start_desc.offset + node.quantifier.max <= CompiledRegex::StartDesc::OffsetLimits::max()) + { + start_desc.offset += node.quantifier.max; + return true; + } for (Codepoint cp = 0; cp < single_byte_limit; ++cp) { if (cp != '\n') @@ -1138,7 +1148,7 @@ String dump_regex(const CompiledRegex& program) res += (char)c; } } - res += "]\n"; + res += format("]+{}\n", static_cast(desc.offset)); }; if (program.forward_start_desc) dump_start_desc(*program.forward_start_desc, "forward"); @@ -1556,6 +1566,17 @@ auto test_regex = UnitTest{[]{ kak_assert(vm.exec("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", RegexExecFlags::None)); } + { + TestVM vm{"(.{3,4}|f)oo"}; + kak_assert(vm.forward_start_desc and vm.forward_start_desc->offset == 4); + for (int c = 0; c < CompiledRegex::StartDesc::count; ++c) + kak_assert(vm.forward_start_desc->map[c] == (c == 'f' or c == 'o')); + + kak_assert(vm.exec("xxxoo", RegexExecFlags::None)); + kak_assert(vm.exec("xfoo", RegexExecFlags::None)); + kak_assert(not vm.exec("😄xoo", RegexExecFlags::None)); + } + { auto eq = [](const CompiledRegex::NamedCapture& lhs, const CompiledRegex::NamedCapture& rhs) { diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 80d41dbf..8f11f1af 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -153,6 +153,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain struct StartDesc : UseMemoryDomain { static constexpr Codepoint count = 256; + using OffsetLimits = std::numeric_limits; + uint8_t offset = 0; bool map[count]; }; @@ -531,15 +533,16 @@ private: } } - static Iterator find_next_start(Iterator pos, const ExecConfig& config, const StartDesc& start_desc) + static Iterator find_next_start(Iterator start, const ExecConfig& config, const StartDesc& start_desc) { + auto pos = start; while (pos != config.end) { static_assert(StartDesc::count <= 256, "start desc should be ascii only"); if constexpr (forward) { if (start_desc.map[static_cast(*pos)]) - return pos; + return utf8::advance(pos, start, -CharCount(start_desc.offset)); ++pos; } else