Compute StartDesc with an offset to effective start

This means `.{2,4}foo` will now only consider positions 4 characters or
fewer before an `f` as start candidates, instead of every character.
This commit is contained in:
Maxime Coste 2024-03-18 22:25:21 +11:00
parent ee364d911f
commit ca7471c25d
2 changed files with 29 additions and 5 deletions

View File

@ -886,8 +886,8 @@ private:
}
// Mutate start_desc with information on which Codepoint could start a match.
// Returns true if the node possibly does not consume the char, in which case
// the next node would still be relevant for the parent node start chars computation.
// Returns true if the subsequent nodes are still relevant for computing the
// start desc
template<RegexMode direction>
bool compute_start_desc(ParsedRegex::NodeIndex index,
CompiledRegex::StartDesc& start_desc) const
@ -916,10 +916,20 @@ private:
add_multi_byte_utf8();
return node.quantifier.allows_none();
case ParsedRegex::AnyChar:
if (start_desc.offset + node.quantifier.max <= CompiledRegex::StartDesc::OffsetLimits::max())
{
start_desc.offset += node.quantifier.max;
return true;
}
for (auto& b : start_desc.map)
b = true;
return node.quantifier.allows_none();
case ParsedRegex::AnyCharExceptNewLine:
if (start_desc.offset + node.quantifier.max <= CompiledRegex::StartDesc::OffsetLimits::max())
{
start_desc.offset += node.quantifier.max;
return true;
}
for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{
if (cp != '\n')
@ -1138,7 +1148,7 @@ String dump_regex(const CompiledRegex& program)
res += (char)c;
}
}
res += "]\n";
res += format("]+{}\n", static_cast<int>(desc.offset));
};
if (program.forward_start_desc)
dump_start_desc(*program.forward_start_desc, "forward");
@ -1556,6 +1566,17 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", RegexExecFlags::None));
}
{
TestVM<RegexMode::Forward | RegexMode::Search> vm{"(.{3,4}|f)oo"};
kak_assert(vm.forward_start_desc and vm.forward_start_desc->offset == 4);
for (int c = 0; c < CompiledRegex::StartDesc::count; ++c)
kak_assert(vm.forward_start_desc->map[c] == (c == 'f' or c == 'o'));
kak_assert(vm.exec("xxxoo", RegexExecFlags::None));
kak_assert(vm.exec("xfoo", RegexExecFlags::None));
kak_assert(not vm.exec("😄xoo", RegexExecFlags::None));
}
{
auto eq = [](const CompiledRegex::NamedCapture& lhs,
const CompiledRegex::NamedCapture& rhs) {

View File

@ -153,6 +153,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
// Describes which input bytes can lead to a match, so that a search can skip
// over positions that cannot start one.
struct StartDesc : UseMemoryDomain<MemoryDomain::Regex>
{
// Number of entries in `map`; lookups index it with a single byte value.
static constexpr Codepoint count = 256;
// Range of values `offset` can hold, used to check that growing the offset
// still fits before doing so.
using OffsetLimits = std::numeric_limits<uint8_t>;
// Number of characters to step back from a byte flagged in `map` to reach
// the candidate start position (the forward search clamps this to where the
// search began). Zero leaves the candidate on the flagged byte itself.
uint8_t offset = 0;
// map[b] is true when byte value b marks a possible start candidate.
bool map[count];
};
@ -531,15 +533,16 @@ private:
}
}
static Iterator find_next_start(Iterator pos, const ExecConfig& config, const StartDesc& start_desc)
static Iterator find_next_start(Iterator start, const ExecConfig& config, const StartDesc& start_desc)
{
auto pos = start;
while (pos != config.end)
{
static_assert(StartDesc::count <= 256, "start desc should be ascii only");
if constexpr (forward)
{
if (start_desc.map[static_cast<unsigned char>(*pos)])
return pos;
return utf8::advance(pos, start, -CharCount(start_desc.offset));
++pos;
}
else