Compute StartDesc with an offset to effective start
This means `.{2,4}foo` will now only consider positions at most 4 characters before an `f` as start candidates, instead of every character.
This commit is contained in:
parent
ee364d911f
commit
ca7471c25d
|
@ -886,8 +886,8 @@ private:
|
|||
}
|
||||
|
||||
// Mutate start_desc with information on which Codepoints could start a match.
|
||||
// Returns true if the node possibly does not consume the char, in which case
|
||||
// the next node would still be relevant for the parent node start chars computation.
|
||||
// Returns true if the subsequent nodes are still relevant for computing the
|
||||
// start desc
|
||||
template<RegexMode direction>
|
||||
bool compute_start_desc(ParsedRegex::NodeIndex index,
|
||||
CompiledRegex::StartDesc& start_desc) const
|
||||
|
@ -916,10 +916,20 @@ private:
|
|||
add_multi_byte_utf8();
|
||||
return node.quantifier.allows_none();
|
||||
case ParsedRegex::AnyChar:
|
||||
if (start_desc.offset + node.quantifier.max <= CompiledRegex::StartDesc::OffsetLimits::max())
|
||||
{
|
||||
start_desc.offset += node.quantifier.max;
|
||||
return true;
|
||||
}
|
||||
for (auto& b : start_desc.map)
|
||||
b = true;
|
||||
return node.quantifier.allows_none();
|
||||
case ParsedRegex::AnyCharExceptNewLine:
|
||||
if (start_desc.offset + node.quantifier.max <= CompiledRegex::StartDesc::OffsetLimits::max())
|
||||
{
|
||||
start_desc.offset += node.quantifier.max;
|
||||
return true;
|
||||
}
|
||||
for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
|
||||
{
|
||||
if (cp != '\n')
|
||||
|
@ -1138,7 +1148,7 @@ String dump_regex(const CompiledRegex& program)
|
|||
res += (char)c;
|
||||
}
|
||||
}
|
||||
res += "]\n";
|
||||
res += format("]+{}\n", static_cast<int>(desc.offset));
|
||||
};
|
||||
if (program.forward_start_desc)
|
||||
dump_start_desc(*program.forward_start_desc, "forward");
|
||||
|
@ -1556,6 +1566,17 @@ auto test_regex = UnitTest{[]{
|
|||
kak_assert(vm.exec("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", RegexExecFlags::None));
|
||||
}
|
||||
|
||||
{
|
||||
TestVM<RegexMode::Forward | RegexMode::Search> vm{"(.{3,4}|f)oo"};
|
||||
kak_assert(vm.forward_start_desc and vm.forward_start_desc->offset == 4);
|
||||
for (int c = 0; c < CompiledRegex::StartDesc::count; ++c)
|
||||
kak_assert(vm.forward_start_desc->map[c] == (c == 'f' or c == 'o'));
|
||||
|
||||
kak_assert(vm.exec("xxxoo", RegexExecFlags::None));
|
||||
kak_assert(vm.exec("xfoo", RegexExecFlags::None));
|
||||
kak_assert(not vm.exec("😄xoo", RegexExecFlags::None));
|
||||
}
|
||||
|
||||
{
|
||||
auto eq = [](const CompiledRegex::NamedCapture& lhs,
|
||||
const CompiledRegex::NamedCapture& rhs) {
|
||||
|
|
|
@ -153,6 +153,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
|||
// Describes where a match may begin: `map` flags the candidate bytes and
// `offset` is how many characters back from such a byte the actual match
// start may lie (find_next_start steps back by `offset` from the byte it finds).
struct StartDesc : UseMemoryDomain<MemoryDomain::Regex>
|
||||
{
|
||||
static constexpr Codepoint count = 256; // number of entries in `map`; find_next_start indexes it with a single byte
|
||||
using OffsetLimits = std::numeric_limits<uint8_t>; // limits of the type storing `offset`; used to check that growing `offset` still fits
|
||||
uint8_t offset = 0; // characters to step back from a flagged byte to reach the candidate start
|
||||
bool map[count]; // map[b] is true when byte value b is a start candidate
|
||||
};
|
||||
|
||||
|
@ -531,15 +533,16 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
static Iterator find_next_start(Iterator pos, const ExecConfig& config, const StartDesc& start_desc)
|
||||
static Iterator find_next_start(Iterator start, const ExecConfig& config, const StartDesc& start_desc)
|
||||
{
|
||||
auto pos = start;
|
||||
while (pos != config.end)
|
||||
{
|
||||
static_assert(StartDesc::count <= 256, "start desc should be ascii only");
|
||||
if constexpr (forward)
|
||||
{
|
||||
if (start_desc.map[static_cast<unsigned char>(*pos)])
|
||||
return pos;
|
||||
return utf8::advance(pos, start, -CharCount(start_desc.offset));
|
||||
++pos;
|
||||
}
|
||||
else
|
||||
|
|
Loading…
Reference in New Issue
Block a user