Regex: fix lookarounds handling when computing starting chars
This commit is contained in:
parent
1c95074657
commit
df73b71dfc
|
@ -711,16 +711,19 @@ private:
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static constexpr size_t start_chars_count = CompiledRegex::StartChars::count;
|
||||||
|
|
||||||
// Fills accepted and rejected according to which chars can start the given node,
|
// Fills accepted and rejected according to which chars can start the given node,
|
||||||
// returns true if the node did not consume the char, hence a following node in
|
// returns true if the node did not consume the char, hence a following node in
|
||||||
// sequence would be still relevant for the parent node start chars computation.
|
// sequence would be still relevant for the parent node start chars computation.
|
||||||
bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
|
bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
|
||||||
bool (&accepted)[256], bool (&rejected)[256]) const
|
bool (&accepted)[start_chars_count],
|
||||||
|
bool (&rejected)[start_chars_count]) const
|
||||||
{
|
{
|
||||||
switch (node->op)
|
switch (node->op)
|
||||||
{
|
{
|
||||||
case ParsedRegex::Literal:
|
case ParsedRegex::Literal:
|
||||||
if (node->value < 256)
|
if (node->value < start_chars_count)
|
||||||
accepted[node->value] = true;
|
accepted[node->value] = true;
|
||||||
return node->quantifier.allows_none();
|
return node->quantifier.allows_none();
|
||||||
case ParsedRegex::AnyChar:
|
case ParsedRegex::AnyChar:
|
||||||
|
@ -763,18 +766,28 @@ private:
|
||||||
case ParsedRegex::ResetStart:
|
case ParsedRegex::ResetStart:
|
||||||
return true;
|
return true;
|
||||||
case ParsedRegex::LookAhead:
|
case ParsedRegex::LookAhead:
|
||||||
if (not node->children.empty())
|
case ParsedRegex::LookBehind:
|
||||||
compute_start_chars(m_forward ? node->children.front() : node->children.back(),
|
if (not node->children.empty() and
|
||||||
accepted, rejected);
|
m_forward == (node->op == ParsedRegex::LookAhead))
|
||||||
|
{
|
||||||
|
auto& child = m_forward ? node->children.front() : node->children.back();
|
||||||
|
if (child->op == ParsedRegex::Literal and child->value < start_chars_count)
|
||||||
|
{
|
||||||
|
// Any other char is rejected
|
||||||
|
std::fill(rejected, rejected + child->value, true);
|
||||||
|
std::fill(rejected + child->value + 1, rejected + start_chars_count, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
case ParsedRegex::NegativeLookAhead:
|
case ParsedRegex::NegativeLookAhead:
|
||||||
if (not node->children.empty())
|
|
||||||
compute_start_chars(m_forward ? node->children.front() : node->children.back(),
|
|
||||||
rejected, accepted);
|
|
||||||
return true;
|
|
||||||
case ParsedRegex::LookBehind:
|
|
||||||
return true;
|
|
||||||
case ParsedRegex::NegativeLookBehind:
|
case ParsedRegex::NegativeLookBehind:
|
||||||
|
if (node->children.size() == 1 and
|
||||||
|
m_forward == (node->op == ParsedRegex::NegativeLookAhead))
|
||||||
|
{
|
||||||
|
auto& child = node->children.front();
|
||||||
|
if (child->op == ParsedRegex::Literal and child->value < start_chars_count)
|
||||||
|
rejected[child->value] = true;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -782,8 +795,8 @@ private:
|
||||||
|
|
||||||
std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
|
std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
|
||||||
{
|
{
|
||||||
bool accepted[256] = {};
|
bool accepted[start_chars_count] = {};
|
||||||
bool rejected[256] = {};
|
bool rejected[start_chars_count] = {};
|
||||||
if (compute_start_chars(m_parsed_regex.ast, accepted, rejected))
|
if (compute_start_chars(m_parsed_regex.ast, accepted, rejected))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
|
@ -791,7 +804,7 @@ private:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
auto start_chars = std::make_unique<CompiledRegex::StartChars>();
|
auto start_chars = std::make_unique<CompiledRegex::StartChars>();
|
||||||
for (int i = 0; i < 256; ++i)
|
for (int i = 0; i < start_chars_count; ++i)
|
||||||
start_chars->map[i] = accepted[i] and not rejected[i];
|
start_chars->map[i] = accepted[i] and not rejected[i];
|
||||||
|
|
||||||
return start_chars;
|
return start_chars;
|
||||||
|
@ -1133,6 +1146,12 @@ auto test_regex = UnitTest{[]{
|
||||||
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "ber");
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "ber");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<MatchDirection::Backward> vm{R"((baz|boz|foo|qux)(?<!baz)(?<!o))"};
|
||||||
|
kak_assert(vm.exec("quxbozfoobaz", RegexExecFlags::Search));
|
||||||
|
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "boz");
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM<> vm{R"(()*)"};
|
TestVM<> vm{R"(()*)"};
|
||||||
kak_assert(not vm.exec(" "));
|
kak_assert(not vm.exec(" "));
|
||||||
|
|
|
@ -67,7 +67,11 @@ struct CompiledRegex : RefCountable
|
||||||
MatchDirection direction;
|
MatchDirection direction;
|
||||||
size_t save_count;
|
size_t save_count;
|
||||||
|
|
||||||
struct StartChars { bool map[256]; };
|
struct StartChars
|
||||||
|
{
|
||||||
|
static constexpr size_t count = 256;
|
||||||
|
bool map[count];
|
||||||
|
};
|
||||||
std::unique_ptr<StartChars> start_chars;
|
std::unique_ptr<StartChars> start_chars;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user