Regex: compute whether codepoints outside of the start chars map can start a match
This commit is contained in:
parent
2c6c0be0c1
commit
c423b47109
|
@ -795,7 +795,7 @@ private:
|
||||||
// returns true if the node did not consume the char, hence a following node in
|
// returns true if the node did not consume the char, hence a following node in
|
||||||
// sequence would be still relevant for the parent node start chars computation.
|
// sequence would be still relevant for the parent node start chars computation.
|
||||||
bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
|
bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
|
||||||
bool (&accepted)[start_chars_count]) const
|
CompiledRegex::StartChars& start_chars) const
|
||||||
{
|
{
|
||||||
switch (node->op)
|
switch (node->op)
|
||||||
{
|
{
|
||||||
|
@ -804,27 +804,29 @@ private:
|
||||||
{
|
{
|
||||||
if (node->ignore_case)
|
if (node->ignore_case)
|
||||||
{
|
{
|
||||||
accepted[to_lower(node->value)] = true;
|
start_chars.map[to_lower(node->value)] = true;
|
||||||
accepted[to_upper(node->value)] = true;
|
start_chars.map[to_upper(node->value)] = true;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
accepted[node->value] = true;
|
start_chars.map[node->value] = true;
|
||||||
}
|
}
|
||||||
return node->quantifier.allows_none();
|
return node->quantifier.allows_none();
|
||||||
case ParsedRegex::AnyChar:
|
case ParsedRegex::AnyChar:
|
||||||
for (auto& b : accepted)
|
for (auto& b : start_chars.map)
|
||||||
b = true;
|
b = true;
|
||||||
|
start_chars.accept_other = true;
|
||||||
return node->quantifier.allows_none();
|
return node->quantifier.allows_none();
|
||||||
case ParsedRegex::Matcher:
|
case ParsedRegex::Matcher:
|
||||||
for (Codepoint c = 0; c < start_chars_count; ++c)
|
for (Codepoint c = 0; c < start_chars_count; ++c)
|
||||||
if (m_program.matchers[node->value](c))
|
if (m_program.matchers[node->value](c))
|
||||||
accepted[c] = true;
|
start_chars.map[c] = true;
|
||||||
|
start_chars.accept_other = true; // stay safe
|
||||||
return node->quantifier.allows_none();
|
return node->quantifier.allows_none();
|
||||||
case ParsedRegex::Sequence:
|
case ParsedRegex::Sequence:
|
||||||
{
|
{
|
||||||
bool consumed = false;
|
bool consumed = false;
|
||||||
auto consumes = [&, this](auto& child) {
|
auto consumes = [&, this](auto& child) {
|
||||||
return not this->compute_start_chars(child, accepted);
|
return not this->compute_start_chars(child, start_chars);
|
||||||
};
|
};
|
||||||
if (m_forward)
|
if (m_forward)
|
||||||
consumed = contains_that(node->children, consumes);
|
consumed = contains_that(node->children, consumes);
|
||||||
|
@ -838,7 +840,7 @@ private:
|
||||||
bool all_consumed = not node->quantifier.allows_none();
|
bool all_consumed = not node->quantifier.allows_none();
|
||||||
for (auto& child : node->children)
|
for (auto& child : node->children)
|
||||||
{
|
{
|
||||||
if (compute_start_chars(child, accepted))
|
if (compute_start_chars(child, start_chars))
|
||||||
all_consumed = false;
|
all_consumed = false;
|
||||||
}
|
}
|
||||||
return not all_consumed;
|
return not all_consumed;
|
||||||
|
@ -862,16 +864,14 @@ private:
|
||||||
[[gnu::noinline]]
|
[[gnu::noinline]]
|
||||||
std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
|
std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
|
||||||
{
|
{
|
||||||
bool accepted[start_chars_count] = {};
|
CompiledRegex::StartChars start_chars{};
|
||||||
if (compute_start_chars(m_parsed_regex.ast, accepted))
|
if (compute_start_chars(m_parsed_regex.ast, start_chars))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
if (not contains(accepted, false))
|
if (not contains(start_chars.map, false))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
auto start_chars = std::make_unique<CompiledRegex::StartChars>();
|
return std::make_unique<CompiledRegex::StartChars>(start_chars);
|
||||||
memcpy(start_chars->map, accepted, sizeof(bool[start_chars_count]));
|
|
||||||
return start_chars;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CompiledRegex m_program;
|
CompiledRegex m_program;
|
||||||
|
|
|
@ -73,6 +73,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
struct StartChars
|
struct StartChars
|
||||||
{
|
{
|
||||||
static constexpr size_t count = 256;
|
static constexpr size_t count = 256;
|
||||||
|
bool accept_other;
|
||||||
bool map[count];
|
bool map[count];
|
||||||
};
|
};
|
||||||
std::unique_ptr<StartChars> start_chars;
|
std::unique_ptr<StartChars> start_chars;
|
||||||
|
@ -152,7 +153,7 @@ public:
|
||||||
const bool no_saves = (flags & RegexExecFlags::NoSaves);
|
const bool no_saves = (flags & RegexExecFlags::NoSaves);
|
||||||
Utf8It start{m_begin};
|
Utf8It start{m_begin};
|
||||||
|
|
||||||
const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
|
const CompiledRegex::StartChars* start_chars = m_program.start_chars.get();
|
||||||
if (flags & RegexExecFlags::Search)
|
if (flags & RegexExecFlags::Search)
|
||||||
to_next_start(start, m_end, start_chars);
|
to_next_start(start, m_end, start_chars);
|
||||||
|
|
||||||
|
@ -422,13 +423,15 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars)
|
void to_next_start(Utf8It& start, const Utf8It& end,
|
||||||
|
const CompiledRegex::StartChars* start_chars)
|
||||||
{
|
{
|
||||||
if (not start_chars)
|
if (not start_chars)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
while (start != end and *start >= 0 and *start < 256 and
|
while (start != end and *start >= 0 and
|
||||||
not start_chars[*start])
|
((*start < 256 and not start_chars->map[*start]) or
|
||||||
|
(*start >= 256 and not start_chars->accept_other)))
|
||||||
++start;
|
++start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user