Regex: compute whether codepoints outside of the start chars map can start a match

This commit is contained in:
Maxime Coste 2017-10-20 11:49:19 +08:00
parent 2c6c0be0c1
commit c423b47109
2 changed files with 21 additions and 18 deletions

View File

@ -795,7 +795,7 @@ private:
// returns true if the node did not consume the char, hence a following node in // returns true if the node did not consume the char, hence a following node in
// sequence would be still relevant for the parent node start chars computation. // sequence would be still relevant for the parent node start chars computation.
bool compute_start_chars(const ParsedRegex::AstNodePtr& node, bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
bool (&accepted)[start_chars_count]) const CompiledRegex::StartChars& start_chars) const
{ {
switch (node->op) switch (node->op)
{ {
@ -804,27 +804,29 @@ private:
{ {
if (node->ignore_case) if (node->ignore_case)
{ {
accepted[to_lower(node->value)] = true; start_chars.map[to_lower(node->value)] = true;
accepted[to_upper(node->value)] = true; start_chars.map[to_upper(node->value)] = true;
} }
else else
accepted[node->value] = true; start_chars.map[node->value] = true;
} }
return node->quantifier.allows_none(); return node->quantifier.allows_none();
case ParsedRegex::AnyChar: case ParsedRegex::AnyChar:
for (auto& b : accepted) for (auto& b : start_chars.map)
b = true; b = true;
start_chars.accept_other = true;
return node->quantifier.allows_none(); return node->quantifier.allows_none();
case ParsedRegex::Matcher: case ParsedRegex::Matcher:
for (Codepoint c = 0; c < start_chars_count; ++c) for (Codepoint c = 0; c < start_chars_count; ++c)
if (m_program.matchers[node->value](c)) if (m_program.matchers[node->value](c))
accepted[c] = true; start_chars.map[c] = true;
start_chars.accept_other = true; // stay safe
return node->quantifier.allows_none(); return node->quantifier.allows_none();
case ParsedRegex::Sequence: case ParsedRegex::Sequence:
{ {
bool consumed = false; bool consumed = false;
auto consumes = [&, this](auto& child) { auto consumes = [&, this](auto& child) {
return not this->compute_start_chars(child, accepted); return not this->compute_start_chars(child, start_chars);
}; };
if (m_forward) if (m_forward)
consumed = contains_that(node->children, consumes); consumed = contains_that(node->children, consumes);
@ -838,7 +840,7 @@ private:
bool all_consumed = not node->quantifier.allows_none(); bool all_consumed = not node->quantifier.allows_none();
for (auto& child : node->children) for (auto& child : node->children)
{ {
if (compute_start_chars(child, accepted)) if (compute_start_chars(child, start_chars))
all_consumed = false; all_consumed = false;
} }
return not all_consumed; return not all_consumed;
@ -862,16 +864,14 @@ private:
// Builds the start-chars table for the whole parsed regex (m_parsed_regex.ast).
// Side-by-side diff: left column is the pre-commit text, right column the post-commit text.
// Returns nullptr when the recursive overload returns true for the root node
// (i.e. the root did not necessarily consume a char), or when no entry of the
// 256-entry table is false. Otherwise returns a heap-allocated table.
// Pre-commit: fills a local bool array and memcpy's it into a fresh StartChars.
// Post-commit: fills a value-initialized StartChars directly (so accept_other
// starts out false) and returns a copy of it.
[[gnu::noinline]] [[gnu::noinline]]
std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
{ {
bool accepted[start_chars_count] = {}; CompiledRegex::StartChars start_chars{};
if (compute_start_chars(m_parsed_regex.ast, accepted)) if (compute_start_chars(m_parsed_regex.ast, start_chars))
return nullptr; return nullptr;
// Only the map is inspected here; accept_other is not part of this check.
if (not contains(accepted, false)) if (not contains(start_chars.map, false))
return nullptr; return nullptr;
auto start_chars = std::make_unique<CompiledRegex::StartChars>(); return std::make_unique<CompiledRegex::StartChars>(start_chars);
memcpy(start_chars->map, accepted, sizeof(bool[start_chars_count]));
return start_chars;
} }
CompiledRegex m_program; CompiledRegex m_program;

View File

@ -73,6 +73,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
// Table read by to_next_start to decide which codepoints get skipped.
// Side-by-side diff: left column is the pre-commit text, right column the post-commit text.
struct StartChars struct StartChars
{ {
// Number of entries in map.
static constexpr size_t count = 256; static constexpr size_t count = 256;
// Post-commit only: when false, to_next_start also skips codepoints >= 256.
bool accept_other;
// A false entry at index c (c < 256) makes to_next_start skip codepoint c.
bool map[count]; bool map[count];
}; };
std::unique_ptr<StartChars> start_chars; std::unique_ptr<StartChars> start_chars;
@ -152,7 +153,7 @@ public:
const bool no_saves = (flags & RegexExecFlags::NoSaves); const bool no_saves = (flags & RegexExecFlags::NoSaves);
Utf8It start{m_begin}; Utf8It start{m_begin};
const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr; const CompiledRegex::StartChars* start_chars = m_program.start_chars.get();
if (flags & RegexExecFlags::Search) if (flags & RegexExecFlags::Search)
to_next_start(start, m_end, start_chars); to_next_start(start, m_end, start_chars);
@ -422,13 +423,15 @@ private:
} }
} }
// Advances `start` past codepoints rejected by the start-chars table.
// Side-by-side diff: left column is the pre-commit text, right column the post-commit text.
// Does nothing when start_chars is null. Otherwise the loop runs while
// start != end, *start is non-negative, and the current codepoint is rejected:
//  - pre-commit: only codepoints below 256 whose table entry is false are skipped;
//  - post-commit: codepoints >= 256 are additionally skipped when accept_other is false.
void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars) void to_next_start(Utf8It& start, const Utf8It& end,
const CompiledRegex::StartChars* start_chars)
{ {
if (not start_chars) if (not start_chars)
return; return;
while (start != end and *start >= 0 and *start < 256 and while (start != end and *start >= 0 and
not start_chars[*start]) ((*start < 256 and not start_chars->map[*start]) or
(*start >= 256 and not start_chars->accept_other)))
++start; ++start;
} }