From c423b47109d7670e1bdd56617b2e9c5370ec712f Mon Sep 17 00:00:00 2001
From: Maxime Coste
Date: Fri, 20 Oct 2017 11:49:19 +0800
Subject: [PATCH] Regex: compute if codepoints outside of the start chars map can start

---
 src/regex_impl.cc | 28 ++++++++++++++--------------
 src/regex_impl.hh | 11 +++++++----
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 232bb865..e5828a8c 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -795,7 +795,7 @@ private:
     // returns true if the node did not consume the char, hence a following node in
     // sequence would be still relevant for the parent node start chars computation.
     bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
-                             bool (&accepted)[start_chars_count]) const
+                             CompiledRegex::StartChars& start_chars) const
     {
         switch (node->op)
         {
@@ -804,27 +804,29 @@ private:
                 {
                     if (node->ignore_case)
                     {
-                        accepted[to_lower(node->value)] = true;
-                        accepted[to_upper(node->value)] = true;
+                        start_chars.map[to_lower(node->value)] = true;
+                        start_chars.map[to_upper(node->value)] = true;
                     }
                     else
-                        accepted[node->value] = true;
+                        start_chars.map[node->value] = true;
                 }
                 return node->quantifier.allows_none();
             case ParsedRegex::AnyChar:
-                for (auto& b : accepted)
+                for (auto& b : start_chars.map)
                     b = true;
+                start_chars.accept_other = true;
                 return node->quantifier.allows_none();
             case ParsedRegex::Matcher:
                 for (Codepoint c = 0; c < start_chars_count; ++c)
                     if (m_program.matchers[node->value](c))
-                        accepted[c] = true;
+                        start_chars.map[c] = true;
+                start_chars.accept_other = true; // stay safe
                 return node->quantifier.allows_none();
             case ParsedRegex::Sequence:
             {
                 bool consumed = false;
                 auto consumes = [&, this](auto& child) {
-                    return not this->compute_start_chars(child, accepted);
+                    return not this->compute_start_chars(child, start_chars);
                 };
                 if (m_forward)
                     consumed = contains_that(node->children, consumes);
@@ -838,7 +840,7 @@ private:
                 bool all_consumed = not node->quantifier.allows_none();
                 for (auto& child : node->children)
                 {
-                    if (compute_start_chars(child, accepted))
+                    if (compute_start_chars(child, start_chars))
                         all_consumed = false;
                 }
                 return not all_consumed;
@@ -862,16 +864,14 @@ private:
     [[gnu::noinline]]
     std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
     {
-        bool accepted[start_chars_count] = {};
-        if (compute_start_chars(m_parsed_regex.ast, accepted))
+        CompiledRegex::StartChars start_chars{};
+        if (compute_start_chars(m_parsed_regex.ast, start_chars))
             return nullptr;
 
-        if (not contains(accepted, false))
+        if (not contains(start_chars.map, false))
             return nullptr;
 
-        auto start_chars = std::make_unique<CompiledRegex::StartChars>();
-        memcpy(start_chars->map, accepted, sizeof(bool[start_chars_count]));
-        return start_chars;
+        return std::make_unique<CompiledRegex::StartChars>(start_chars);
     }
 
     CompiledRegex m_program;
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 664fb4f4..108fe626 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -73,6 +73,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain
     struct StartChars
     {
         static constexpr size_t count = 256;
+        bool accept_other;
         bool map[count];
     };
     std::unique_ptr<StartChars> start_chars;
@@ -152,7 +153,7 @@ public:
         const bool no_saves = (flags & RegexExecFlags::NoSaves);
         Utf8It start{m_begin};
 
-        const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
+        const CompiledRegex::StartChars* start_chars = m_program.start_chars.get();
 
         if (flags & RegexExecFlags::Search)
             to_next_start(start, m_end, start_chars);
@@ -422,13 +423,15 @@ private:
         }
     }
 
-    void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars)
+    void to_next_start(Utf8It& start, const Utf8It& end,
+                       const CompiledRegex::StartChars* start_chars)
     {
         if (not start_chars)
             return;
 
-        while (start != end and *start >= 0 and *start < 256 and
-               not start_chars[*start])
+        while (start != end and *start >= 0 and
+               ((*start < 256 and not start_chars->map[*start]) or
+                (*start >= 256 and not start_chars->accept_other)))
             ++start;
     }
 