From 3584e00d19f542aa15a74fe411a3a7ddebd89037 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Fri, 9 Feb 2018 22:35:12 +1100 Subject: [PATCH] Regex: Use a template argument instead of a regular one for "forward" forward (which controls if we are compling for forward or backward matching) is always statically known, and compilation will first compile forward, then backward (if needed), so by having separate compiled function we get rid of runtime branches. --- src/regex_impl.cc | 135 +++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 66 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index e8dfa02e..677e3c69 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -87,38 +87,46 @@ struct ParsedRegex namespace { -template -bool for_each_child(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func) +template +struct ForEachChild { - const auto end = parsed_regex.nodes[index].children_end; - for (auto child = index+1; child != end; - child = parsed_regex.nodes[child].children_end) + template + static bool apply(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func) { - if (func(child) == false) - return false; + const auto end = parsed_regex.nodes[index].children_end; + for (auto child = index+1; child != end; + child = parsed_regex.nodes[child].children_end) + { + if (func(child) == false) + return false; + } + return true; } - return true; -} +}; -template -bool for_each_child_reverse(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func) +template<> +struct ForEachChild { - auto find_last_child = [&](ParsedRegex::NodeIndex begin, ParsedRegex::NodeIndex end) { - while (parsed_regex.nodes[begin].children_end != end) - begin = parsed_regex.nodes[begin].children_end; - return begin; - }; - const auto first_child = index+1; - auto end = parsed_regex.nodes[index].children_end; - while (end != first_child) + template + static bool apply(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func) { - auto child = find_last_child(first_child, end); - if (func(child) == false) - return false; - end = child; + auto find_last_child = [&](ParsedRegex::NodeIndex begin, ParsedRegex::NodeIndex end) { + while (parsed_regex.nodes[begin].children_end != end) + begin = parsed_regex.nodes[begin].children_end; + return begin; + }; + const auto first_child = index+1; + auto end = parsed_regex.nodes[index].children_end; + while (end != first_child) + { + auto child = find_last_child(first_child, end); + if (func(child) == false) + return false; + end = child; + } + return true; } - return true; -} +}; } // Recursive descent parser based on naming used in the ECMAScript @@ -573,7 +581,7 @@ private: void validate_lookaround(NodeIndex index) { - for_each_child(m_parsed_regex, index, [this](NodeIndex child_index) { + ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) { auto& child = get_node(child_index); if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar) @@ -627,18 +635,18 @@ struct RegexCompiler if (not (flags & RegexCompileFlags::NoForward)) { - m_program.forward_start_desc = compute_start_desc(true); + m_program.forward_start_desc = compute_start_desc(); write_search_prefix(); - compile_node(0, true); + compile_node(0); push_inst(CompiledRegex::Match); } if (flags & RegexCompileFlags::Backward) { m_program.first_backward_inst = m_program.instructions.size(); - m_program.backward_start_desc = compute_start_desc(false); + m_program.backward_start_desc = compute_start_desc(); write_search_prefix(); - compile_node(0, false); + compile_node(0); push_inst(CompiledRegex::Match); } else @@ -652,7 +660,8 @@ struct RegexCompiler private: - uint32_t compile_node_inner(ParsedRegex::NodeIndex index, bool forward) + template + uint32_t compile_node_inner(ParsedRegex::NodeIndex index) { auto& node = get_node(index); @@ -661,6 +670,7 @@ private: const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and (node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs))); + constexpr bool forward = direction == MatchDirection::Forward; if (save) push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1)); @@ -684,28 +694,23 @@ private: break; case ParsedRegex::Sequence: { - if (forward) - for_each_child(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { - compile_node(child, true); return true; - }); - else - for_each_child_reverse(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { - compile_node(child, false); return true; - }); + ForEachChild::apply(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { + compile_node(child); return true; + }); break; } case ParsedRegex::Alternation: { auto split_pos = m_program.instructions.size(); - for_each_child(m_parsed_regex, index, [this, index](ParsedRegex::NodeIndex child) { + ForEachChild<>::apply(m_parsed_regex, index, [this, index](ParsedRegex::NodeIndex child) { if (child != index+1) push_inst(CompiledRegex::Split_PrioritizeParent); return true; }); - for_each_child(m_parsed_regex, index, - [&, end = node.children_end](ParsedRegex::NodeIndex child) { - auto node = compile_node(child, forward); + ForEachChild<>::apply(m_parsed_regex, index, + [&, end = node.children_end](ParsedRegex::NodeIndex child) { + auto node = compile_node(child); if (child != index+1) m_program.instructions[split_pos++].param = node; if (get_node(child).children_end != end) @@ -722,28 +727,28 @@ private: : CompiledRegex::LookAhead) : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase : CompiledRegex::LookBehind), - push_lookaround(index, false, ignore_case)); + push_lookaround(index, ignore_case)); break; case ParsedRegex::NegativeLookAhead: push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase : CompiledRegex::NegativeLookAhead) : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase : CompiledRegex::NegativeLookBehind), - push_lookaround(index, false, ignore_case)); + push_lookaround(index, ignore_case)); break; case ParsedRegex::LookBehind: push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase : CompiledRegex::LookBehind) : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase : CompiledRegex::LookAhead), - push_lookaround(index, true, ignore_case)); + push_lookaround(index, ignore_case)); break; case ParsedRegex::NegativeLookBehind: push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase : CompiledRegex::NegativeLookBehind) : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase : CompiledRegex::NegativeLookAhead), - push_lookaround(index, true, ignore_case)); + push_lookaround(index, ignore_case)); break; case ParsedRegex::LineStart: push_inst(forward ? CompiledRegex::LineStart @@ -781,7 +786,8 @@ private: return start_pos; } - uint32_t compile_node(ParsedRegex::NodeIndex index, bool forward) + template + uint32_t compile_node(ParsedRegex::NodeIndex index) { auto& node = get_node(index); @@ -799,10 +805,10 @@ private: goto_ends.push_back(split_pos); } - auto inner_pos = compile_node_inner(index, forward); + auto inner_pos = compile_node_inner(index); // Write the node multiple times when we have a min count quantifier for (int i = 1; i < quantifier.min; ++i) - inner_pos = compile_node_inner(index, forward); + inner_pos = compile_node_inner(index); if (quantifier.allows_infinite_repeat()) push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild @@ -816,7 +822,7 @@ private: auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent : CompiledRegex::Split_PrioritizeChild); goto_ends.push_back(split_pos); - compile_node_inner(index, forward); + compile_node_inner(index); } for (auto offset : goto_ends) @@ -845,7 +851,8 @@ private: return res; } - uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool reversed, bool ignore_case) + template + uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case) { uint32_t res = m_program.lookarounds.size(); auto write_matcher = [this, ignore_case](ParsedRegex::NodeIndex child) { @@ -864,10 +871,7 @@ private: return true; }; - if (reversed) - for_each_child_reverse(m_parsed_regex, index, write_matcher); - else - for_each_child(m_parsed_regex, index, write_matcher); + ForEachChild::apply(m_parsed_regex, index, write_matcher); m_program.lookarounds.push_back((Codepoint)-1); return res; @@ -876,8 +880,9 @@ private: // Fills accepted and rejected according to which chars can start the given node, // returns true if the node did not consume the char, hence a following node in // sequence would be still relevant for the parent node start chars computation. + template bool compute_start_desc(ParsedRegex::NodeIndex index, - CompiledRegex::StartDesc& start_desc, bool forward) const + CompiledRegex::StartDesc& start_desc) const { auto& node = get_node(index); switch (node.op) @@ -939,20 +944,17 @@ private: { bool did_not_consume = false; auto does_not_consume = [&, this](auto child) { - return this->compute_start_desc(child, start_desc, forward); + return this->compute_start_desc(child, start_desc); }; - if (forward) - did_not_consume = for_each_child(m_parsed_regex, index, does_not_consume); - else - did_not_consume = for_each_child_reverse(m_parsed_regex, index, does_not_consume); + did_not_consume = ForEachChild::apply(m_parsed_regex, index, does_not_consume); return did_not_consume or node.quantifier.allows_none(); } case ParsedRegex::Alternation: { bool all_consumed = not node.quantifier.allows_none(); - for_each_child(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) { - if (compute_start_desc(child, start_desc, forward)) + ForEachChild<>::apply(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) { + if (compute_start_desc(child, start_desc)) all_consumed = false; return true; }); @@ -974,11 +976,12 @@ private: return false; } + template [[gnu::noinline]] - std::unique_ptr compute_start_desc(bool forward) const + std::unique_ptr compute_start_desc() const { CompiledRegex::StartDesc start_desc{}; - if (compute_start_desc(0, start_desc, forward) or + if (compute_start_desc(0, start_desc) or not contains(start_desc.map, false)) return nullptr;