Regex: Use a template argument instead of a regular one for "forward"

forward (which controls whether we are compiling for forward or backward
matching) is always statically known, and compilation will first
compile forward, then backward (if needed), so by having separately
compiled functions we get rid of runtime branches.
This commit is contained in:
Maxime Coste 2018-02-09 22:35:12 +11:00
parent aa9f7753e8
commit 3584e00d19

View File

@ -87,38 +87,46 @@ struct ParsedRegex
namespace namespace
{ {
template<typename Func> template<MatchDirection = MatchDirection::Forward>
bool for_each_child(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func) struct ForEachChild
{ {
const auto end = parsed_regex.nodes[index].children_end; template<typename Func>
for (auto child = index+1; child != end; static bool apply(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func)
child = parsed_regex.nodes[child].children_end)
{ {
if (func(child) == false) const auto end = parsed_regex.nodes[index].children_end;
return false; for (auto child = index+1; child != end;
child = parsed_regex.nodes[child].children_end)
{
if (func(child) == false)
return false;
}
return true;
} }
return true; };
}
template<typename Func> template<>
bool for_each_child_reverse(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func) struct ForEachChild<MatchDirection::Backward>
{ {
auto find_last_child = [&](ParsedRegex::NodeIndex begin, ParsedRegex::NodeIndex end) { template<typename Func>
while (parsed_regex.nodes[begin].children_end != end) static bool apply(const ParsedRegex& parsed_regex, ParsedRegex::NodeIndex index, Func&& func)
begin = parsed_regex.nodes[begin].children_end;
return begin;
};
const auto first_child = index+1;
auto end = parsed_regex.nodes[index].children_end;
while (end != first_child)
{ {
auto child = find_last_child(first_child, end); auto find_last_child = [&](ParsedRegex::NodeIndex begin, ParsedRegex::NodeIndex end) {
if (func(child) == false) while (parsed_regex.nodes[begin].children_end != end)
return false; begin = parsed_regex.nodes[begin].children_end;
end = child; return begin;
};
const auto first_child = index+1;
auto end = parsed_regex.nodes[index].children_end;
while (end != first_child)
{
auto child = find_last_child(first_child, end);
if (func(child) == false)
return false;
end = child;
}
return true;
} }
return true; };
}
} }
// Recursive descent parser based on naming used in the ECMAScript // Recursive descent parser based on naming used in the ECMAScript
@ -573,7 +581,7 @@ private:
void validate_lookaround(NodeIndex index) void validate_lookaround(NodeIndex index)
{ {
for_each_child(m_parsed_regex, index, [this](NodeIndex child_index) { ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) {
auto& child = get_node(child_index); auto& child = get_node(child_index);
if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and
child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar) child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar)
@ -627,18 +635,18 @@ struct RegexCompiler
if (not (flags & RegexCompileFlags::NoForward)) if (not (flags & RegexCompileFlags::NoForward))
{ {
m_program.forward_start_desc = compute_start_desc(true); m_program.forward_start_desc = compute_start_desc<MatchDirection::Forward>();
write_search_prefix(); write_search_prefix();
compile_node(0, true); compile_node<MatchDirection::Forward>(0);
push_inst(CompiledRegex::Match); push_inst(CompiledRegex::Match);
} }
if (flags & RegexCompileFlags::Backward) if (flags & RegexCompileFlags::Backward)
{ {
m_program.first_backward_inst = m_program.instructions.size(); m_program.first_backward_inst = m_program.instructions.size();
m_program.backward_start_desc = compute_start_desc(false); m_program.backward_start_desc = compute_start_desc<MatchDirection::Backward>();
write_search_prefix(); write_search_prefix();
compile_node(0, false); compile_node<MatchDirection::Backward>(0);
push_inst(CompiledRegex::Match); push_inst(CompiledRegex::Match);
} }
else else
@ -652,7 +660,8 @@ struct RegexCompiler
private: private:
uint32_t compile_node_inner(ParsedRegex::NodeIndex index, bool forward) template<MatchDirection direction>
uint32_t compile_node_inner(ParsedRegex::NodeIndex index)
{ {
auto& node = get_node(index); auto& node = get_node(index);
@ -661,6 +670,7 @@ private:
const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and
(node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs))); (node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs)));
constexpr bool forward = direction == MatchDirection::Forward;
if (save) if (save)
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1)); push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1));
@ -684,28 +694,23 @@ private:
break; break;
case ParsedRegex::Sequence: case ParsedRegex::Sequence:
{ {
if (forward) ForEachChild<direction>::apply(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
for_each_child(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { compile_node<direction>(child); return true;
compile_node(child, true); return true; });
});
else
for_each_child_reverse(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
compile_node(child, false); return true;
});
break; break;
} }
case ParsedRegex::Alternation: case ParsedRegex::Alternation:
{ {
auto split_pos = m_program.instructions.size(); auto split_pos = m_program.instructions.size();
for_each_child(m_parsed_regex, index, [this, index](ParsedRegex::NodeIndex child) { ForEachChild<>::apply(m_parsed_regex, index, [this, index](ParsedRegex::NodeIndex child) {
if (child != index+1) if (child != index+1)
push_inst(CompiledRegex::Split_PrioritizeParent); push_inst(CompiledRegex::Split_PrioritizeParent);
return true; return true;
}); });
for_each_child(m_parsed_regex, index, ForEachChild<>::apply(m_parsed_regex, index,
[&, end = node.children_end](ParsedRegex::NodeIndex child) { [&, end = node.children_end](ParsedRegex::NodeIndex child) {
auto node = compile_node(child, forward); auto node = compile_node<direction>(child);
if (child != index+1) if (child != index+1)
m_program.instructions[split_pos++].param = node; m_program.instructions[split_pos++].param = node;
if (get_node(child).children_end != end) if (get_node(child).children_end != end)
@ -722,28 +727,28 @@ private:
: CompiledRegex::LookAhead) : CompiledRegex::LookAhead)
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind), : CompiledRegex::LookBehind),
push_lookaround(index, false, ignore_case)); push_lookaround<MatchDirection::Forward>(index, ignore_case));
break; break;
case ParsedRegex::NegativeLookAhead: case ParsedRegex::NegativeLookAhead:
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead) : CompiledRegex::NegativeLookAhead)
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind), : CompiledRegex::NegativeLookBehind),
push_lookaround(index, false, ignore_case)); push_lookaround<MatchDirection::Forward>(index, ignore_case));
break; break;
case ParsedRegex::LookBehind: case ParsedRegex::LookBehind:
push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind) : CompiledRegex::LookBehind)
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead), : CompiledRegex::LookAhead),
push_lookaround(index, true, ignore_case)); push_lookaround<MatchDirection::Backward>(index, ignore_case));
break; break;
case ParsedRegex::NegativeLookBehind: case ParsedRegex::NegativeLookBehind:
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind) : CompiledRegex::NegativeLookBehind)
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead), : CompiledRegex::NegativeLookAhead),
push_lookaround(index, true, ignore_case)); push_lookaround<MatchDirection::Backward>(index, ignore_case));
break; break;
case ParsedRegex::LineStart: case ParsedRegex::LineStart:
push_inst(forward ? CompiledRegex::LineStart push_inst(forward ? CompiledRegex::LineStart
@ -781,7 +786,8 @@ private:
return start_pos; return start_pos;
} }
uint32_t compile_node(ParsedRegex::NodeIndex index, bool forward) template<MatchDirection direction>
uint32_t compile_node(ParsedRegex::NodeIndex index)
{ {
auto& node = get_node(index); auto& node = get_node(index);
@ -799,10 +805,10 @@ private:
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
} }
auto inner_pos = compile_node_inner(index, forward); auto inner_pos = compile_node_inner<direction>(index);
// Write the node multiple times when we have a min count quantifier // Write the node multiple times when we have a min count quantifier
for (int i = 1; i < quantifier.min; ++i) for (int i = 1; i < quantifier.min; ++i)
inner_pos = compile_node_inner(index, forward); inner_pos = compile_node_inner<direction>(index);
if (quantifier.allows_infinite_repeat()) if (quantifier.allows_infinite_repeat())
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
@ -816,7 +822,7 @@ private:
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
: CompiledRegex::Split_PrioritizeChild); : CompiledRegex::Split_PrioritizeChild);
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
compile_node_inner(index, forward); compile_node_inner<direction>(index);
} }
for (auto offset : goto_ends) for (auto offset : goto_ends)
@ -845,7 +851,8 @@ private:
return res; return res;
} }
uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool reversed, bool ignore_case) template<MatchDirection direction>
uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case)
{ {
uint32_t res = m_program.lookarounds.size(); uint32_t res = m_program.lookarounds.size();
auto write_matcher = [this, ignore_case](ParsedRegex::NodeIndex child) { auto write_matcher = [this, ignore_case](ParsedRegex::NodeIndex child) {
@ -864,10 +871,7 @@ private:
return true; return true;
}; };
if (reversed) ForEachChild<direction>::apply(m_parsed_regex, index, write_matcher);
for_each_child_reverse(m_parsed_regex, index, write_matcher);
else
for_each_child(m_parsed_regex, index, write_matcher);
m_program.lookarounds.push_back((Codepoint)-1); m_program.lookarounds.push_back((Codepoint)-1);
return res; return res;
@ -876,8 +880,9 @@ private:
// Fills accepted and rejected according to which chars can start the given node, // Fills accepted and rejected according to which chars can start the given node,
// returns true if the node did not consume the char, hence a following node in // returns true if the node did not consume the char, hence a following node in
// sequence would be still relevant for the parent node start chars computation. // sequence would be still relevant for the parent node start chars computation.
template<MatchDirection direction>
bool compute_start_desc(ParsedRegex::NodeIndex index, bool compute_start_desc(ParsedRegex::NodeIndex index,
CompiledRegex::StartDesc& start_desc, bool forward) const CompiledRegex::StartDesc& start_desc) const
{ {
auto& node = get_node(index); auto& node = get_node(index);
switch (node.op) switch (node.op)
@ -939,20 +944,17 @@ private:
{ {
bool did_not_consume = false; bool did_not_consume = false;
auto does_not_consume = [&, this](auto child) { auto does_not_consume = [&, this](auto child) {
return this->compute_start_desc(child, start_desc, forward); return this->compute_start_desc<direction>(child, start_desc);
}; };
if (forward) did_not_consume = ForEachChild<direction>::apply(m_parsed_regex, index, does_not_consume);
did_not_consume = for_each_child(m_parsed_regex, index, does_not_consume);
else
did_not_consume = for_each_child_reverse(m_parsed_regex, index, does_not_consume);
return did_not_consume or node.quantifier.allows_none(); return did_not_consume or node.quantifier.allows_none();
} }
case ParsedRegex::Alternation: case ParsedRegex::Alternation:
{ {
bool all_consumed = not node.quantifier.allows_none(); bool all_consumed = not node.quantifier.allows_none();
for_each_child(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) { ForEachChild<>::apply(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) {
if (compute_start_desc(child, start_desc, forward)) if (compute_start_desc<direction>(child, start_desc))
all_consumed = false; all_consumed = false;
return true; return true;
}); });
@ -974,11 +976,12 @@ private:
return false; return false;
} }
template<MatchDirection direction>
[[gnu::noinline]] [[gnu::noinline]]
std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc(bool forward) const std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc() const
{ {
CompiledRegex::StartDesc start_desc{}; CompiledRegex::StartDesc start_desc{};
if (compute_start_desc(0, start_desc, forward) or if (compute_start_desc<direction>(0, start_desc) or
not contains(start_desc.map, false)) not contains(start_desc.map, false))
return nullptr; return nullptr;