Reduce the amount of Regex VM Instruction code
Merge all lookarounds into a single instruction, merge the split instructions, and merge literal-ignore-case with literal. Besides reducing the amount of nearly duplicated code, this improves performance: it reduces pressure on the (often failing) branch target prediction used for instruction dispatch by moving branches into the instruction code itself, where they are more likely to be well predicted.
This commit is contained in:
parent
0e2612f1ad
commit
8566ae14a0
|
@ -81,7 +81,7 @@ else
|
||||||
LDFLAGS += -rdynamic
|
LDFLAGS += -rdynamic
|
||||||
endif
|
endif
|
||||||
|
|
||||||
CXXFLAGS += -pedantic -std=c++17 -g -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-address
|
CXXFLAGS += -pedantic -std=c++2a -g -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-address
|
||||||
|
|
||||||
compiler := $(shell $(CXX) --version)
|
compiler := $(shell $(CXX) --version)
|
||||||
ifneq (,$(findstring clang,$(compiler)))
|
ifneq (,$(findstring clang,$(compiler)))
|
||||||
|
|
|
@ -27,8 +27,8 @@ struct ParsedRegex
|
||||||
Literal,
|
Literal,
|
||||||
AnyChar,
|
AnyChar,
|
||||||
AnyCharExceptNewLine,
|
AnyCharExceptNewLine,
|
||||||
Class,
|
CharClass,
|
||||||
CharacterType,
|
CharType,
|
||||||
Sequence,
|
Sequence,
|
||||||
Alternation,
|
Alternation,
|
||||||
LineStart,
|
LineStart,
|
||||||
|
@ -73,7 +73,7 @@ struct ParsedRegex
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
using NodeIndex = uint16_t;
|
using NodeIndex = int16_t;
|
||||||
struct [[gnu::packed]] Node
|
struct [[gnu::packed]] Node
|
||||||
{
|
{
|
||||||
Op op;
|
Op op;
|
||||||
|
@ -397,7 +397,7 @@ private:
|
||||||
// CharacterClassEscape
|
// CharacterClassEscape
|
||||||
auto class_it = find_if(character_class_escapes, [cp](auto& c) { return c.cp == cp; });
|
auto class_it = find_if(character_class_escapes, [cp](auto& c) { return c.cp == cp; });
|
||||||
if (class_it != std::end(character_class_escapes))
|
if (class_it != std::end(character_class_escapes))
|
||||||
return new_node(ParsedRegex::CharacterType, (Codepoint)class_it->ctype);
|
return new_node(ParsedRegex::CharType, (Codepoint)class_it->ctype);
|
||||||
|
|
||||||
// CharacterEscape
|
// CharacterEscape
|
||||||
for (auto& control : control_escapes)
|
for (auto& control : control_escapes)
|
||||||
|
@ -546,12 +546,12 @@ private:
|
||||||
|
|
||||||
if (character_class.ctypes != CharacterType::None and not character_class.negative and
|
if (character_class.ctypes != CharacterType::None and not character_class.negative and
|
||||||
character_class.ranges.empty())
|
character_class.ranges.empty())
|
||||||
return new_node(ParsedRegex::CharacterType, (Codepoint)character_class.ctypes);
|
return new_node(ParsedRegex::CharType, (Codepoint)character_class.ctypes);
|
||||||
|
|
||||||
auto class_id = m_parsed_regex.character_classes.size();
|
auto class_id = m_parsed_regex.character_classes.size();
|
||||||
m_parsed_regex.character_classes.push_back(std::move(character_class));
|
m_parsed_regex.character_classes.push_back(std::move(character_class));
|
||||||
|
|
||||||
return new_node(ParsedRegex::Class, class_id);
|
return new_node(ParsedRegex::CharClass, class_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
ParsedRegex::Quantifier quantifier()
|
ParsedRegex::Quantifier quantifier()
|
||||||
|
@ -638,8 +638,8 @@ private:
|
||||||
for (auto child_index : Children<>{m_parsed_regex, index})
|
for (auto child_index : Children<>{m_parsed_regex, index})
|
||||||
{
|
{
|
||||||
auto& child = get_node(child_index);
|
auto& child = get_node(child_index);
|
||||||
if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and
|
if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::CharClass and
|
||||||
child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and
|
child.op != ParsedRegex::CharType and child.op != ParsedRegex::AnyChar and
|
||||||
child.op != ParsedRegex::AnyCharExceptNewLine)
|
child.op != ParsedRegex::AnyCharExceptNewLine)
|
||||||
parse_error("Lookaround can only contain literals, any chars or character classes");
|
parse_error("Lookaround can only contain literals, any chars or character classes");
|
||||||
if (child.op == ParsedRegex::Literal and
|
if (child.op == ParsedRegex::Literal and
|
||||||
|
@ -684,6 +684,8 @@ constexpr RegexParser::ControlEscape RegexParser::control_escapes[];
|
||||||
|
|
||||||
struct RegexCompiler
|
struct RegexCompiler
|
||||||
{
|
{
|
||||||
|
using OpIndex = int16_t;
|
||||||
|
|
||||||
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags)
|
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags)
|
||||||
: m_flags(flags), m_parsed_regex{parsed_regex}
|
: m_flags(flags), m_parsed_regex{parsed_regex}
|
||||||
{
|
{
|
||||||
|
@ -722,7 +724,7 @@ struct RegexCompiler
|
||||||
private:
|
private:
|
||||||
|
|
||||||
template<RegexMode direction>
|
template<RegexMode direction>
|
||||||
uint32_t compile_node_inner(ParsedRegex::NodeIndex index)
|
OpIndex compile_node_inner(ParsedRegex::NodeIndex index)
|
||||||
{
|
{
|
||||||
auto& node = get_node(index);
|
auto& node = get_node(index);
|
||||||
|
|
||||||
|
@ -733,16 +735,13 @@ private:
|
||||||
(node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs)));
|
(node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs)));
|
||||||
constexpr bool forward = direction == RegexMode::Forward;
|
constexpr bool forward = direction == RegexMode::Forward;
|
||||||
if (save)
|
if (save)
|
||||||
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1));
|
push_inst(CompiledRegex::Save, {.save_index = int16_t(node.value * 2 + (forward ? 0 : 1))});
|
||||||
|
|
||||||
Vector<uint32_t> goto_inner_end_offsets;
|
Vector<uint32_t> goto_inner_end_offsets;
|
||||||
switch (node.op)
|
switch (node.op)
|
||||||
{
|
{
|
||||||
case ParsedRegex::Literal:
|
case ParsedRegex::Literal:
|
||||||
if (ignore_case)
|
push_inst(CompiledRegex::Literal, {.literal={.codepoint=ignore_case ? to_lower(node.value) : node.value, .ignore_case=ignore_case}});
|
||||||
push_inst(CompiledRegex::Literal_IgnoreCase, to_lower(node.value));
|
|
||||||
else
|
|
||||||
push_inst(CompiledRegex::Literal, node.value);
|
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::AnyChar:
|
case ParsedRegex::AnyChar:
|
||||||
push_inst(CompiledRegex::AnyChar);
|
push_inst(CompiledRegex::AnyChar);
|
||||||
|
@ -750,11 +749,11 @@ private:
|
||||||
case ParsedRegex::AnyCharExceptNewLine:
|
case ParsedRegex::AnyCharExceptNewLine:
|
||||||
push_inst(CompiledRegex::AnyCharExceptNewLine);
|
push_inst(CompiledRegex::AnyCharExceptNewLine);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::Class:
|
case ParsedRegex::CharClass:
|
||||||
push_inst(CompiledRegex::Class, node.value);
|
push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::CharacterType:
|
case ParsedRegex::CharType:
|
||||||
push_inst(CompiledRegex::CharacterType, node.value);
|
push_inst(CompiledRegex::CharType, {.character_type=CharacterType{(unsigned char)node.value}});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::Sequence:
|
case ParsedRegex::Sequence:
|
||||||
{
|
{
|
||||||
|
@ -768,7 +767,7 @@ private:
|
||||||
for (auto child : Children<>{m_parsed_regex, index})
|
for (auto child : Children<>{m_parsed_regex, index})
|
||||||
{
|
{
|
||||||
if (child != index+1)
|
if (child != index+1)
|
||||||
push_inst(CompiledRegex::Split_PrioritizeParent);
|
push_inst(CompiledRegex::Split);
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto end = node.children_end;
|
const auto end = node.children_end;
|
||||||
|
@ -776,7 +775,7 @@ private:
|
||||||
{
|
{
|
||||||
auto node = compile_node<direction>(child);
|
auto node = compile_node<direction>(child);
|
||||||
if (child != index+1)
|
if (child != index+1)
|
||||||
m_program.instructions[split_pos++].param = node;
|
m_program.instructions[split_pos++].param.split = CompiledRegex::Param::Split{.target = node, .prioritize_parent = true};
|
||||||
if (get_node(child).children_end != end)
|
if (get_node(child).children_end != end)
|
||||||
{
|
{
|
||||||
auto jump = push_inst(CompiledRegex::Jump);
|
auto jump = push_inst(CompiledRegex::Jump);
|
||||||
|
@ -786,71 +785,66 @@ private:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ParsedRegex::LookAhead:
|
case ParsedRegex::LookAhead:
|
||||||
push_inst(ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
|
||||||
: CompiledRegex::LookAhead,
|
|
||||||
push_lookaround<RegexMode::Forward>(index, ignore_case));
|
|
||||||
break;
|
|
||||||
case ParsedRegex::NegativeLookAhead:
|
case ParsedRegex::NegativeLookAhead:
|
||||||
push_inst(ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
push_inst(CompiledRegex::LookAround, {.lookaround={
|
||||||
: CompiledRegex::NegativeLookAhead,
|
.index=push_lookaround<RegexMode::Forward>(index, ignore_case),
|
||||||
push_lookaround<RegexMode::Forward>(index, ignore_case));
|
.ahead=true,
|
||||||
|
.positive=node.op == ParsedRegex::LookAhead,
|
||||||
|
.ignore_case=ignore_case}});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LookBehind:
|
case ParsedRegex::LookBehind:
|
||||||
push_inst(ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
|
||||||
: CompiledRegex::LookBehind,
|
|
||||||
push_lookaround<RegexMode::Backward>(index, ignore_case));
|
|
||||||
break;
|
|
||||||
case ParsedRegex::NegativeLookBehind:
|
case ParsedRegex::NegativeLookBehind:
|
||||||
push_inst(ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
push_inst(CompiledRegex::LookAround, {.lookaround={
|
||||||
: CompiledRegex::NegativeLookBehind,
|
.index=push_lookaround<RegexMode::Backward>(index, ignore_case),
|
||||||
push_lookaround<RegexMode::Backward>(index, ignore_case));
|
.ahead=false,
|
||||||
|
.positive=node.op == ParsedRegex::LookBehind,
|
||||||
|
.ignore_case=ignore_case}});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineStart:
|
case ParsedRegex::LineStart:
|
||||||
push_inst(CompiledRegex::LineStart);
|
push_inst(CompiledRegex::LineAssertion, {.line_start=true});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineEnd:
|
case ParsedRegex::LineEnd:
|
||||||
push_inst(CompiledRegex::LineEnd);
|
push_inst(CompiledRegex::LineAssertion, {.line_start=false});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::WordBoundary:
|
case ParsedRegex::WordBoundary:
|
||||||
push_inst(CompiledRegex::WordBoundary);
|
push_inst(CompiledRegex::WordBoundary, {.word_boundary_positive=true});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NotWordBoundary:
|
case ParsedRegex::NotWordBoundary:
|
||||||
push_inst(CompiledRegex::NotWordBoundary);
|
push_inst(CompiledRegex::WordBoundary, {.word_boundary_positive=false});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectBegin:
|
case ParsedRegex::SubjectBegin:
|
||||||
push_inst(CompiledRegex::SubjectBegin);
|
push_inst(CompiledRegex::SubjectAssertion, {.subject_begin=true});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectEnd:
|
case ParsedRegex::SubjectEnd:
|
||||||
push_inst(CompiledRegex::SubjectEnd);
|
push_inst(CompiledRegex::SubjectAssertion, {.subject_begin=false});
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::ResetStart:
|
case ParsedRegex::ResetStart:
|
||||||
push_inst(CompiledRegex::Save, 0);
|
push_inst(CompiledRegex::Save, {.save_index=0});
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto& offset : goto_inner_end_offsets)
|
for (auto& offset : goto_inner_end_offsets)
|
||||||
m_program.instructions[offset].param = m_program.instructions.size();
|
m_program.instructions[offset].param.jump_target = m_program.instructions.size();
|
||||||
|
|
||||||
if (save)
|
if (save)
|
||||||
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 1 : 0));
|
push_inst(CompiledRegex::Save, {.save_index=int16_t(node.value * 2 + (forward ? 1 : 0))});
|
||||||
|
|
||||||
return start_pos;
|
return start_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<RegexMode direction>
|
template<RegexMode direction>
|
||||||
uint32_t compile_node(ParsedRegex::NodeIndex index)
|
OpIndex compile_node(ParsedRegex::NodeIndex index)
|
||||||
{
|
{
|
||||||
auto& node = get_node(index);
|
auto& node = get_node(index);
|
||||||
|
|
||||||
const uint32_t start_pos = (uint32_t)m_program.instructions.size();
|
const OpIndex start_pos = (OpIndex)m_program.instructions.size();
|
||||||
Vector<uint32_t> goto_ends;
|
Vector<OpIndex> goto_ends;
|
||||||
|
|
||||||
auto& quantifier = node.quantifier;
|
auto& quantifier = node.quantifier;
|
||||||
|
|
||||||
if (quantifier.allows_none())
|
if (quantifier.allows_none())
|
||||||
{
|
{
|
||||||
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}});
|
||||||
: CompiledRegex::Split_PrioritizeChild);
|
|
||||||
goto_ends.push_back(split_pos);
|
goto_ends.push_back(split_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -860,41 +854,38 @@ private:
|
||||||
inner_pos = compile_node_inner<direction>(index);
|
inner_pos = compile_node_inner<direction>(index);
|
||||||
|
|
||||||
if (quantifier.allows_infinite_repeat())
|
if (quantifier.allows_infinite_repeat())
|
||||||
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
|
push_inst(CompiledRegex::Split, {.split = {.target=inner_pos, .prioritize_parent=not quantifier.greedy}});
|
||||||
: CompiledRegex::Split_PrioritizeParent,
|
|
||||||
inner_pos);
|
|
||||||
// Write the node as an optional match for the min -> max counts
|
// Write the node as an optional match for the min -> max counts
|
||||||
else for (int i = std::max((int16_t)1, quantifier.min); // STILL UGLY !
|
else for (int i = std::max((int16_t)1, quantifier.min); // STILL UGLY !
|
||||||
i < quantifier.max; ++i)
|
i < quantifier.max; ++i)
|
||||||
{
|
{
|
||||||
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}});
|
||||||
: CompiledRegex::Split_PrioritizeChild);
|
|
||||||
goto_ends.push_back(split_pos);
|
goto_ends.push_back(split_pos);
|
||||||
compile_node_inner<direction>(index);
|
compile_node_inner<direction>(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto offset : goto_ends)
|
for (auto offset : goto_ends)
|
||||||
m_program.instructions[offset].param = m_program.instructions.size();
|
m_program.instructions[offset].param.split.target = m_program.instructions.size();
|
||||||
|
|
||||||
return start_pos;
|
return start_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
|
OpIndex push_inst(CompiledRegex::Op op, CompiledRegex::Param param = {})
|
||||||
{
|
{
|
||||||
constexpr auto max_instructions = std::numeric_limits<int16_t>::max();
|
constexpr auto max_instructions = std::numeric_limits<OpIndex>::max();
|
||||||
const uint32_t res = m_program.instructions.size();
|
const auto res = m_program.instructions.size();
|
||||||
if (res > max_instructions)
|
if (res >= max_instructions)
|
||||||
throw regex_error(format("regex compiled to more than {} instructions", max_instructions));
|
throw regex_error(format("regex compiled to more than {} instructions", max_instructions));
|
||||||
m_program.instructions.push_back({ op, false, 0, param });
|
m_program.instructions.push_back({ op, false, 0, param });
|
||||||
return res;
|
return OpIndex(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<RegexMode direction>
|
template<RegexMode direction>
|
||||||
uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case)
|
int16_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case)
|
||||||
{
|
{
|
||||||
using Lookaround = CompiledRegex::Lookaround;
|
using Lookaround = CompiledRegex::Lookaround;
|
||||||
|
|
||||||
const uint32_t res = m_program.lookarounds.size();
|
const int16_t res = m_program.lookarounds.size();
|
||||||
for (auto child : Children<direction>{m_parsed_regex, index})
|
for (auto child : Children<direction>{m_parsed_regex, index})
|
||||||
{
|
{
|
||||||
auto& character = get_node(child);
|
auto& character = get_node(child);
|
||||||
|
@ -905,9 +896,9 @@ private:
|
||||||
m_program.lookarounds.push_back(Lookaround::AnyChar);
|
m_program.lookarounds.push_back(Lookaround::AnyChar);
|
||||||
else if (character.op == ParsedRegex::AnyCharExceptNewLine)
|
else if (character.op == ParsedRegex::AnyCharExceptNewLine)
|
||||||
m_program.lookarounds.push_back(Lookaround::AnyCharExceptNewLine);
|
m_program.lookarounds.push_back(Lookaround::AnyCharExceptNewLine);
|
||||||
else if (character.op == ParsedRegex::Class)
|
else if (character.op == ParsedRegex::CharClass)
|
||||||
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterClass) + character.value));
|
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterClass) + character.value));
|
||||||
else if (character.op == ParsedRegex::CharacterType)
|
else if (character.op == ParsedRegex::CharType)
|
||||||
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterType) | character.value));
|
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterType) | character.value));
|
||||||
else
|
else
|
||||||
kak_assert(false);
|
kak_assert(false);
|
||||||
|
@ -951,7 +942,7 @@ private:
|
||||||
start_desc.map[cp] = true;
|
start_desc.map[cp] = true;
|
||||||
}
|
}
|
||||||
return node.quantifier.allows_none();
|
return node.quantifier.allows_none();
|
||||||
case ParsedRegex::Class:
|
case ParsedRegex::CharClass:
|
||||||
{
|
{
|
||||||
auto& character_class = m_parsed_regex.character_classes[node.value];
|
auto& character_class = m_parsed_regex.character_classes[node.value];
|
||||||
if (character_class.ctypes == CharacterType::None and
|
if (character_class.ctypes == CharacterType::None and
|
||||||
|
@ -978,7 +969,7 @@ private:
|
||||||
start_desc.map[CompiledRegex::StartDesc::other] = true;
|
start_desc.map[CompiledRegex::StartDesc::other] = true;
|
||||||
return node.quantifier.allows_none();
|
return node.quantifier.allows_none();
|
||||||
}
|
}
|
||||||
case ParsedRegex::CharacterType:
|
case ParsedRegex::CharType:
|
||||||
{
|
{
|
||||||
const CharacterType ctype = (CharacterType)node.value;
|
const CharacterType ctype = (CharacterType)node.value;
|
||||||
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
|
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
|
||||||
|
@ -1041,12 +1032,12 @@ private:
|
||||||
if (not (m_flags & RegexCompileFlags::Optimize))
|
if (not (m_flags & RegexCompileFlags::Optimize))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
auto is_jump = [](CompiledRegex::Op op) { return op >= CompiledRegex::Op::Jump and op <= CompiledRegex::Op::Split_PrioritizeChild; };
|
auto is_jump = [](CompiledRegex::Op op) { return op >= CompiledRegex::Op::Jump and op <= CompiledRegex::Op::Split; };
|
||||||
for (auto i = begin; i < end; ++i)
|
for (auto i = begin; i < end; ++i)
|
||||||
{
|
{
|
||||||
auto& inst = m_program.instructions[i];
|
auto& inst = m_program.instructions[i];
|
||||||
if (is_jump(inst.op))
|
if (is_jump(inst.op))
|
||||||
m_program.instructions[inst.param].last_step = 0xffff; // tag as jump target
|
m_program.instructions[inst.param.jump_target].last_step = 0xffff; // tag as jump target
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto block_begin = begin; block_begin < end; )
|
for (auto block_begin = begin; block_begin < end; )
|
||||||
|
@ -1064,7 +1055,7 @@ private:
|
||||||
void peephole_optimize(size_t begin, size_t end)
|
void peephole_optimize(size_t begin, size_t end)
|
||||||
{
|
{
|
||||||
// Move saves after all assertions on the same character
|
// Move saves after all assertions on the same character
|
||||||
auto is_assertion = [](CompiledRegex::Op op) { return op >= CompiledRegex::LineStart; };
|
auto is_assertion = [](CompiledRegex::Op op) { return op >= CompiledRegex::LineAssertion; };
|
||||||
for (auto i = begin, j = begin + 1; j < end; ++i, ++j)
|
for (auto i = begin, j = begin + 1; j < end; ++i, ++j)
|
||||||
{
|
{
|
||||||
if (m_program.instructions[i].op == CompiledRegex::Save and
|
if (m_program.instructions[i].op == CompiledRegex::Save and
|
||||||
|
@ -1095,10 +1086,7 @@ String dump_regex(const CompiledRegex& program)
|
||||||
switch (inst.op)
|
switch (inst.op)
|
||||||
{
|
{
|
||||||
case CompiledRegex::Literal:
|
case CompiledRegex::Literal:
|
||||||
res += format("literal {}\n", inst.param);
|
res += format("literal {}{}\n", inst.param.literal.ignore_case ? "(ignore case) " : "", inst.param.literal.codepoint);
|
||||||
break;
|
|
||||||
case CompiledRegex::Literal_IgnoreCase:
|
|
||||||
res += format("literal (ignore case) {}\n", inst.param);
|
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::AnyChar:
|
case CompiledRegex::AnyChar:
|
||||||
res += "any char\n";
|
res += "any char\n";
|
||||||
|
@ -1107,73 +1095,44 @@ String dump_regex(const CompiledRegex& program)
|
||||||
res += "anything but newline\n";
|
res += "anything but newline\n";
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Jump:
|
case CompiledRegex::Jump:
|
||||||
res += format("jump {}\n", inst.param);
|
res += format("jump {}\n", inst.param.jump_target);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Split_PrioritizeParent:
|
case CompiledRegex::Split:
|
||||||
case CompiledRegex::Split_PrioritizeChild:
|
|
||||||
{
|
{
|
||||||
res += format("split (prioritize {}) {}\n",
|
res += format("split (prioritize {}) {}\n",
|
||||||
inst.op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
|
(inst.param.split.prioritize_parent) ? "parent" : "child",
|
||||||
inst.param);
|
inst.param.split.target);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Save:
|
case CompiledRegex::Save:
|
||||||
res += format("save {}\n", inst.param);
|
res += format("save {}\n", inst.param.save_index);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Class:
|
case CompiledRegex::CharClass:
|
||||||
res += format("class {}\n", inst.param);
|
res += format("character class {}\n", inst.param.character_class_index);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::CharacterType:
|
case CompiledRegex::CharType:
|
||||||
res += format("character type {}\n", inst.param);
|
res += format("character type {}\n", to_underlying(inst.param.character_type));
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LineStart:
|
case CompiledRegex::LineAssertion:
|
||||||
res += "line start\n";
|
res += format("line {}\n", inst.param.line_start ? "start" : "end");;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LineEnd:
|
case CompiledRegex::SubjectAssertion:
|
||||||
res += "line end\n";
|
res += format("subject {}\n", inst.param.subject_begin ? "begin" : "end");
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::WordBoundary:
|
case CompiledRegex::WordBoundary:
|
||||||
res += "word boundary\n";
|
res += format("{}word boundary\n", inst.param.word_boundary_positive ? "" : "not ");
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::NotWordBoundary:
|
case CompiledRegex::LookAround:
|
||||||
res += "not word boundary\n";
|
|
||||||
break;
|
|
||||||
case CompiledRegex::SubjectBegin:
|
|
||||||
res += "subject begin\n";
|
|
||||||
break;
|
|
||||||
case CompiledRegex::SubjectEnd:
|
|
||||||
res += "subject end\n";
|
|
||||||
break;
|
|
||||||
case CompiledRegex::LookAhead:
|
|
||||||
case CompiledRegex::NegativeLookAhead:
|
|
||||||
case CompiledRegex::LookBehind:
|
|
||||||
case CompiledRegex::NegativeLookBehind:
|
|
||||||
case CompiledRegex::LookAhead_IgnoreCase:
|
|
||||||
case CompiledRegex::NegativeLookAhead_IgnoreCase:
|
|
||||||
case CompiledRegex::LookBehind_IgnoreCase:
|
|
||||||
case CompiledRegex::NegativeLookBehind_IgnoreCase:
|
|
||||||
{
|
{
|
||||||
const char* name = nullptr;
|
String name;
|
||||||
if (inst.op == CompiledRegex::LookAhead)
|
name += inst.param.lookaround.positive ? "" : "negative ";
|
||||||
name = "look ahead";
|
name += "look ";
|
||||||
if (inst.op == CompiledRegex::NegativeLookAhead)
|
name += inst.param.lookaround.ahead ? "ahead " : "behind ";
|
||||||
name = "negative look ahead";
|
if (inst.param.lookaround.ignore_case)
|
||||||
if (inst.op == CompiledRegex::LookBehind)
|
name += " (ignore case)";
|
||||||
name = "look behind";
|
|
||||||
if (inst.op == CompiledRegex::NegativeLookBehind)
|
|
||||||
name = "negative look behind";
|
|
||||||
|
|
||||||
if (inst.op == CompiledRegex::LookAhead_IgnoreCase)
|
|
||||||
name = "look ahead (ignore case)";
|
|
||||||
if (inst.op == CompiledRegex::NegativeLookAhead_IgnoreCase)
|
|
||||||
name = "negative look ahead (ignore case)";
|
|
||||||
if (inst.op == CompiledRegex::LookBehind_IgnoreCase)
|
|
||||||
name = "look behind (ignore case)";
|
|
||||||
if (inst.op == CompiledRegex::NegativeLookBehind_IgnoreCase)
|
|
||||||
name = "negative look behind (ignore case)";
|
|
||||||
|
|
||||||
String str;
|
String str;
|
||||||
for (auto it = program.lookarounds.begin() + inst.param;
|
for (auto it = program.lookarounds.begin() + inst.param.lookaround.index;
|
||||||
*it != CompiledRegex::Lookaround::EndOfLookaround; ++it)
|
*it != CompiledRegex::Lookaround::EndOfLookaround; ++it)
|
||||||
utf8::dump(std::back_inserter(str), to_underlying(*it));
|
utf8::dump(std::back_inserter(str), to_underlying(*it));
|
||||||
res += format("{} ({})\n", name, str);
|
res += format("{} ({})\n", name, str);
|
||||||
|
|
|
@ -50,29 +50,17 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
{
|
{
|
||||||
Match,
|
Match,
|
||||||
Literal,
|
Literal,
|
||||||
Literal_IgnoreCase,
|
|
||||||
AnyChar,
|
AnyChar,
|
||||||
AnyCharExceptNewLine,
|
AnyCharExceptNewLine,
|
||||||
Class,
|
CharClass,
|
||||||
CharacterType,
|
CharType,
|
||||||
Jump,
|
Jump,
|
||||||
Split_PrioritizeParent,
|
Split,
|
||||||
Split_PrioritizeChild,
|
|
||||||
Save,
|
Save,
|
||||||
LineStart,
|
LineAssertion,
|
||||||
LineEnd,
|
SubjectAssertion,
|
||||||
WordBoundary,
|
WordBoundary,
|
||||||
NotWordBoundary,
|
LookAround,
|
||||||
SubjectBegin,
|
|
||||||
SubjectEnd,
|
|
||||||
LookAhead,
|
|
||||||
NegativeLookAhead,
|
|
||||||
LookBehind,
|
|
||||||
NegativeLookBehind,
|
|
||||||
LookAhead_IgnoreCase,
|
|
||||||
NegativeLookAhead_IgnoreCase,
|
|
||||||
LookBehind_IgnoreCase,
|
|
||||||
NegativeLookBehind_IgnoreCase,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
enum class Lookaround : Codepoint
|
enum class Lookaround : Codepoint
|
||||||
|
@ -86,15 +74,46 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
EndOfLookaround = static_cast<Codepoint>(-1)
|
EndOfLookaround = static_cast<Codepoint>(-1)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
union Param
|
||||||
|
{
|
||||||
|
struct Literal
|
||||||
|
{
|
||||||
|
uint32_t codepoint : 24;
|
||||||
|
bool ignore_case : 1;
|
||||||
|
} literal;
|
||||||
|
int16_t character_class_index;
|
||||||
|
CharacterType character_type;
|
||||||
|
int16_t jump_target;
|
||||||
|
int16_t save_index;
|
||||||
|
struct Split
|
||||||
|
{
|
||||||
|
int16_t target;
|
||||||
|
bool prioritize_parent : 1;
|
||||||
|
} split;
|
||||||
|
bool line_start;
|
||||||
|
bool subject_begin;
|
||||||
|
bool word_boundary_positive;
|
||||||
|
struct Lookaround
|
||||||
|
{
|
||||||
|
int16_t index;
|
||||||
|
bool ahead : 1;
|
||||||
|
bool positive : 1;
|
||||||
|
bool ignore_case : 1;
|
||||||
|
} lookaround;
|
||||||
|
};
|
||||||
|
static_assert(sizeof(Param) == 4);
|
||||||
|
|
||||||
struct Instruction
|
struct Instruction
|
||||||
{
|
{
|
||||||
Op op;
|
Op op;
|
||||||
// Those mutables are used during execution
|
// Those mutables are used during execution
|
||||||
mutable bool scheduled;
|
mutable bool scheduled;
|
||||||
mutable uint16_t last_step;
|
mutable uint16_t last_step;
|
||||||
uint32_t param;
|
Param param;
|
||||||
};
|
};
|
||||||
static_assert(sizeof(Instruction) == 8, "");
|
static_assert(sizeof(Instruction) == 8);
|
||||||
|
|
||||||
|
static constexpr uint32_t prioritize_parent{1 << 16};
|
||||||
|
|
||||||
explicit operator bool() const { return not instructions.empty(); }
|
explicit operator bool() const { return not instructions.empty(); }
|
||||||
|
|
||||||
|
@ -343,110 +362,6 @@ private:
|
||||||
|
|
||||||
switch (inst.op)
|
switch (inst.op)
|
||||||
{
|
{
|
||||||
case CompiledRegex::Literal:
|
|
||||||
if (pos != config.end and inst.param == codepoint(pos, config))
|
|
||||||
return consumed();
|
|
||||||
return failed();
|
|
||||||
case CompiledRegex::Literal_IgnoreCase:
|
|
||||||
if (pos != config.end and inst.param == to_lower(codepoint(pos, config)))
|
|
||||||
return consumed();
|
|
||||||
return failed();
|
|
||||||
case CompiledRegex::AnyChar:
|
|
||||||
return consumed();
|
|
||||||
case CompiledRegex::AnyCharExceptNewLine:
|
|
||||||
if (pos != config.end and codepoint(pos, config) != '\n')
|
|
||||||
return consumed();
|
|
||||||
return failed();
|
|
||||||
case CompiledRegex::Jump:
|
|
||||||
thread.inst = static_cast<int16_t>(inst.param);
|
|
||||||
break;
|
|
||||||
case CompiledRegex::Split_PrioritizeParent:
|
|
||||||
{
|
|
||||||
if (thread.saves >= 0)
|
|
||||||
++m_saves[thread.saves]->refcount;
|
|
||||||
m_threads.push_current({static_cast<int16_t>(inst.param), thread.saves});
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case CompiledRegex::Split_PrioritizeChild:
|
|
||||||
{
|
|
||||||
if (thread.saves >= 0)
|
|
||||||
++m_saves[thread.saves]->refcount;
|
|
||||||
m_threads.push_current({thread.inst, thread.saves});
|
|
||||||
thread.inst = static_cast<uint16_t>(inst.param);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case CompiledRegex::Save:
|
|
||||||
{
|
|
||||||
if (mode & RegexMode::NoSaves)
|
|
||||||
break;
|
|
||||||
if (thread.saves < 0)
|
|
||||||
thread.saves = new_saves<false>(nullptr);
|
|
||||||
else if (m_saves[thread.saves]->refcount > 1)
|
|
||||||
{
|
|
||||||
--m_saves[thread.saves]->refcount;
|
|
||||||
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
|
|
||||||
}
|
|
||||||
m_saves[thread.saves]->pos[inst.param] = pos;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case CompiledRegex::Class:
|
|
||||||
if (pos == config.end)
|
|
||||||
return failed();
|
|
||||||
return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ?
|
|
||||||
consumed() : failed();
|
|
||||||
case CompiledRegex::CharacterType:
|
|
||||||
if (pos == config.end)
|
|
||||||
return failed();
|
|
||||||
return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ?
|
|
||||||
consumed() : failed();
|
|
||||||
case CompiledRegex::LineStart:
|
|
||||||
if (not is_line_start(pos, config))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::LineEnd:
|
|
||||||
if (not is_line_end(pos, config))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::WordBoundary:
|
|
||||||
if (not is_word_boundary(pos, config))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::NotWordBoundary:
|
|
||||||
if (is_word_boundary(pos, config))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::SubjectBegin:
|
|
||||||
if (pos != config.subject_begin)
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::SubjectEnd:
|
|
||||||
if (pos != config.subject_end)
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::LookAhead:
|
|
||||||
case CompiledRegex::NegativeLookAhead:
|
|
||||||
if (lookaround<true, false>(inst.param, pos, config) !=
|
|
||||||
(inst.op == CompiledRegex::LookAhead))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::LookAhead_IgnoreCase:
|
|
||||||
case CompiledRegex::NegativeLookAhead_IgnoreCase:
|
|
||||||
if (lookaround<true, true>(inst.param, pos, config) !=
|
|
||||||
(inst.op == CompiledRegex::LookAhead_IgnoreCase))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::LookBehind:
|
|
||||||
case CompiledRegex::NegativeLookBehind:
|
|
||||||
if (lookaround<false, false>(inst.param, pos, config) !=
|
|
||||||
(inst.op == CompiledRegex::LookBehind))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::LookBehind_IgnoreCase:
|
|
||||||
case CompiledRegex::NegativeLookBehind_IgnoreCase:
|
|
||||||
if (lookaround<false, true>(inst.param, pos, config) !=
|
|
||||||
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
|
|
||||||
return failed();
|
|
||||||
break;
|
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
if ((pos != config.end and not (mode & RegexMode::Search)) or
|
if ((pos != config.end and not (mode & RegexMode::Search)) or
|
||||||
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
|
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
|
||||||
|
@ -460,6 +375,71 @@ private:
|
||||||
while (not m_threads.current_is_empty())
|
while (not m_threads.current_is_empty())
|
||||||
release_saves(m_threads.pop_current().saves);
|
release_saves(m_threads.pop_current().saves);
|
||||||
return;
|
return;
|
||||||
|
case CompiledRegex::Literal:
|
||||||
|
if (pos != config.end and
|
||||||
|
inst.param.literal.codepoint == (inst.param.literal.ignore_case ? to_lower(codepoint(pos, config))
|
||||||
|
: codepoint(pos, config)))
|
||||||
|
return consumed();
|
||||||
|
return failed();
|
||||||
|
case CompiledRegex::AnyChar:
|
||||||
|
return consumed();
|
||||||
|
case CompiledRegex::AnyCharExceptNewLine:
|
||||||
|
if (pos != config.end and codepoint(pos, config) != '\n')
|
||||||
|
return consumed();
|
||||||
|
return failed();
|
||||||
|
case CompiledRegex::Jump:
|
||||||
|
thread.inst = inst.param.jump_target;
|
||||||
|
break;
|
||||||
|
case CompiledRegex::Split:
|
||||||
|
if (thread.saves >= 0)
|
||||||
|
++m_saves[thread.saves]->refcount;
|
||||||
|
|
||||||
|
if (inst.param.split.prioritize_parent)
|
||||||
|
m_threads.push_current({inst.param.split.target, thread.saves});
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_threads.push_current(thread);
|
||||||
|
thread.inst = inst.param.split.target;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case CompiledRegex::Save:
|
||||||
|
if (mode & RegexMode::NoSaves)
|
||||||
|
break;
|
||||||
|
if (thread.saves < 0)
|
||||||
|
thread.saves = new_saves<false>(nullptr);
|
||||||
|
else if (m_saves[thread.saves]->refcount > 1)
|
||||||
|
{
|
||||||
|
--m_saves[thread.saves]->refcount;
|
||||||
|
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
|
||||||
|
}
|
||||||
|
m_saves[thread.saves]->pos[inst.param.save_index] = pos;
|
||||||
|
break;
|
||||||
|
case CompiledRegex::CharClass:
|
||||||
|
if (pos == config.end)
|
||||||
|
return failed();
|
||||||
|
return is_character_class(m_program.character_classes[inst.param.character_class_index], codepoint(pos, config)) ?
|
||||||
|
consumed() : failed();
|
||||||
|
case CompiledRegex::CharType:
|
||||||
|
if (pos == config.end)
|
||||||
|
return failed();
|
||||||
|
return is_ctype(inst.param.character_type, codepoint(pos, config)) ?
|
||||||
|
consumed() : failed();
|
||||||
|
case CompiledRegex::LineAssertion:
|
||||||
|
if (not (inst.param.line_start ? is_line_start(pos, config) : is_line_end(pos, config)))
|
||||||
|
return failed();
|
||||||
|
break;
|
||||||
|
case CompiledRegex::SubjectAssertion:
|
||||||
|
if (pos != (inst.param.subject_begin ? config.subject_begin : config.subject_end))
|
||||||
|
return failed();
|
||||||
|
break;
|
||||||
|
case CompiledRegex::WordBoundary:
|
||||||
|
if (is_word_boundary(pos, config) != inst.param.word_boundary_positive)
|
||||||
|
return failed();
|
||||||
|
break;
|
||||||
|
case CompiledRegex::LookAround:
|
||||||
|
if (lookaround(inst.param.lookaround, pos, config) != inst.param.lookaround.positive)
|
||||||
|
return failed();
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return failed();
|
return failed();
|
||||||
|
@ -544,25 +524,24 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<bool look_forward, bool ignore_case>
|
bool lookaround(CompiledRegex::Param::Lookaround param, Iterator pos, const ExecConfig& config) const
|
||||||
bool lookaround(uint32_t index, Iterator pos, const ExecConfig& config) const
|
|
||||||
{
|
{
|
||||||
using Lookaround = CompiledRegex::Lookaround;
|
using Lookaround = CompiledRegex::Lookaround;
|
||||||
|
|
||||||
if (not look_forward)
|
if (not param.ahead)
|
||||||
{
|
{
|
||||||
if (pos == config.subject_begin)
|
if (pos == config.subject_begin)
|
||||||
return m_program.lookarounds[index] == Lookaround::EndOfLookaround;
|
return m_program.lookarounds[param.index] == Lookaround::EndOfLookaround;
|
||||||
utf8::to_previous(pos, config.subject_begin);
|
utf8::to_previous(pos, config.subject_begin);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
|
for (auto it = m_program.lookarounds.begin() + param.index; *it != Lookaround::EndOfLookaround; ++it)
|
||||||
{
|
{
|
||||||
if (look_forward and pos == config.subject_end)
|
if (param.ahead and pos == config.subject_end)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
Codepoint cp = utf8::codepoint(pos, config.subject_end);
|
Codepoint cp = utf8::codepoint(pos, config.subject_end);
|
||||||
if (ignore_case)
|
if (param.ignore_case)
|
||||||
cp = to_lower(cp);
|
cp = to_lower(cp);
|
||||||
|
|
||||||
const Lookaround op = *it;
|
const Lookaround op = *it;
|
||||||
|
@ -588,11 +567,11 @@ private:
|
||||||
else if (static_cast<Codepoint>(op) != cp)
|
else if (static_cast<Codepoint>(op) != cp)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (not look_forward and pos == config.subject_begin)
|
if (not param.ahead and pos == config.subject_begin)
|
||||||
return *++it == Lookaround::EndOfLookaround;
|
return *++it == Lookaround::EndOfLookaround;
|
||||||
|
|
||||||
look_forward ? utf8::to_next(pos, config.subject_end)
|
param.ahead ? utf8::to_next(pos, config.subject_end)
|
||||||
: utf8::to_previous(pos, config.subject_begin);
|
: utf8::to_previous(pos, config.subject_begin);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,7 +31,7 @@ String trim_indent(StringView str)
|
||||||
throw runtime_error("inconsistent indentation in the string");
|
throw runtime_error("inconsistent indentation in the string");
|
||||||
|
|
||||||
return line.substr(indent.length());
|
return line.substr(indent.length());
|
||||||
}), String{}, [](String& s, StringView l) -> decltype(auto) { return s += l; });
|
}), String{}, [](String s, StringView l) { return s += l; });
|
||||||
}
|
}
|
||||||
|
|
||||||
String escape(StringView str, StringView characters, char escape)
|
String escape(StringView str, StringView characters, char escape)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user