Reduce the amount of Regex VM Instruction code

Merge all lookarounds into a single instruction, merge the split
variants, merge case-insensitive literals into the literal instruction...

Besides reducing the amount of nearly duplicated code, this improves
performance: it reduces pressure on the (often failing) branch target
prediction used for instruction dispatch by moving branches into the
instruction code itself, where they are more likely to be well
predicted.
This commit is contained in:
Maxime Coste 2020-07-25 14:07:57 +10:00
parent 0e2612f1ad
commit 8566ae14a0
4 changed files with 197 additions and 259 deletions

View File

@ -81,7 +81,7 @@ else
LDFLAGS += -rdynamic LDFLAGS += -rdynamic
endif endif
CXXFLAGS += -pedantic -std=c++17 -g -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-address CXXFLAGS += -pedantic -std=c++2a -g -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-address
compiler := $(shell $(CXX) --version) compiler := $(shell $(CXX) --version)
ifneq (,$(findstring clang,$(compiler))) ifneq (,$(findstring clang,$(compiler)))

View File

@ -27,8 +27,8 @@ struct ParsedRegex
Literal, Literal,
AnyChar, AnyChar,
AnyCharExceptNewLine, AnyCharExceptNewLine,
Class, CharClass,
CharacterType, CharType,
Sequence, Sequence,
Alternation, Alternation,
LineStart, LineStart,
@ -73,7 +73,7 @@ struct ParsedRegex
}; };
}; };
using NodeIndex = uint16_t; using NodeIndex = int16_t;
struct [[gnu::packed]] Node struct [[gnu::packed]] Node
{ {
Op op; Op op;
@ -397,7 +397,7 @@ private:
// CharacterClassEscape // CharacterClassEscape
auto class_it = find_if(character_class_escapes, [cp](auto& c) { return c.cp == cp; }); auto class_it = find_if(character_class_escapes, [cp](auto& c) { return c.cp == cp; });
if (class_it != std::end(character_class_escapes)) if (class_it != std::end(character_class_escapes))
return new_node(ParsedRegex::CharacterType, (Codepoint)class_it->ctype); return new_node(ParsedRegex::CharType, (Codepoint)class_it->ctype);
// CharacterEscape // CharacterEscape
for (auto& control : control_escapes) for (auto& control : control_escapes)
@ -546,12 +546,12 @@ private:
if (character_class.ctypes != CharacterType::None and not character_class.negative and if (character_class.ctypes != CharacterType::None and not character_class.negative and
character_class.ranges.empty()) character_class.ranges.empty())
return new_node(ParsedRegex::CharacterType, (Codepoint)character_class.ctypes); return new_node(ParsedRegex::CharType, (Codepoint)character_class.ctypes);
auto class_id = m_parsed_regex.character_classes.size(); auto class_id = m_parsed_regex.character_classes.size();
m_parsed_regex.character_classes.push_back(std::move(character_class)); m_parsed_regex.character_classes.push_back(std::move(character_class));
return new_node(ParsedRegex::Class, class_id); return new_node(ParsedRegex::CharClass, class_id);
} }
ParsedRegex::Quantifier quantifier() ParsedRegex::Quantifier quantifier()
@ -638,8 +638,8 @@ private:
for (auto child_index : Children<>{m_parsed_regex, index}) for (auto child_index : Children<>{m_parsed_regex, index})
{ {
auto& child = get_node(child_index); auto& child = get_node(child_index);
if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::CharClass and
child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and child.op != ParsedRegex::CharType and child.op != ParsedRegex::AnyChar and
child.op != ParsedRegex::AnyCharExceptNewLine) child.op != ParsedRegex::AnyCharExceptNewLine)
parse_error("Lookaround can only contain literals, any chars or character classes"); parse_error("Lookaround can only contain literals, any chars or character classes");
if (child.op == ParsedRegex::Literal and if (child.op == ParsedRegex::Literal and
@ -684,6 +684,8 @@ constexpr RegexParser::ControlEscape RegexParser::control_escapes[];
struct RegexCompiler struct RegexCompiler
{ {
using OpIndex = int16_t;
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags) RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags)
: m_flags(flags), m_parsed_regex{parsed_regex} : m_flags(flags), m_parsed_regex{parsed_regex}
{ {
@ -722,7 +724,7 @@ struct RegexCompiler
private: private:
template<RegexMode direction> template<RegexMode direction>
uint32_t compile_node_inner(ParsedRegex::NodeIndex index) OpIndex compile_node_inner(ParsedRegex::NodeIndex index)
{ {
auto& node = get_node(index); auto& node = get_node(index);
@ -733,16 +735,13 @@ private:
(node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs))); (node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs)));
constexpr bool forward = direction == RegexMode::Forward; constexpr bool forward = direction == RegexMode::Forward;
if (save) if (save)
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1)); push_inst(CompiledRegex::Save, {.save_index = int16_t(node.value * 2 + (forward ? 0 : 1))});
Vector<uint32_t> goto_inner_end_offsets; Vector<uint32_t> goto_inner_end_offsets;
switch (node.op) switch (node.op)
{ {
case ParsedRegex::Literal: case ParsedRegex::Literal:
if (ignore_case) push_inst(CompiledRegex::Literal, {.literal={.codepoint=ignore_case ? to_lower(node.value) : node.value, .ignore_case=ignore_case}});
push_inst(CompiledRegex::Literal_IgnoreCase, to_lower(node.value));
else
push_inst(CompiledRegex::Literal, node.value);
break; break;
case ParsedRegex::AnyChar: case ParsedRegex::AnyChar:
push_inst(CompiledRegex::AnyChar); push_inst(CompiledRegex::AnyChar);
@ -750,11 +749,11 @@ private:
case ParsedRegex::AnyCharExceptNewLine: case ParsedRegex::AnyCharExceptNewLine:
push_inst(CompiledRegex::AnyCharExceptNewLine); push_inst(CompiledRegex::AnyCharExceptNewLine);
break; break;
case ParsedRegex::Class: case ParsedRegex::CharClass:
push_inst(CompiledRegex::Class, node.value); push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)});
break; break;
case ParsedRegex::CharacterType: case ParsedRegex::CharType:
push_inst(CompiledRegex::CharacterType, node.value); push_inst(CompiledRegex::CharType, {.character_type=CharacterType{(unsigned char)node.value}});
break; break;
case ParsedRegex::Sequence: case ParsedRegex::Sequence:
{ {
@ -768,7 +767,7 @@ private:
for (auto child : Children<>{m_parsed_regex, index}) for (auto child : Children<>{m_parsed_regex, index})
{ {
if (child != index+1) if (child != index+1)
push_inst(CompiledRegex::Split_PrioritizeParent); push_inst(CompiledRegex::Split);
} }
const auto end = node.children_end; const auto end = node.children_end;
@ -776,7 +775,7 @@ private:
{ {
auto node = compile_node<direction>(child); auto node = compile_node<direction>(child);
if (child != index+1) if (child != index+1)
m_program.instructions[split_pos++].param = node; m_program.instructions[split_pos++].param.split = CompiledRegex::Param::Split{.target = node, .prioritize_parent = true};
if (get_node(child).children_end != end) if (get_node(child).children_end != end)
{ {
auto jump = push_inst(CompiledRegex::Jump); auto jump = push_inst(CompiledRegex::Jump);
@ -786,71 +785,66 @@ private:
break; break;
} }
case ParsedRegex::LookAhead: case ParsedRegex::LookAhead:
push_inst(ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead,
push_lookaround<RegexMode::Forward>(index, ignore_case));
break;
case ParsedRegex::NegativeLookAhead: case ParsedRegex::NegativeLookAhead:
push_inst(ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase push_inst(CompiledRegex::LookAround, {.lookaround={
: CompiledRegex::NegativeLookAhead, .index=push_lookaround<RegexMode::Forward>(index, ignore_case),
push_lookaround<RegexMode::Forward>(index, ignore_case)); .ahead=true,
.positive=node.op == ParsedRegex::LookAhead,
.ignore_case=ignore_case}});
break; break;
case ParsedRegex::LookBehind: case ParsedRegex::LookBehind:
push_inst(ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind,
push_lookaround<RegexMode::Backward>(index, ignore_case));
break;
case ParsedRegex::NegativeLookBehind: case ParsedRegex::NegativeLookBehind:
push_inst(ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase push_inst(CompiledRegex::LookAround, {.lookaround={
: CompiledRegex::NegativeLookBehind, .index=push_lookaround<RegexMode::Backward>(index, ignore_case),
push_lookaround<RegexMode::Backward>(index, ignore_case)); .ahead=false,
.positive=node.op == ParsedRegex::LookBehind,
.ignore_case=ignore_case}});
break; break;
case ParsedRegex::LineStart: case ParsedRegex::LineStart:
push_inst(CompiledRegex::LineStart); push_inst(CompiledRegex::LineAssertion, {.line_start=true});
break; break;
case ParsedRegex::LineEnd: case ParsedRegex::LineEnd:
push_inst(CompiledRegex::LineEnd); push_inst(CompiledRegex::LineAssertion, {.line_start=false});
break; break;
case ParsedRegex::WordBoundary: case ParsedRegex::WordBoundary:
push_inst(CompiledRegex::WordBoundary); push_inst(CompiledRegex::WordBoundary, {.word_boundary_positive=true});
break; break;
case ParsedRegex::NotWordBoundary: case ParsedRegex::NotWordBoundary:
push_inst(CompiledRegex::NotWordBoundary); push_inst(CompiledRegex::WordBoundary, {.word_boundary_positive=false});
break; break;
case ParsedRegex::SubjectBegin: case ParsedRegex::SubjectBegin:
push_inst(CompiledRegex::SubjectBegin); push_inst(CompiledRegex::SubjectAssertion, {.subject_begin=true});
break; break;
case ParsedRegex::SubjectEnd: case ParsedRegex::SubjectEnd:
push_inst(CompiledRegex::SubjectEnd); push_inst(CompiledRegex::SubjectAssertion, {.subject_begin=false});
break; break;
case ParsedRegex::ResetStart: case ParsedRegex::ResetStart:
push_inst(CompiledRegex::Save, 0); push_inst(CompiledRegex::Save, {.save_index=0});
break; break;
} }
for (auto& offset : goto_inner_end_offsets) for (auto& offset : goto_inner_end_offsets)
m_program.instructions[offset].param = m_program.instructions.size(); m_program.instructions[offset].param.jump_target = m_program.instructions.size();
if (save) if (save)
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 1 : 0)); push_inst(CompiledRegex::Save, {.save_index=int16_t(node.value * 2 + (forward ? 1 : 0))});
return start_pos; return start_pos;
} }
template<RegexMode direction> template<RegexMode direction>
uint32_t compile_node(ParsedRegex::NodeIndex index) OpIndex compile_node(ParsedRegex::NodeIndex index)
{ {
auto& node = get_node(index); auto& node = get_node(index);
const uint32_t start_pos = (uint32_t)m_program.instructions.size(); const OpIndex start_pos = (OpIndex)m_program.instructions.size();
Vector<uint32_t> goto_ends; Vector<OpIndex> goto_ends;
auto& quantifier = node.quantifier; auto& quantifier = node.quantifier;
if (quantifier.allows_none()) if (quantifier.allows_none())
{ {
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}});
: CompiledRegex::Split_PrioritizeChild);
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
} }
@ -860,41 +854,38 @@ private:
inner_pos = compile_node_inner<direction>(index); inner_pos = compile_node_inner<direction>(index);
if (quantifier.allows_infinite_repeat()) if (quantifier.allows_infinite_repeat())
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild push_inst(CompiledRegex::Split, {.split = {.target=inner_pos, .prioritize_parent=not quantifier.greedy}});
: CompiledRegex::Split_PrioritizeParent,
inner_pos);
// Write the node as an optional match for the min -> max counts // Write the node as an optional match for the min -> max counts
else for (int i = std::max((int16_t)1, quantifier.min); // STILL UGLY ! else for (int i = std::max((int16_t)1, quantifier.min); // STILL UGLY !
i < quantifier.max; ++i) i < quantifier.max; ++i)
{ {
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}});
: CompiledRegex::Split_PrioritizeChild);
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
compile_node_inner<direction>(index); compile_node_inner<direction>(index);
} }
for (auto offset : goto_ends) for (auto offset : goto_ends)
m_program.instructions[offset].param = m_program.instructions.size(); m_program.instructions[offset].param.split.target = m_program.instructions.size();
return start_pos; return start_pos;
} }
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0) OpIndex push_inst(CompiledRegex::Op op, CompiledRegex::Param param = {})
{ {
constexpr auto max_instructions = std::numeric_limits<int16_t>::max(); constexpr auto max_instructions = std::numeric_limits<OpIndex>::max();
const uint32_t res = m_program.instructions.size(); const auto res = m_program.instructions.size();
if (res > max_instructions) if (res >= max_instructions)
throw regex_error(format("regex compiled to more than {} instructions", max_instructions)); throw regex_error(format("regex compiled to more than {} instructions", max_instructions));
m_program.instructions.push_back({ op, false, 0, param }); m_program.instructions.push_back({ op, false, 0, param });
return res; return OpIndex(res);
} }
template<RegexMode direction> template<RegexMode direction>
uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case) int16_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case)
{ {
using Lookaround = CompiledRegex::Lookaround; using Lookaround = CompiledRegex::Lookaround;
const uint32_t res = m_program.lookarounds.size(); const int16_t res = m_program.lookarounds.size();
for (auto child : Children<direction>{m_parsed_regex, index}) for (auto child : Children<direction>{m_parsed_regex, index})
{ {
auto& character = get_node(child); auto& character = get_node(child);
@ -905,9 +896,9 @@ private:
m_program.lookarounds.push_back(Lookaround::AnyChar); m_program.lookarounds.push_back(Lookaround::AnyChar);
else if (character.op == ParsedRegex::AnyCharExceptNewLine) else if (character.op == ParsedRegex::AnyCharExceptNewLine)
m_program.lookarounds.push_back(Lookaround::AnyCharExceptNewLine); m_program.lookarounds.push_back(Lookaround::AnyCharExceptNewLine);
else if (character.op == ParsedRegex::Class) else if (character.op == ParsedRegex::CharClass)
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterClass) + character.value)); m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterClass) + character.value));
else if (character.op == ParsedRegex::CharacterType) else if (character.op == ParsedRegex::CharType)
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterType) | character.value)); m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterType) | character.value));
else else
kak_assert(false); kak_assert(false);
@ -951,7 +942,7 @@ private:
start_desc.map[cp] = true; start_desc.map[cp] = true;
} }
return node.quantifier.allows_none(); return node.quantifier.allows_none();
case ParsedRegex::Class: case ParsedRegex::CharClass:
{ {
auto& character_class = m_parsed_regex.character_classes[node.value]; auto& character_class = m_parsed_regex.character_classes[node.value];
if (character_class.ctypes == CharacterType::None and if (character_class.ctypes == CharacterType::None and
@ -978,7 +969,7 @@ private:
start_desc.map[CompiledRegex::StartDesc::other] = true; start_desc.map[CompiledRegex::StartDesc::other] = true;
return node.quantifier.allows_none(); return node.quantifier.allows_none();
} }
case ParsedRegex::CharacterType: case ParsedRegex::CharType:
{ {
const CharacterType ctype = (CharacterType)node.value; const CharacterType ctype = (CharacterType)node.value;
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp) for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
@ -1041,12 +1032,12 @@ private:
if (not (m_flags & RegexCompileFlags::Optimize)) if (not (m_flags & RegexCompileFlags::Optimize))
return; return;
auto is_jump = [](CompiledRegex::Op op) { return op >= CompiledRegex::Op::Jump and op <= CompiledRegex::Op::Split_PrioritizeChild; }; auto is_jump = [](CompiledRegex::Op op) { return op >= CompiledRegex::Op::Jump and op <= CompiledRegex::Op::Split; };
for (auto i = begin; i < end; ++i) for (auto i = begin; i < end; ++i)
{ {
auto& inst = m_program.instructions[i]; auto& inst = m_program.instructions[i];
if (is_jump(inst.op)) if (is_jump(inst.op))
m_program.instructions[inst.param].last_step = 0xffff; // tag as jump target m_program.instructions[inst.param.jump_target].last_step = 0xffff; // tag as jump target
} }
for (auto block_begin = begin; block_begin < end; ) for (auto block_begin = begin; block_begin < end; )
@ -1064,7 +1055,7 @@ private:
void peephole_optimize(size_t begin, size_t end) void peephole_optimize(size_t begin, size_t end)
{ {
// Move saves after all assertions on the same character // Move saves after all assertions on the same character
auto is_assertion = [](CompiledRegex::Op op) { return op >= CompiledRegex::LineStart; }; auto is_assertion = [](CompiledRegex::Op op) { return op >= CompiledRegex::LineAssertion; };
for (auto i = begin, j = begin + 1; j < end; ++i, ++j) for (auto i = begin, j = begin + 1; j < end; ++i, ++j)
{ {
if (m_program.instructions[i].op == CompiledRegex::Save and if (m_program.instructions[i].op == CompiledRegex::Save and
@ -1095,10 +1086,7 @@ String dump_regex(const CompiledRegex& program)
switch (inst.op) switch (inst.op)
{ {
case CompiledRegex::Literal: case CompiledRegex::Literal:
res += format("literal {}\n", inst.param); res += format("literal {}{}\n", inst.param.literal.ignore_case ? "(ignore case) " : "", inst.param.literal.codepoint);
break;
case CompiledRegex::Literal_IgnoreCase:
res += format("literal (ignore case) {}\n", inst.param);
break; break;
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
res += "any char\n"; res += "any char\n";
@ -1107,73 +1095,44 @@ String dump_regex(const CompiledRegex& program)
res += "anything but newline\n"; res += "anything but newline\n";
break; break;
case CompiledRegex::Jump: case CompiledRegex::Jump:
res += format("jump {}\n", inst.param); res += format("jump {}\n", inst.param.jump_target);
break; break;
case CompiledRegex::Split_PrioritizeParent: case CompiledRegex::Split:
case CompiledRegex::Split_PrioritizeChild:
{ {
res += format("split (prioritize {}) {}\n", res += format("split (prioritize {}) {}\n",
inst.op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child", (inst.param.split.prioritize_parent) ? "parent" : "child",
inst.param); inst.param.split.target);
break; break;
} }
case CompiledRegex::Save: case CompiledRegex::Save:
res += format("save {}\n", inst.param); res += format("save {}\n", inst.param.save_index);
break; break;
case CompiledRegex::Class: case CompiledRegex::CharClass:
res += format("class {}\n", inst.param); res += format("character class {}\n", inst.param.character_class_index);
break; break;
case CompiledRegex::CharacterType: case CompiledRegex::CharType:
res += format("character type {}\n", inst.param); res += format("character type {}\n", to_underlying(inst.param.character_type));
break; break;
case CompiledRegex::LineStart: case CompiledRegex::LineAssertion:
res += "line start\n"; res += format("line {}\n", inst.param.line_start ? "start" : "end");;
break; break;
case CompiledRegex::LineEnd: case CompiledRegex::SubjectAssertion:
res += "line end\n"; res += format("subject {}\n", inst.param.subject_begin ? "begin" : "end");
break; break;
case CompiledRegex::WordBoundary: case CompiledRegex::WordBoundary:
res += "word boundary\n"; res += format("{}word boundary\n", inst.param.word_boundary_positive ? "" : "not ");
break; break;
case CompiledRegex::NotWordBoundary: case CompiledRegex::LookAround:
res += "not word boundary\n";
break;
case CompiledRegex::SubjectBegin:
res += "subject begin\n";
break;
case CompiledRegex::SubjectEnd:
res += "subject end\n";
break;
case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead:
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
case CompiledRegex::LookAhead_IgnoreCase:
case CompiledRegex::NegativeLookAhead_IgnoreCase:
case CompiledRegex::LookBehind_IgnoreCase:
case CompiledRegex::NegativeLookBehind_IgnoreCase:
{ {
const char* name = nullptr; String name;
if (inst.op == CompiledRegex::LookAhead) name += inst.param.lookaround.positive ? "" : "negative ";
name = "look ahead"; name += "look ";
if (inst.op == CompiledRegex::NegativeLookAhead) name += inst.param.lookaround.ahead ? "ahead " : "behind ";
name = "negative look ahead"; if (inst.param.lookaround.ignore_case)
if (inst.op == CompiledRegex::LookBehind) name += " (ignore case)";
name = "look behind";
if (inst.op == CompiledRegex::NegativeLookBehind)
name = "negative look behind";
if (inst.op == CompiledRegex::LookAhead_IgnoreCase)
name = "look ahead (ignore case)";
if (inst.op == CompiledRegex::NegativeLookAhead_IgnoreCase)
name = "negative look ahead (ignore case)";
if (inst.op == CompiledRegex::LookBehind_IgnoreCase)
name = "look behind (ignore case)";
if (inst.op == CompiledRegex::NegativeLookBehind_IgnoreCase)
name = "negative look behind (ignore case)";
String str; String str;
for (auto it = program.lookarounds.begin() + inst.param; for (auto it = program.lookarounds.begin() + inst.param.lookaround.index;
*it != CompiledRegex::Lookaround::EndOfLookaround; ++it) *it != CompiledRegex::Lookaround::EndOfLookaround; ++it)
utf8::dump(std::back_inserter(str), to_underlying(*it)); utf8::dump(std::back_inserter(str), to_underlying(*it));
res += format("{} ({})\n", name, str); res += format("{} ({})\n", name, str);

View File

@ -50,29 +50,17 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
{ {
Match, Match,
Literal, Literal,
Literal_IgnoreCase,
AnyChar, AnyChar,
AnyCharExceptNewLine, AnyCharExceptNewLine,
Class, CharClass,
CharacterType, CharType,
Jump, Jump,
Split_PrioritizeParent, Split,
Split_PrioritizeChild,
Save, Save,
LineStart, LineAssertion,
LineEnd, SubjectAssertion,
WordBoundary, WordBoundary,
NotWordBoundary, LookAround,
SubjectBegin,
SubjectEnd,
LookAhead,
NegativeLookAhead,
LookBehind,
NegativeLookBehind,
LookAhead_IgnoreCase,
NegativeLookAhead_IgnoreCase,
LookBehind_IgnoreCase,
NegativeLookBehind_IgnoreCase,
}; };
enum class Lookaround : Codepoint enum class Lookaround : Codepoint
@ -86,15 +74,46 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
EndOfLookaround = static_cast<Codepoint>(-1) EndOfLookaround = static_cast<Codepoint>(-1)
}; };
union Param
{
struct Literal
{
uint32_t codepoint : 24;
bool ignore_case : 1;
} literal;
int16_t character_class_index;
CharacterType character_type;
int16_t jump_target;
int16_t save_index;
struct Split
{
int16_t target;
bool prioritize_parent : 1;
} split;
bool line_start;
bool subject_begin;
bool word_boundary_positive;
struct Lookaround
{
int16_t index;
bool ahead : 1;
bool positive : 1;
bool ignore_case : 1;
} lookaround;
};
static_assert(sizeof(Param) == 4);
struct Instruction struct Instruction
{ {
Op op; Op op;
// Those mutables are used during execution // Those mutables are used during execution
mutable bool scheduled; mutable bool scheduled;
mutable uint16_t last_step; mutable uint16_t last_step;
uint32_t param; Param param;
}; };
static_assert(sizeof(Instruction) == 8, ""); static_assert(sizeof(Instruction) == 8);
static constexpr uint32_t prioritize_parent{1 << 16};
explicit operator bool() const { return not instructions.empty(); } explicit operator bool() const { return not instructions.empty(); }
@ -343,110 +362,6 @@ private:
switch (inst.op) switch (inst.op)
{ {
case CompiledRegex::Literal:
if (pos != config.end and inst.param == codepoint(pos, config))
return consumed();
return failed();
case CompiledRegex::Literal_IgnoreCase:
if (pos != config.end and inst.param == to_lower(codepoint(pos, config)))
return consumed();
return failed();
case CompiledRegex::AnyChar:
return consumed();
case CompiledRegex::AnyCharExceptNewLine:
if (pos != config.end and codepoint(pos, config) != '\n')
return consumed();
return failed();
case CompiledRegex::Jump:
thread.inst = static_cast<int16_t>(inst.param);
break;
case CompiledRegex::Split_PrioritizeParent:
{
if (thread.saves >= 0)
++m_saves[thread.saves]->refcount;
m_threads.push_current({static_cast<int16_t>(inst.param), thread.saves});
break;
}
case CompiledRegex::Split_PrioritizeChild:
{
if (thread.saves >= 0)
++m_saves[thread.saves]->refcount;
m_threads.push_current({thread.inst, thread.saves});
thread.inst = static_cast<uint16_t>(inst.param);
break;
}
case CompiledRegex::Save:
{
if (mode & RegexMode::NoSaves)
break;
if (thread.saves < 0)
thread.saves = new_saves<false>(nullptr);
else if (m_saves[thread.saves]->refcount > 1)
{
--m_saves[thread.saves]->refcount;
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
}
m_saves[thread.saves]->pos[inst.param] = pos;
break;
}
case CompiledRegex::Class:
if (pos == config.end)
return failed();
return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ?
consumed() : failed();
case CompiledRegex::CharacterType:
if (pos == config.end)
return failed();
return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ?
consumed() : failed();
case CompiledRegex::LineStart:
if (not is_line_start(pos, config))
return failed();
break;
case CompiledRegex::LineEnd:
if (not is_line_end(pos, config))
return failed();
break;
case CompiledRegex::WordBoundary:
if (not is_word_boundary(pos, config))
return failed();
break;
case CompiledRegex::NotWordBoundary:
if (is_word_boundary(pos, config))
return failed();
break;
case CompiledRegex::SubjectBegin:
if (pos != config.subject_begin)
return failed();
break;
case CompiledRegex::SubjectEnd:
if (pos != config.subject_end)
return failed();
break;
case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead:
if (lookaround<true, false>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookAhead))
return failed();
break;
case CompiledRegex::LookAhead_IgnoreCase:
case CompiledRegex::NegativeLookAhead_IgnoreCase:
if (lookaround<true, true>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookAhead_IgnoreCase))
return failed();
break;
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
if (lookaround<false, false>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookBehind))
return failed();
break;
case CompiledRegex::LookBehind_IgnoreCase:
case CompiledRegex::NegativeLookBehind_IgnoreCase:
if (lookaround<false, true>(inst.param, pos, config) !=
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
return failed();
break;
case CompiledRegex::Match: case CompiledRegex::Match:
if ((pos != config.end and not (mode & RegexMode::Search)) or if ((pos != config.end and not (mode & RegexMode::Search)) or
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin)) (config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
@ -460,6 +375,71 @@ private:
while (not m_threads.current_is_empty()) while (not m_threads.current_is_empty())
release_saves(m_threads.pop_current().saves); release_saves(m_threads.pop_current().saves);
return; return;
case CompiledRegex::Literal:
if (pos != config.end and
inst.param.literal.codepoint == (inst.param.literal.ignore_case ? to_lower(codepoint(pos, config))
: codepoint(pos, config)))
return consumed();
return failed();
case CompiledRegex::AnyChar:
return consumed();
case CompiledRegex::AnyCharExceptNewLine:
if (pos != config.end and codepoint(pos, config) != '\n')
return consumed();
return failed();
case CompiledRegex::Jump:
thread.inst = inst.param.jump_target;
break;
case CompiledRegex::Split:
if (thread.saves >= 0)
++m_saves[thread.saves]->refcount;
if (inst.param.split.prioritize_parent)
m_threads.push_current({inst.param.split.target, thread.saves});
else
{
m_threads.push_current(thread);
thread.inst = inst.param.split.target;
}
break;
case CompiledRegex::Save:
if (mode & RegexMode::NoSaves)
break;
if (thread.saves < 0)
thread.saves = new_saves<false>(nullptr);
else if (m_saves[thread.saves]->refcount > 1)
{
--m_saves[thread.saves]->refcount;
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
}
m_saves[thread.saves]->pos[inst.param.save_index] = pos;
break;
case CompiledRegex::CharClass:
if (pos == config.end)
return failed();
return is_character_class(m_program.character_classes[inst.param.character_class_index], codepoint(pos, config)) ?
consumed() : failed();
case CompiledRegex::CharType:
if (pos == config.end)
return failed();
return is_ctype(inst.param.character_type, codepoint(pos, config)) ?
consumed() : failed();
case CompiledRegex::LineAssertion:
if (not (inst.param.line_start ? is_line_start(pos, config) : is_line_end(pos, config)))
return failed();
break;
case CompiledRegex::SubjectAssertion:
if (pos != (inst.param.subject_begin ? config.subject_begin : config.subject_end))
return failed();
break;
case CompiledRegex::WordBoundary:
if (is_word_boundary(pos, config) != inst.param.word_boundary_positive)
return failed();
break;
case CompiledRegex::LookAround:
if (lookaround(inst.param.lookaround, pos, config) != inst.param.lookaround.positive)
return failed();
break;
} }
} }
return failed(); return failed();
@ -544,25 +524,24 @@ private:
} }
} }
template<bool look_forward, bool ignore_case> bool lookaround(CompiledRegex::Param::Lookaround param, Iterator pos, const ExecConfig& config) const
bool lookaround(uint32_t index, Iterator pos, const ExecConfig& config) const
{ {
using Lookaround = CompiledRegex::Lookaround; using Lookaround = CompiledRegex::Lookaround;
if (not look_forward) if (not param.ahead)
{ {
if (pos == config.subject_begin) if (pos == config.subject_begin)
return m_program.lookarounds[index] == Lookaround::EndOfLookaround; return m_program.lookarounds[param.index] == Lookaround::EndOfLookaround;
utf8::to_previous(pos, config.subject_begin); utf8::to_previous(pos, config.subject_begin);
} }
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it) for (auto it = m_program.lookarounds.begin() + param.index; *it != Lookaround::EndOfLookaround; ++it)
{ {
if (look_forward and pos == config.subject_end) if (param.ahead and pos == config.subject_end)
return false; return false;
Codepoint cp = utf8::codepoint(pos, config.subject_end); Codepoint cp = utf8::codepoint(pos, config.subject_end);
if (ignore_case) if (param.ignore_case)
cp = to_lower(cp); cp = to_lower(cp);
const Lookaround op = *it; const Lookaround op = *it;
@ -588,11 +567,11 @@ private:
else if (static_cast<Codepoint>(op) != cp) else if (static_cast<Codepoint>(op) != cp)
return false; return false;
if (not look_forward and pos == config.subject_begin) if (not param.ahead and pos == config.subject_begin)
return *++it == Lookaround::EndOfLookaround; return *++it == Lookaround::EndOfLookaround;
look_forward ? utf8::to_next(pos, config.subject_end) param.ahead ? utf8::to_next(pos, config.subject_end)
: utf8::to_previous(pos, config.subject_begin); : utf8::to_previous(pos, config.subject_begin);
} }
return true; return true;
} }

View File

@ -31,7 +31,7 @@ String trim_indent(StringView str)
throw runtime_error("inconsistent indentation in the string"); throw runtime_error("inconsistent indentation in the string");
return line.substr(indent.length()); return line.substr(indent.length());
}), String{}, [](String& s, StringView l) -> decltype(auto) { return s += l; }); }), String{}, [](String s, StringView l) { return s += l; });
} }
String escape(StringView str, StringView characters, char escape) String escape(StringView str, StringView characters, char escape)