From 8566ae14a01a813a4b44846785675ae5d61b8eac Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Sat, 25 Jul 2020 14:07:57 +1000 Subject: [PATCH] Reduce the amount of Regex VM Instruction code Merge all lookarounds into the same instruction, merge splits, merge literal ignore case with literal... Besides reducing the amount of almost duplicated code, this improves performance by reducing pressure on the (often failing) branch target prediction for instruction dispatching by moving branches into the instruction code themselves where they are more likely to be well predicted. --- src/Makefile | 2 +- src/regex_impl.cc | 205 +++++++++++++++--------------------- src/regex_impl.hh | 247 ++++++++++++++++++++------------------------ src/string_utils.cc | 2 +- 4 files changed, 197 insertions(+), 259 deletions(-) diff --git a/src/Makefile b/src/Makefile index 705f5b7d..a8a34f96 100644 --- a/src/Makefile +++ b/src/Makefile @@ -81,7 +81,7 @@ else LDFLAGS += -rdynamic endif -CXXFLAGS += -pedantic -std=c++17 -g -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-address +CXXFLAGS += -pedantic -std=c++2a -g -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-address compiler := $(shell $(CXX) --version) ifneq (,$(findstring clang,$(compiler))) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 16b22681..ff65cdbe 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -27,8 +27,8 @@ struct ParsedRegex Literal, AnyChar, AnyCharExceptNewLine, - Class, - CharacterType, + CharClass, + CharType, Sequence, Alternation, LineStart, @@ -73,7 +73,7 @@ struct ParsedRegex }; }; - using NodeIndex = uint16_t; + using NodeIndex = int16_t; struct [[gnu::packed]] Node { Op op; @@ -397,7 +397,7 @@ private: // CharacterClassEscape auto class_it = find_if(character_class_escapes, [cp](auto& c) { return c.cp == cp; }); if (class_it != std::end(character_class_escapes)) - return new_node(ParsedRegex::CharacterType, (Codepoint)class_it->ctype); + return new_node(ParsedRegex::CharType, (Codepoint)class_it->ctype); // CharacterEscape for (auto& control : control_escapes) @@ -546,12 +546,12 @@ private: if (character_class.ctypes != CharacterType::None and not character_class.negative and character_class.ranges.empty()) - return new_node(ParsedRegex::CharacterType, (Codepoint)character_class.ctypes); + return new_node(ParsedRegex::CharType, (Codepoint)character_class.ctypes); auto class_id = m_parsed_regex.character_classes.size(); m_parsed_regex.character_classes.push_back(std::move(character_class)); - return new_node(ParsedRegex::Class, class_id); + return new_node(ParsedRegex::CharClass, class_id); } ParsedRegex::Quantifier quantifier() @@ -638,8 +638,8 @@ private: for (auto child_index : Children<>{m_parsed_regex, index}) { auto& child = get_node(child_index); - if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and - child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and + if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::CharClass and + child.op != ParsedRegex::CharType and child.op != ParsedRegex::AnyChar and child.op != ParsedRegex::AnyCharExceptNewLine) parse_error("Lookaround can only contain literals, any chars or character classes"); if (child.op == ParsedRegex::Literal and @@ -684,6 +684,8 @@ constexpr RegexParser::ControlEscape RegexParser::control_escapes[]; struct RegexCompiler { + using OpIndex = int16_t; + RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags) : m_flags(flags), m_parsed_regex{parsed_regex} { @@ -722,7 +724,7 @@ struct RegexCompiler private: template - uint32_t compile_node_inner(ParsedRegex::NodeIndex index) + OpIndex compile_node_inner(ParsedRegex::NodeIndex index) { auto& node = get_node(index); @@ -733,16 +735,13 @@ private: (node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs))); constexpr bool forward = direction == RegexMode::Forward; if (save) - push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1)); + push_inst(CompiledRegex::Save, {.save_index = int16_t(node.value * 2 + (forward ? 0 : 1))}); Vector goto_inner_end_offsets; switch (node.op) { case ParsedRegex::Literal: - if (ignore_case) - push_inst(CompiledRegex::Literal_IgnoreCase, to_lower(node.value)); - else - push_inst(CompiledRegex::Literal, node.value); + push_inst(CompiledRegex::Literal, {.literal={.codepoint=ignore_case ? to_lower(node.value) : node.value, .ignore_case=ignore_case}}); break; case ParsedRegex::AnyChar: push_inst(CompiledRegex::AnyChar); @@ -750,11 +749,11 @@ private: case ParsedRegex::AnyCharExceptNewLine: push_inst(CompiledRegex::AnyCharExceptNewLine); break; - case ParsedRegex::Class: - push_inst(CompiledRegex::Class, node.value); + case ParsedRegex::CharClass: + push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)}); break; - case ParsedRegex::CharacterType: - push_inst(CompiledRegex::CharacterType, node.value); + case ParsedRegex::CharType: + push_inst(CompiledRegex::CharType, {.character_type=CharacterType{(unsigned char)node.value}}); break; case ParsedRegex::Sequence: { @@ -768,7 +767,7 @@ private: for (auto child : Children<>{m_parsed_regex, index}) { if (child != index+1) - push_inst(CompiledRegex::Split_PrioritizeParent); + push_inst(CompiledRegex::Split); } const auto end = node.children_end; @@ -776,7 +775,7 @@ private: { auto node = compile_node(child); if (child != index+1) - m_program.instructions[split_pos++].param = node; + m_program.instructions[split_pos++].param.split = CompiledRegex::Param::Split{.target = node, .prioritize_parent = true}; if (get_node(child).children_end != end) { auto jump = push_inst(CompiledRegex::Jump); @@ -786,71 +785,66 @@ private: break; } case ParsedRegex::LookAhead: - push_inst(ignore_case ? CompiledRegex::LookAhead_IgnoreCase - : CompiledRegex::LookAhead, - push_lookaround(index, ignore_case)); - break; case ParsedRegex::NegativeLookAhead: - push_inst(ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase - : CompiledRegex::NegativeLookAhead, - push_lookaround(index, ignore_case)); + push_inst(CompiledRegex::LookAround, {.lookaround={ + .index=push_lookaround(index, ignore_case), + .ahead=true, + .positive=node.op == ParsedRegex::LookAhead, + .ignore_case=ignore_case}}); break; case ParsedRegex::LookBehind: - push_inst(ignore_case ? CompiledRegex::LookBehind_IgnoreCase - : CompiledRegex::LookBehind, - push_lookaround(index, ignore_case)); - break; case ParsedRegex::NegativeLookBehind: - push_inst(ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase - : CompiledRegex::NegativeLookBehind, - push_lookaround(index, ignore_case)); + push_inst(CompiledRegex::LookAround, {.lookaround={ + .index=push_lookaround(index, ignore_case), + .ahead=false, + .positive=node.op == ParsedRegex::LookBehind, + .ignore_case=ignore_case}}); break; case ParsedRegex::LineStart: - push_inst(CompiledRegex::LineStart); + push_inst(CompiledRegex::LineAssertion, {.line_start=true}); break; case ParsedRegex::LineEnd: - push_inst(CompiledRegex::LineEnd); + push_inst(CompiledRegex::LineAssertion, {.line_start=false}); break; case ParsedRegex::WordBoundary: - push_inst(CompiledRegex::WordBoundary); + push_inst(CompiledRegex::WordBoundary, {.word_boundary_positive=true}); break; case ParsedRegex::NotWordBoundary: - push_inst(CompiledRegex::NotWordBoundary); + push_inst(CompiledRegex::WordBoundary, {.word_boundary_positive=false}); break; case ParsedRegex::SubjectBegin: - push_inst(CompiledRegex::SubjectBegin); + push_inst(CompiledRegex::SubjectAssertion, {.subject_begin=true}); break; case ParsedRegex::SubjectEnd: - push_inst(CompiledRegex::SubjectEnd); + push_inst(CompiledRegex::SubjectAssertion, {.subject_begin=false}); break; case ParsedRegex::ResetStart: - push_inst(CompiledRegex::Save, 0); + push_inst(CompiledRegex::Save, {.save_index=0}); break; } for (auto& offset : goto_inner_end_offsets) - m_program.instructions[offset].param = m_program.instructions.size(); + m_program.instructions[offset].param.jump_target = m_program.instructions.size(); if (save) - push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 1 : 0)); + push_inst(CompiledRegex::Save, {.save_index=int16_t(node.value * 2 + (forward ? 1 : 0))}); return start_pos; } template - uint32_t compile_node(ParsedRegex::NodeIndex index) + OpIndex compile_node(ParsedRegex::NodeIndex index) { auto& node = get_node(index); - const uint32_t start_pos = (uint32_t)m_program.instructions.size(); - Vector goto_ends; + const OpIndex start_pos = (OpIndex)m_program.instructions.size(); + Vector goto_ends; auto& quantifier = node.quantifier; if (quantifier.allows_none()) { - auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent - : CompiledRegex::Split_PrioritizeChild); + auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}}); goto_ends.push_back(split_pos); } @@ -860,41 +854,38 @@ private: inner_pos = compile_node_inner(index); if (quantifier.allows_infinite_repeat()) - push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild - : CompiledRegex::Split_PrioritizeParent, - inner_pos); + push_inst(CompiledRegex::Split, {.split = {.target=inner_pos, .prioritize_parent=not quantifier.greedy}}); // Write the node as an optional match for the min -> max counts else for (int i = std::max((int16_t)1, quantifier.min); // STILL UGLY ! i < quantifier.max; ++i) { - auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent - : CompiledRegex::Split_PrioritizeChild); + auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}}); goto_ends.push_back(split_pos); compile_node_inner(index); } for (auto offset : goto_ends) - m_program.instructions[offset].param = m_program.instructions.size(); + m_program.instructions[offset].param.split.target = m_program.instructions.size(); return start_pos; } - uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0) + OpIndex push_inst(CompiledRegex::Op op, CompiledRegex::Param param = {}) { - constexpr auto max_instructions = std::numeric_limits::max(); - const uint32_t res = m_program.instructions.size(); - if (res > max_instructions) + constexpr auto max_instructions = std::numeric_limits::max(); + const auto res = m_program.instructions.size(); + if (res >= max_instructions) throw regex_error(format("regex compiled to more than {} instructions", max_instructions)); m_program.instructions.push_back({ op, false, 0, param }); - return res; + return OpIndex(res); } template - uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case) + int16_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case) { using Lookaround = CompiledRegex::Lookaround; - const uint32_t res = m_program.lookarounds.size(); + const int16_t res = m_program.lookarounds.size(); for (auto child : Children{m_parsed_regex, index}) { auto& character = get_node(child); @@ -905,9 +896,9 @@ private: m_program.lookarounds.push_back(Lookaround::AnyChar); else if (character.op == ParsedRegex::AnyCharExceptNewLine) m_program.lookarounds.push_back(Lookaround::AnyCharExceptNewLine); - else if (character.op == ParsedRegex::Class) + else if (character.op == ParsedRegex::CharClass) m_program.lookarounds.push_back(static_cast(to_underlying(Lookaround::CharacterClass) + character.value)); - else if (character.op == ParsedRegex::CharacterType) + else if (character.op == ParsedRegex::CharType) m_program.lookarounds.push_back(static_cast(to_underlying(Lookaround::CharacterType) | character.value)); else kak_assert(false); @@ -951,7 +942,7 @@ private: start_desc.map[cp] = true; } return node.quantifier.allows_none(); - case ParsedRegex::Class: + case ParsedRegex::CharClass: { auto& character_class = m_parsed_regex.character_classes[node.value]; if (character_class.ctypes == CharacterType::None and @@ -978,7 +969,7 @@ private: start_desc.map[CompiledRegex::StartDesc::other] = true; return node.quantifier.allows_none(); } - case ParsedRegex::CharacterType: + case ParsedRegex::CharType: { const CharacterType ctype = (CharacterType)node.value; for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp) @@ -1041,12 +1032,12 @@ private: if (not (m_flags & RegexCompileFlags::Optimize)) return; - auto is_jump = [](CompiledRegex::Op op) { return op >= CompiledRegex::Op::Jump and op <= CompiledRegex::Op::Split_PrioritizeChild; }; + auto is_jump = [](CompiledRegex::Op op) { return op >= CompiledRegex::Op::Jump and op <= CompiledRegex::Op::Split; }; for (auto i = begin; i < end; ++i) { auto& inst = m_program.instructions[i]; if (is_jump(inst.op)) - m_program.instructions[inst.param].last_step = 0xffff; // tag as jump target + m_program.instructions[inst.param.jump_target].last_step = 0xffff; // tag as jump target } for (auto block_begin = begin; block_begin < end; ) @@ -1064,7 +1055,7 @@ private: void peephole_optimize(size_t begin, size_t end) { // Move saves after all assertions on the same character - auto is_assertion = [](CompiledRegex::Op op) { return op >= CompiledRegex::LineStart; }; + auto is_assertion = [](CompiledRegex::Op op) { return op >= CompiledRegex::LineAssertion; }; for (auto i = begin, j = begin + 1; j < end; ++i, ++j) { if (m_program.instructions[i].op == CompiledRegex::Save and @@ -1095,10 +1086,7 @@ String dump_regex(const CompiledRegex& program) switch (inst.op) { case CompiledRegex::Literal: - res += format("literal {}\n", inst.param); - break; - case CompiledRegex::Literal_IgnoreCase: - res += format("literal (ignore case) {}\n", inst.param); + res += format("literal {}{}\n", inst.param.literal.ignore_case ? "(ignore case) " : "", inst.param.literal.codepoint); break; case CompiledRegex::AnyChar: res += "any char\n"; @@ -1107,73 +1095,44 @@ String dump_regex(const CompiledRegex& program) res += "anything but newline\n"; break; case CompiledRegex::Jump: - res += format("jump {}\n", inst.param); + res += format("jump {}\n", inst.param.jump_target); break; - case CompiledRegex::Split_PrioritizeParent: - case CompiledRegex::Split_PrioritizeChild: + case CompiledRegex::Split: { res += format("split (prioritize {}) {}\n", - inst.op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child", - inst.param); + (inst.param.split.prioritize_parent) ? "parent" : "child", + inst.param.split.target); break; } case CompiledRegex::Save: - res += format("save {}\n", inst.param); + res += format("save {}\n", inst.param.save_index); break; - case CompiledRegex::Class: - res += format("class {}\n", inst.param); + case CompiledRegex::CharClass: + res += format("character class {}\n", inst.param.character_class_index); break; - case CompiledRegex::CharacterType: - res += format("character type {}\n", inst.param); + case CompiledRegex::CharType: + res += format("character type {}\n", to_underlying(inst.param.character_type)); break; - case CompiledRegex::LineStart: - res += "line start\n"; + case CompiledRegex::LineAssertion: + res += format("line {}\n", inst.param.line_start ? "start" : "end");; break; - case CompiledRegex::LineEnd: - res += "line end\n"; + case CompiledRegex::SubjectAssertion: + res += format("subject {}\n", inst.param.subject_begin ? "begin" : "end"); break; case CompiledRegex::WordBoundary: - res += "word boundary\n"; + res += format("{}word boundary\n", inst.param.word_boundary_positive ? "" : "not "); break; - case CompiledRegex::NotWordBoundary: - res += "not word boundary\n"; - break; - case CompiledRegex::SubjectBegin: - res += "subject begin\n"; - break; - case CompiledRegex::SubjectEnd: - res += "subject end\n"; - break; - case CompiledRegex::LookAhead: - case CompiledRegex::NegativeLookAhead: - case CompiledRegex::LookBehind: - case CompiledRegex::NegativeLookBehind: - case CompiledRegex::LookAhead_IgnoreCase: - case CompiledRegex::NegativeLookAhead_IgnoreCase: - case CompiledRegex::LookBehind_IgnoreCase: - case CompiledRegex::NegativeLookBehind_IgnoreCase: + case CompiledRegex::LookAround: { - const char* name = nullptr; - if (inst.op == CompiledRegex::LookAhead) - name = "look ahead"; - if (inst.op == CompiledRegex::NegativeLookAhead) - name = "negative look ahead"; - if (inst.op == CompiledRegex::LookBehind) - name = "look behind"; - if (inst.op == CompiledRegex::NegativeLookBehind) - name = "negative look behind"; - - if (inst.op == CompiledRegex::LookAhead_IgnoreCase) - name = "look ahead (ignore case)"; - if (inst.op == CompiledRegex::NegativeLookAhead_IgnoreCase) - name = "negative look ahead (ignore case)"; - if (inst.op == CompiledRegex::LookBehind_IgnoreCase) - name = "look behind (ignore case)"; - if (inst.op == CompiledRegex::NegativeLookBehind_IgnoreCase) - name = "negative look behind (ignore case)"; + String name; + name += inst.param.lookaround.positive ? "" : "negative "; + name += "look "; + name += inst.param.lookaround.ahead ? "ahead " : "behind "; + if (inst.param.lookaround.ignore_case) + name += " (ignore case)"; String str; - for (auto it = program.lookarounds.begin() + inst.param; + for (auto it = program.lookarounds.begin() + inst.param.lookaround.index; *it != CompiledRegex::Lookaround::EndOfLookaround; ++it) utf8::dump(std::back_inserter(str), to_underlying(*it)); res += format("{} ({})\n", name, str); diff --git a/src/regex_impl.hh b/src/regex_impl.hh index d73c7aa6..db64c758 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -50,29 +50,17 @@ struct CompiledRegex : RefCountable, UseMemoryDomain { Match, Literal, - Literal_IgnoreCase, AnyChar, AnyCharExceptNewLine, - Class, - CharacterType, + CharClass, + CharType, Jump, - Split_PrioritizeParent, - Split_PrioritizeChild, + Split, Save, - LineStart, - LineEnd, + LineAssertion, + SubjectAssertion, WordBoundary, - NotWordBoundary, - SubjectBegin, - SubjectEnd, - LookAhead, - NegativeLookAhead, - LookBehind, - NegativeLookBehind, - LookAhead_IgnoreCase, - NegativeLookAhead_IgnoreCase, - LookBehind_IgnoreCase, - NegativeLookBehind_IgnoreCase, + LookAround, }; enum class Lookaround : Codepoint @@ -86,15 +74,46 @@ struct CompiledRegex : RefCountable, UseMemoryDomain EndOfLookaround = static_cast(-1) }; + union Param + { + struct Literal + { + uint32_t codepoint : 24; + bool ignore_case : 1; + } literal; + int16_t character_class_index; + CharacterType character_type; + int16_t jump_target; + int16_t save_index; + struct Split + { + int16_t target; + bool prioritize_parent : 1; + } split; + bool line_start; + bool subject_begin; + bool word_boundary_positive; + struct Lookaround + { + int16_t index; + bool ahead : 1; + bool positive : 1; + bool ignore_case : 1; + } lookaround; + }; + static_assert(sizeof(Param) == 4); + struct Instruction { Op op; // Those mutables are used during execution mutable bool scheduled; mutable uint16_t last_step; - uint32_t param; + Param param; }; - static_assert(sizeof(Instruction) == 8, ""); + static_assert(sizeof(Instruction) == 8); + + static constexpr uint32_t prioritize_parent{1 << 16}; explicit operator bool() const { return not instructions.empty(); } @@ -343,110 +362,6 @@ private: switch (inst.op) { - case CompiledRegex::Literal: - if (pos != config.end and inst.param == codepoint(pos, config)) - return consumed(); - return failed(); - case CompiledRegex::Literal_IgnoreCase: - if (pos != config.end and inst.param == to_lower(codepoint(pos, config))) - return consumed(); - return failed(); - case CompiledRegex::AnyChar: - return consumed(); - case CompiledRegex::AnyCharExceptNewLine: - if (pos != config.end and codepoint(pos, config) != '\n') - return consumed(); - return failed(); - case CompiledRegex::Jump: - thread.inst = static_cast(inst.param); - break; - case CompiledRegex::Split_PrioritizeParent: - { - if (thread.saves >= 0) - ++m_saves[thread.saves]->refcount; - m_threads.push_current({static_cast(inst.param), thread.saves}); - break; - } - case CompiledRegex::Split_PrioritizeChild: - { - if (thread.saves >= 0) - ++m_saves[thread.saves]->refcount; - m_threads.push_current({thread.inst, thread.saves}); - thread.inst = static_cast(inst.param); - break; - } - case CompiledRegex::Save: - { - if (mode & RegexMode::NoSaves) - break; - if (thread.saves < 0) - thread.saves = new_saves(nullptr); - else if (m_saves[thread.saves]->refcount > 1) - { - --m_saves[thread.saves]->refcount; - thread.saves = new_saves(m_saves[thread.saves]->pos); - } - m_saves[thread.saves]->pos[inst.param] = pos; - break; - } - case CompiledRegex::Class: - if (pos == config.end) - return failed(); - return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ? - consumed() : failed(); - case CompiledRegex::CharacterType: - if (pos == config.end) - return failed(); - return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ? - consumed() : failed(); - case CompiledRegex::LineStart: - if (not is_line_start(pos, config)) - return failed(); - break; - case CompiledRegex::LineEnd: - if (not is_line_end(pos, config)) - return failed(); - break; - case CompiledRegex::WordBoundary: - if (not is_word_boundary(pos, config)) - return failed(); - break; - case CompiledRegex::NotWordBoundary: - if (is_word_boundary(pos, config)) - return failed(); - break; - case CompiledRegex::SubjectBegin: - if (pos != config.subject_begin) - return failed(); - break; - case CompiledRegex::SubjectEnd: - if (pos != config.subject_end) - return failed(); - break; - case CompiledRegex::LookAhead: - case CompiledRegex::NegativeLookAhead: - if (lookaround(inst.param, pos, config) != - (inst.op == CompiledRegex::LookAhead)) - return failed(); - break; - case CompiledRegex::LookAhead_IgnoreCase: - case CompiledRegex::NegativeLookAhead_IgnoreCase: - if (lookaround(inst.param, pos, config) != - (inst.op == CompiledRegex::LookAhead_IgnoreCase)) - return failed(); - break; - case CompiledRegex::LookBehind: - case CompiledRegex::NegativeLookBehind: - if (lookaround(inst.param, pos, config) != - (inst.op == CompiledRegex::LookBehind)) - return failed(); - break; - case CompiledRegex::LookBehind_IgnoreCase: - case CompiledRegex::NegativeLookBehind_IgnoreCase: - if (lookaround(inst.param, pos, config) != - (inst.op == CompiledRegex::LookBehind_IgnoreCase)) - return failed(); - break; case CompiledRegex::Match: if ((pos != config.end and not (mode & RegexMode::Search)) or (config.flags & RegexExecFlags::NotInitialNull and pos == config.begin)) @@ -460,6 +375,71 @@ private: while (not m_threads.current_is_empty()) release_saves(m_threads.pop_current().saves); return; + case CompiledRegex::Literal: + if (pos != config.end and + inst.param.literal.codepoint == (inst.param.literal.ignore_case ? to_lower(codepoint(pos, config)) + : codepoint(pos, config))) + return consumed(); + return failed(); + case CompiledRegex::AnyChar: + return consumed(); + case CompiledRegex::AnyCharExceptNewLine: + if (pos != config.end and codepoint(pos, config) != '\n') + return consumed(); + return failed(); + case CompiledRegex::Jump: + thread.inst = inst.param.jump_target; + break; + case CompiledRegex::Split: + if (thread.saves >= 0) + ++m_saves[thread.saves]->refcount; + + if (inst.param.split.prioritize_parent) + m_threads.push_current({inst.param.split.target, thread.saves}); + else + { + m_threads.push_current(thread); + thread.inst = inst.param.split.target; + } + break; + case CompiledRegex::Save: + if (mode & RegexMode::NoSaves) + break; + if (thread.saves < 0) + thread.saves = new_saves(nullptr); + else if (m_saves[thread.saves]->refcount > 1) + { + --m_saves[thread.saves]->refcount; + thread.saves = new_saves(m_saves[thread.saves]->pos); + } + m_saves[thread.saves]->pos[inst.param.save_index] = pos; + break; + case CompiledRegex::CharClass: + if (pos == config.end) + return failed(); + return is_character_class(m_program.character_classes[inst.param.character_class_index], codepoint(pos, config)) ? + consumed() : failed(); + case CompiledRegex::CharType: + if (pos == config.end) + return failed(); + return is_ctype(inst.param.character_type, codepoint(pos, config)) ? + consumed() : failed(); + case CompiledRegex::LineAssertion: + if (not (inst.param.line_start ? is_line_start(pos, config) : is_line_end(pos, config))) + return failed(); + break; + case CompiledRegex::SubjectAssertion: + if (pos != (inst.param.subject_begin ? config.subject_begin : config.subject_end)) + return failed(); + break; + case CompiledRegex::WordBoundary: + if (is_word_boundary(pos, config) != inst.param.word_boundary_positive) + return failed(); + break; + case CompiledRegex::LookAround: + if (lookaround(inst.param.lookaround, pos, config) != inst.param.lookaround.positive) + return failed(); + break; } } return failed(); @@ -544,25 +524,24 @@ private: } } - template - bool lookaround(uint32_t index, Iterator pos, const ExecConfig& config) const + bool lookaround(CompiledRegex::Param::Lookaround param, Iterator pos, const ExecConfig& config) const { using Lookaround = CompiledRegex::Lookaround; - if (not look_forward) + if (not param.ahead) { if (pos == config.subject_begin) - return m_program.lookarounds[index] == Lookaround::EndOfLookaround; + return m_program.lookarounds[param.index] == Lookaround::EndOfLookaround; utf8::to_previous(pos, config.subject_begin); } - for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it) + for (auto it = m_program.lookarounds.begin() + param.index; *it != Lookaround::EndOfLookaround; ++it) { - if (look_forward and pos == config.subject_end) + if (param.ahead and pos == config.subject_end) return false; Codepoint cp = utf8::codepoint(pos, config.subject_end); - if (ignore_case) + if (param.ignore_case) cp = to_lower(cp); const Lookaround op = *it; @@ -588,11 +567,11 @@ private: else if (static_cast(op) != cp) return false; - if (not look_forward and pos == config.subject_begin) + if (not param.ahead and pos == config.subject_begin) return *++it == Lookaround::EndOfLookaround; - look_forward ? utf8::to_next(pos, config.subject_end) - : utf8::to_previous(pos, config.subject_begin); + param.ahead ? utf8::to_next(pos, config.subject_end) + : utf8::to_previous(pos, config.subject_begin); } return true; } diff --git a/src/string_utils.cc b/src/string_utils.cc index 459bea0b..a1c1c38d 100644 --- a/src/string_utils.cc +++ b/src/string_utils.cc @@ -31,7 +31,7 @@ String trim_indent(StringView str) throw runtime_error("inconsistent indentation in the string"); return line.substr(indent.length()); - }), String{}, [](String& s, StringView l) -> decltype(auto) { return s += l; }); + }), String{}, [](String s, StringView l) { return s += l; }); } String escape(StringView str, StringView characters, char escape)