From 490c130e412583e21de15450bf746253369c235e Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Sun, 24 Sep 2017 23:14:35 +0900 Subject: [PATCH] Regex: Implement leftmost matching Ensure threads are maintained in "priority" order, by having two split instruction (prioritizing parent or child). --- src/regex_impl.cc | 71 +++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 151958f7..225b83cf 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -19,7 +19,8 @@ struct CompiledRegex CharRange, NegativeCharRange, Jump, - Split, + Split_PrioritizeParent, + Split_PrioritizeChild, Save, LineStart, LineEnd, @@ -378,7 +379,7 @@ CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedReg auto& children = node->children; kak_assert(children.size() == 2); - program.bytecode.push_back(CompiledRegex::Split); + program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); auto offset = alloc_offset(program); compile_node(program, parsed_regex, children[0]); @@ -429,7 +430,7 @@ CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& pa if (node->quantifier.allows_none()) { - program.bytecode.push_back(CompiledRegex::Split); + program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); goto_end_offsets.push_back(alloc_offset(program)); } @@ -440,14 +441,14 @@ CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& pa if (node->quantifier.allows_infinite_repeat()) { - program.bytecode.push_back(CompiledRegex::Split); + program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild); get_offset(program, alloc_offset(program)) = inner_pos; } // Write the node as an optional match for the min -> max counts else for (int i = std::max(1, node->quantifier.min); // STILL UGLY ! i < node->quantifier.max; ++i) { - program.bytecode.push_back(CompiledRegex::Split); + program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); goto_end_offsets.push_back(alloc_offset(program)); compile_node_inner(program, parsed_regex, node); } @@ -464,10 +465,10 @@ constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offs void write_search_prefix(CompiledRegex& program) { kak_assert(program.bytecode.empty()); - program.bytecode.push_back(CompiledRegex::Split); + program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild); get_offset(program, alloc_offset(program)) = prefix_size; program.bytecode.push_back(CompiledRegex::AnyChar); - program.bytecode.push_back(CompiledRegex::Split); + program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset); } @@ -507,9 +508,12 @@ void dump(const CompiledRegex& program) printf("jump %u\n", *reinterpret_cast(&*pos)); pos += sizeof(CompiledRegex::Offset); break; - case CompiledRegex::Split: + case CompiledRegex::Split_PrioritizeParent: + case CompiledRegex::Split_PrioritizeChild: { - printf("split %u\n", *reinterpret_cast(&*pos)); + printf("split (prioritize %s) %u\n", + op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child", + *reinterpret_cast(&*pos)); pos += sizeof(CompiledRegex::Offset); break; } @@ -594,13 +598,21 @@ struct ThreadedRegexVM thread.inst = inst; break; } - case CompiledRegex::Split: + case CompiledRegex::Split_PrioritizeParent: { - add_thread(*reinterpret_cast(thread.inst), thread.saves); + add_thread(thread_index+1, *reinterpret_cast(thread.inst), thread.saves); // thread is invalidated now, as we mutated the m_thread vector m_threads[thread_index].inst += sizeof(CompiledRegex::Offset); break; } + case CompiledRegex::Split_PrioritizeChild: + { + auto prog_start = m_program.bytecode.data(); + add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves); + // thread is invalidated now, as we mutated the m_thread vector + m_threads[thread_index].inst = prog_start + *reinterpret_cast(m_threads[thread_index].inst); + break; + } case CompiledRegex::Save: { const char index = *thread.inst++; @@ -659,16 +671,18 @@ struct ThreadedRegexVM return StepResult::Failed; break; case CompiledRegex::Match: + thread.inst = nullptr; return StepResult::Matched; } } return StepResult::Failed; } - bool exec(StringView data, bool match = true) + bool exec(StringView data, bool match = true, bool longest = false) { + bool found_match = false; m_threads.clear(); - add_thread(match ? RegexCompiler::prefix_size : 0, + add_thread(0, match ? RegexCompiler::prefix_size : 0, Vector(m_program.save_count, nullptr)); m_subject = data; @@ -682,7 +696,10 @@ struct ThreadedRegexVM if (res == StepResult::Matched) { m_captures = std::move(m_threads[i].saves); - return true; + found_match = true; + m_threads.resize(i); // remove this and lower priority threads + if (not longest) + return true; } else if (res == StepResult::Failed) m_threads[i].inst = nullptr; @@ -699,18 +716,21 @@ struct ThreadedRegexVM if (step(i) == StepResult::Matched) { m_captures = std::move(m_threads[i].saves); - return true; + found_match = true; + m_threads.resize(i); // remove this and lower priority threads + if (not longest) + return true; } } - return false; + return found_match; } - void add_thread(CompiledRegex::Offset pos, Vector saves) + void add_thread(int index, CompiledRegex::Offset pos, Vector saves) { const char* inst = m_program.bytecode.data() + pos; if (std::find_if(m_threads.begin(), m_threads.end(), [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end()) - m_threads.push_back({inst, std::move(saves)}); + m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)}); } bool is_line_start() const @@ -732,9 +752,10 @@ struct ThreadedRegexVM const CompiledRegex& m_program; Vector m_threads; - Vector m_captures; StringView m_subject; const char* m_pos; + + Vector m_captures; }; auto test_regex = UnitTest{[]{ @@ -829,14 +850,16 @@ auto test_regex = UnitTest{[]{ } { - StringView re = R"(f.*a)"; + StringView re = R"(f.*a(.*o))"; auto program = RegexCompiler::compile(re.begin(), re.end()); dump(program); ThreadedRegexVM vm{program}; - kak_assert(vm.exec("blahfoobarfoobaz", false)); - kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fooba"); // TODO: leftmost, longest - kak_assert(vm.exec("mais que fais la police", false)); - kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fa"); + kak_assert(vm.exec("blahfoobarfoobaz", false, true)); + kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "foobarfoo"); + kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "rfoo"); + kak_assert(vm.exec("mais que fais la police", false, true)); + kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fais la po"); + kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == " po"); } {