Regex: Implement leftmost matching
Ensure threads are maintained in "priority" order by having two split instructions (one prioritizing the parent thread, the other prioritizing the child).
This commit is contained in:
parent
182b70cb0a
commit
490c130e41
|
@ -19,7 +19,8 @@ struct CompiledRegex
|
||||||
CharRange,
|
CharRange,
|
||||||
NegativeCharRange,
|
NegativeCharRange,
|
||||||
Jump,
|
Jump,
|
||||||
Split,
|
Split_PrioritizeParent,
|
||||||
|
Split_PrioritizeChild,
|
||||||
Save,
|
Save,
|
||||||
LineStart,
|
LineStart,
|
||||||
LineEnd,
|
LineEnd,
|
||||||
|
@ -378,7 +379,7 @@ CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedReg
|
||||||
auto& children = node->children;
|
auto& children = node->children;
|
||||||
kak_assert(children.size() == 2);
|
kak_assert(children.size() == 2);
|
||||||
|
|
||||||
program.bytecode.push_back(CompiledRegex::Split);
|
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
|
||||||
auto offset = alloc_offset(program);
|
auto offset = alloc_offset(program);
|
||||||
|
|
||||||
compile_node(program, parsed_regex, children[0]);
|
compile_node(program, parsed_regex, children[0]);
|
||||||
|
@ -429,7 +430,7 @@ CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& pa
|
||||||
|
|
||||||
if (node->quantifier.allows_none())
|
if (node->quantifier.allows_none())
|
||||||
{
|
{
|
||||||
program.bytecode.push_back(CompiledRegex::Split);
|
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
|
||||||
goto_end_offsets.push_back(alloc_offset(program));
|
goto_end_offsets.push_back(alloc_offset(program));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -440,14 +441,14 @@ CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& pa
|
||||||
|
|
||||||
if (node->quantifier.allows_infinite_repeat())
|
if (node->quantifier.allows_infinite_repeat())
|
||||||
{
|
{
|
||||||
program.bytecode.push_back(CompiledRegex::Split);
|
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild);
|
||||||
get_offset(program, alloc_offset(program)) = inner_pos;
|
get_offset(program, alloc_offset(program)) = inner_pos;
|
||||||
}
|
}
|
||||||
// Write the node as an optional match for the min -> max counts
|
// Write the node as an optional match for the min -> max counts
|
||||||
else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
|
else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
|
||||||
i < node->quantifier.max; ++i)
|
i < node->quantifier.max; ++i)
|
||||||
{
|
{
|
||||||
program.bytecode.push_back(CompiledRegex::Split);
|
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
|
||||||
goto_end_offsets.push_back(alloc_offset(program));
|
goto_end_offsets.push_back(alloc_offset(program));
|
||||||
compile_node_inner(program, parsed_regex, node);
|
compile_node_inner(program, parsed_regex, node);
|
||||||
}
|
}
|
||||||
|
@ -464,10 +465,10 @@ constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offs
|
||||||
void write_search_prefix(CompiledRegex& program)
|
void write_search_prefix(CompiledRegex& program)
|
||||||
{
|
{
|
||||||
kak_assert(program.bytecode.empty());
|
kak_assert(program.bytecode.empty());
|
||||||
program.bytecode.push_back(CompiledRegex::Split);
|
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild);
|
||||||
get_offset(program, alloc_offset(program)) = prefix_size;
|
get_offset(program, alloc_offset(program)) = prefix_size;
|
||||||
program.bytecode.push_back(CompiledRegex::AnyChar);
|
program.bytecode.push_back(CompiledRegex::AnyChar);
|
||||||
program.bytecode.push_back(CompiledRegex::Split);
|
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
|
||||||
get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset);
|
get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -507,9 +508,12 @@ void dump(const CompiledRegex& program)
|
||||||
printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||||
pos += sizeof(CompiledRegex::Offset);
|
pos += sizeof(CompiledRegex::Offset);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Split:
|
case CompiledRegex::Split_PrioritizeParent:
|
||||||
|
case CompiledRegex::Split_PrioritizeChild:
|
||||||
{
|
{
|
||||||
printf("split %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
printf("split (prioritize %s) %u\n",
|
||||||
|
op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
|
||||||
|
*reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||||
pos += sizeof(CompiledRegex::Offset);
|
pos += sizeof(CompiledRegex::Offset);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -594,13 +598,21 @@ struct ThreadedRegexVM
|
||||||
thread.inst = inst;
|
thread.inst = inst;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Split:
|
case CompiledRegex::Split_PrioritizeParent:
|
||||||
{
|
{
|
||||||
add_thread(*reinterpret_cast<const CompiledRegex::Offset*>(thread.inst), thread.saves);
|
add_thread(thread_index+1, *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst), thread.saves);
|
||||||
// thread is invalidated now, as we mutated the m_thread vector
|
// thread is invalidated now, as we mutated the m_thread vector
|
||||||
m_threads[thread_index].inst += sizeof(CompiledRegex::Offset);
|
m_threads[thread_index].inst += sizeof(CompiledRegex::Offset);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case CompiledRegex::Split_PrioritizeChild:
|
||||||
|
{
|
||||||
|
auto prog_start = m_program.bytecode.data();
|
||||||
|
add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves);
|
||||||
|
// thread is invalidated now, as we mutated the m_thread vector
|
||||||
|
m_threads[thread_index].inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(m_threads[thread_index].inst);
|
||||||
|
break;
|
||||||
|
}
|
||||||
case CompiledRegex::Save:
|
case CompiledRegex::Save:
|
||||||
{
|
{
|
||||||
const char index = *thread.inst++;
|
const char index = *thread.inst++;
|
||||||
|
@ -659,16 +671,18 @@ struct ThreadedRegexVM
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
|
thread.inst = nullptr;
|
||||||
return StepResult::Matched;
|
return StepResult::Matched;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec(StringView data, bool match = true)
|
bool exec(StringView data, bool match = true, bool longest = false)
|
||||||
{
|
{
|
||||||
|
bool found_match = false;
|
||||||
m_threads.clear();
|
m_threads.clear();
|
||||||
add_thread(match ? RegexCompiler::prefix_size : 0,
|
add_thread(0, match ? RegexCompiler::prefix_size : 0,
|
||||||
Vector<const char*>(m_program.save_count, nullptr));
|
Vector<const char*>(m_program.save_count, nullptr));
|
||||||
|
|
||||||
m_subject = data;
|
m_subject = data;
|
||||||
|
@ -682,7 +696,10 @@ struct ThreadedRegexVM
|
||||||
if (res == StepResult::Matched)
|
if (res == StepResult::Matched)
|
||||||
{
|
{
|
||||||
m_captures = std::move(m_threads[i].saves);
|
m_captures = std::move(m_threads[i].saves);
|
||||||
return true;
|
found_match = true;
|
||||||
|
m_threads.resize(i); // remove this and lower priority threads
|
||||||
|
if (not longest)
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
else if (res == StepResult::Failed)
|
else if (res == StepResult::Failed)
|
||||||
m_threads[i].inst = nullptr;
|
m_threads[i].inst = nullptr;
|
||||||
|
@ -699,18 +716,21 @@ struct ThreadedRegexVM
|
||||||
if (step(i) == StepResult::Matched)
|
if (step(i) == StepResult::Matched)
|
||||||
{
|
{
|
||||||
m_captures = std::move(m_threads[i].saves);
|
m_captures = std::move(m_threads[i].saves);
|
||||||
return true;
|
found_match = true;
|
||||||
|
m_threads.resize(i); // remove this and lower priority threads
|
||||||
|
if (not longest)
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return found_match;
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_thread(CompiledRegex::Offset pos, Vector<const char*> saves)
|
void add_thread(int index, CompiledRegex::Offset pos, Vector<const char*> saves)
|
||||||
{
|
{
|
||||||
const char* inst = m_program.bytecode.data() + pos;
|
const char* inst = m_program.bytecode.data() + pos;
|
||||||
if (std::find_if(m_threads.begin(), m_threads.end(),
|
if (std::find_if(m_threads.begin(), m_threads.end(),
|
||||||
[inst](const Thread& t) { return t.inst == inst; }) == m_threads.end())
|
[inst](const Thread& t) { return t.inst == inst; }) == m_threads.end())
|
||||||
m_threads.push_back({inst, std::move(saves)});
|
m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)});
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_line_start() const
|
bool is_line_start() const
|
||||||
|
@ -732,9 +752,10 @@ struct ThreadedRegexVM
|
||||||
|
|
||||||
const CompiledRegex& m_program;
|
const CompiledRegex& m_program;
|
||||||
Vector<Thread> m_threads;
|
Vector<Thread> m_threads;
|
||||||
Vector<const char*> m_captures;
|
|
||||||
StringView m_subject;
|
StringView m_subject;
|
||||||
const char* m_pos;
|
const char* m_pos;
|
||||||
|
|
||||||
|
Vector<const char*> m_captures;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto test_regex = UnitTest{[]{
|
auto test_regex = UnitTest{[]{
|
||||||
|
@ -829,14 +850,16 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
StringView re = R"(f.*a)";
|
StringView re = R"(f.*a(.*o))";
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(vm.exec("blahfoobarfoobaz", false));
|
kak_assert(vm.exec("blahfoobarfoobaz", false, true));
|
||||||
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fooba"); // TODO: leftmost, longest
|
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "foobarfoo");
|
||||||
kak_assert(vm.exec("mais que fais la police", false));
|
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "rfoo");
|
||||||
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fa");
|
kak_assert(vm.exec("mais que fais la police", false, true));
|
||||||
|
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fais la po");
|
||||||
|
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == " po");
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue
Block a user