Regex: Add support for searching
Always compile a `.*` as the first instructions in a regex bytecode, depending on the match or search mode, the RegexVM will either execute this or skip it and start directly at the matching bytecode.
This commit is contained in:
parent
f7b8c1c79d
commit
52678fafa1
|
@ -27,7 +27,7 @@ struct CompiledRegex
|
|||
SubjectEnd,
|
||||
};
|
||||
|
||||
using Offset = size_t;
|
||||
using Offset = unsigned;
|
||||
|
||||
Vector<char> bytecode;
|
||||
size_t save_count;
|
||||
|
@ -35,6 +35,7 @@ struct CompiledRegex
|
|||
|
||||
namespace RegexCompiler
|
||||
{
|
||||
|
||||
struct Quantifier
|
||||
{
|
||||
enum Type
|
||||
|
@ -353,9 +354,23 @@ CompiledRegex::Offset compile_node(CompiledRegex& program, const AstNodePtr& nod
|
|||
return pos;
|
||||
}
|
||||
|
||||
constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offset);
|
||||
|
||||
// Add a '.*' as the first instructions for the search use case
|
||||
void write_search_prefix(CompiledRegex& program)
|
||||
{
|
||||
kak_assert(program.bytecode.empty());
|
||||
program.bytecode.push_back(CompiledRegex::Split);
|
||||
get_offset(program, alloc_offset(program)) = prefix_size;
|
||||
program.bytecode.push_back(CompiledRegex::AnyChar);
|
||||
program.bytecode.push_back(CompiledRegex::Split);
|
||||
get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset);
|
||||
}
|
||||
|
||||
CompiledRegex compile(const AstNodePtr& node, size_t capture_count)
|
||||
{
|
||||
CompiledRegex res;
|
||||
write_search_prefix(res);
|
||||
compile_node(res, node);
|
||||
res.bytecode.push_back(CompiledRegex::Match);
|
||||
res.save_count = capture_count * 2;
|
||||
|
@ -386,12 +401,12 @@ void dump(const CompiledRegex& program)
|
|||
printf("any char\n");
|
||||
break;
|
||||
case CompiledRegex::Jump:
|
||||
printf("jump %zd\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||
printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||
pos += sizeof(CompiledRegex::Offset);
|
||||
break;
|
||||
case CompiledRegex::Split:
|
||||
{
|
||||
printf("split %zd\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||
printf("split %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||
pos += sizeof(CompiledRegex::Offset);
|
||||
break;
|
||||
}
|
||||
|
@ -451,7 +466,7 @@ struct ThreadedRegexVM
|
|||
case CompiledRegex::Jump:
|
||||
{
|
||||
auto inst = m_program.bytecode.data() + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
|
||||
// if instruction is already going to be executed, drop this thread
|
||||
// if instruction is already going to be executed by another thread, drop this thread
|
||||
if (std::find_if(m_threads.begin(), m_threads.end(),
|
||||
[inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
|
||||
return StepResult::Failed;
|
||||
|
@ -502,10 +517,11 @@ struct ThreadedRegexVM
|
|||
return StepResult::Failed;
|
||||
}
|
||||
|
||||
bool match(StringView data)
|
||||
bool exec(StringView data, bool match = true)
|
||||
{
|
||||
m_threads.clear();
|
||||
add_thread(0, Vector<const char*>(m_program.save_count, nullptr));
|
||||
add_thread(match ? RegexCompiler::prefix_size : 0,
|
||||
Vector<const char*>(m_program.save_count, nullptr));
|
||||
|
||||
m_subject = data;
|
||||
m_pos = data.begin();
|
||||
|
@ -579,11 +595,11 @@ auto test_regex = UnitTest{[]{
|
|||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(vm.match("b"));
|
||||
kak_assert(vm.match("ab"));
|
||||
kak_assert(vm.match("aaab"));
|
||||
kak_assert(not vm.match("acb"));
|
||||
kak_assert(not vm.match(""));
|
||||
kak_assert(vm.exec("b"));
|
||||
kak_assert(vm.exec("ab"));
|
||||
kak_assert(vm.exec("aaab"));
|
||||
kak_assert(not vm.exec("acb"));
|
||||
kak_assert(not vm.exec(""));
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -591,10 +607,10 @@ auto test_regex = UnitTest{[]{
|
|||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(vm.match("afoob"));
|
||||
kak_assert(vm.match("ab"));
|
||||
kak_assert(not vm.match("bab"));
|
||||
kak_assert(not vm.match(""));
|
||||
kak_assert(vm.exec("afoob"));
|
||||
kak_assert(vm.exec("ab"));
|
||||
kak_assert(not vm.exec("bab"));
|
||||
kak_assert(not vm.exec(""));
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -602,13 +618,13 @@ auto test_regex = UnitTest{[]{
|
|||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(vm.match("fooquxbarbaz"));
|
||||
kak_assert(vm.exec("fooquxbarbaz"));
|
||||
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "qux");
|
||||
kak_assert(not vm.match("fooquxbarbaze"));
|
||||
kak_assert(not vm.match("quxbar"));
|
||||
kak_assert(not vm.match("blahblah"));
|
||||
kak_assert(vm.match("bazbaz"));
|
||||
kak_assert(vm.match("quxbaz"));
|
||||
kak_assert(not vm.exec("fooquxbarbaze"));
|
||||
kak_assert(not vm.exec("quxbar"));
|
||||
kak_assert(not vm.exec("blahblah"));
|
||||
kak_assert(vm.exec("bazbaz"));
|
||||
kak_assert(vm.exec("quxbaz"));
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -616,20 +632,20 @@ auto test_regex = UnitTest{[]{
|
|||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(vm.match("qux foo baz"));
|
||||
kak_assert(vm.exec("qux foo baz"));
|
||||
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "foo");
|
||||
kak_assert(not vm.match("quxfoobaz"));
|
||||
kak_assert(vm.match("bar"));
|
||||
kak_assert(not vm.match("foobar"));
|
||||
kak_assert(not vm.exec("quxfoobaz"));
|
||||
kak_assert(vm.exec("bar"));
|
||||
kak_assert(not vm.exec("foobar"));
|
||||
}
|
||||
{
|
||||
StringView re = R"(\`(foo|bar)\')";
|
||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(vm.match("foo"));
|
||||
kak_assert(vm.match("bar"));
|
||||
kak_assert(not vm.match("foobar"));
|
||||
kak_assert(vm.exec("foo"));
|
||||
kak_assert(vm.exec("bar"));
|
||||
kak_assert(not vm.exec("foobar"));
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -637,10 +653,10 @@ auto test_regex = UnitTest{[]{
|
|||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(not vm.match("aab"));
|
||||
kak_assert(vm.match("aaab"));
|
||||
kak_assert(not vm.match("aaaaaab"));
|
||||
kak_assert(vm.match("aaaaab"));
|
||||
kak_assert(not vm.exec("aab"));
|
||||
kak_assert(vm.exec("aaab"));
|
||||
kak_assert(not vm.exec("aaaaaab"));
|
||||
kak_assert(vm.exec("aaaaab"));
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -648,9 +664,9 @@ auto test_regex = UnitTest{[]{
|
|||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(not vm.match("aab"));
|
||||
kak_assert(vm.match("aaab"));
|
||||
kak_assert(vm.match("aaaaab"));
|
||||
kak_assert(not vm.exec("aab"));
|
||||
kak_assert(vm.exec("aaab"));
|
||||
kak_assert(vm.exec("aaaaab"));
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -658,10 +674,21 @@ auto test_regex = UnitTest{[]{
|
|||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(vm.match("b"));
|
||||
kak_assert(vm.match("ab"));
|
||||
kak_assert(vm.match("aaab"));
|
||||
kak_assert(not vm.match("aaaab"));
|
||||
kak_assert(vm.exec("b"));
|
||||
kak_assert(vm.exec("ab"));
|
||||
kak_assert(vm.exec("aaab"));
|
||||
kak_assert(not vm.exec("aaaab"));
|
||||
}
|
||||
|
||||
{
|
||||
StringView re = R"(f.*a)";
|
||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||
dump(program);
|
||||
ThreadedRegexVM vm{program};
|
||||
kak_assert(vm.exec("blahfoobarfoobaz", false));
|
||||
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fooba"); // TODO: leftmost, longest
|
||||
kak_assert(vm.exec("mais que fais la police", false));
|
||||
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fa");
|
||||
}
|
||||
}};
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user