Regex: Add support for searching
Always compile a `.*` as the first instructions of the regex bytecode. In search mode the RegexVM executes this prefix; in match mode it skips it and starts directly at the matching bytecode.
This commit is contained in:
parent
f7b8c1c79d
commit
52678fafa1
|
@ -27,7 +27,7 @@ struct CompiledRegex
|
||||||
SubjectEnd,
|
SubjectEnd,
|
||||||
};
|
};
|
||||||
|
|
||||||
using Offset = size_t;
|
using Offset = unsigned;
|
||||||
|
|
||||||
Vector<char> bytecode;
|
Vector<char> bytecode;
|
||||||
size_t save_count;
|
size_t save_count;
|
||||||
|
@ -35,6 +35,7 @@ struct CompiledRegex
|
||||||
|
|
||||||
namespace RegexCompiler
|
namespace RegexCompiler
|
||||||
{
|
{
|
||||||
|
|
||||||
struct Quantifier
|
struct Quantifier
|
||||||
{
|
{
|
||||||
enum Type
|
enum Type
|
||||||
|
@ -353,9 +354,23 @@ CompiledRegex::Offset compile_node(CompiledRegex& program, const AstNodePtr& nod
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offset);
|
||||||
|
|
||||||
|
// Add a '.*' as the first instructions for the search use case
|
||||||
|
void write_search_prefix(CompiledRegex& program)
|
||||||
|
{
|
||||||
|
kak_assert(program.bytecode.empty());
|
||||||
|
program.bytecode.push_back(CompiledRegex::Split);
|
||||||
|
get_offset(program, alloc_offset(program)) = prefix_size;
|
||||||
|
program.bytecode.push_back(CompiledRegex::AnyChar);
|
||||||
|
program.bytecode.push_back(CompiledRegex::Split);
|
||||||
|
get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset);
|
||||||
|
}
|
||||||
|
|
||||||
CompiledRegex compile(const AstNodePtr& node, size_t capture_count)
|
CompiledRegex compile(const AstNodePtr& node, size_t capture_count)
|
||||||
{
|
{
|
||||||
CompiledRegex res;
|
CompiledRegex res;
|
||||||
|
write_search_prefix(res);
|
||||||
compile_node(res, node);
|
compile_node(res, node);
|
||||||
res.bytecode.push_back(CompiledRegex::Match);
|
res.bytecode.push_back(CompiledRegex::Match);
|
||||||
res.save_count = capture_count * 2;
|
res.save_count = capture_count * 2;
|
||||||
|
@ -386,12 +401,12 @@ void dump(const CompiledRegex& program)
|
||||||
printf("any char\n");
|
printf("any char\n");
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Jump:
|
case CompiledRegex::Jump:
|
||||||
printf("jump %zd\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||||
pos += sizeof(CompiledRegex::Offset);
|
pos += sizeof(CompiledRegex::Offset);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Split:
|
case CompiledRegex::Split:
|
||||||
{
|
{
|
||||||
printf("split %zd\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
printf("split %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
||||||
pos += sizeof(CompiledRegex::Offset);
|
pos += sizeof(CompiledRegex::Offset);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -451,7 +466,7 @@ struct ThreadedRegexVM
|
||||||
case CompiledRegex::Jump:
|
case CompiledRegex::Jump:
|
||||||
{
|
{
|
||||||
auto inst = m_program.bytecode.data() + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
|
auto inst = m_program.bytecode.data() + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
|
||||||
// if instruction is already going to be executed, drop this thread
|
// if instruction is already going to be executed by another thread, drop this thread
|
||||||
if (std::find_if(m_threads.begin(), m_threads.end(),
|
if (std::find_if(m_threads.begin(), m_threads.end(),
|
||||||
[inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
|
[inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
|
@ -502,10 +517,11 @@ struct ThreadedRegexVM
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool match(StringView data)
|
bool exec(StringView data, bool match = true)
|
||||||
{
|
{
|
||||||
m_threads.clear();
|
m_threads.clear();
|
||||||
add_thread(0, Vector<const char*>(m_program.save_count, nullptr));
|
add_thread(match ? RegexCompiler::prefix_size : 0,
|
||||||
|
Vector<const char*>(m_program.save_count, nullptr));
|
||||||
|
|
||||||
m_subject = data;
|
m_subject = data;
|
||||||
m_pos = data.begin();
|
m_pos = data.begin();
|
||||||
|
@ -579,11 +595,11 @@ auto test_regex = UnitTest{[]{
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(vm.match("b"));
|
kak_assert(vm.exec("b"));
|
||||||
kak_assert(vm.match("ab"));
|
kak_assert(vm.exec("ab"));
|
||||||
kak_assert(vm.match("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
kak_assert(not vm.match("acb"));
|
kak_assert(not vm.exec("acb"));
|
||||||
kak_assert(not vm.match(""));
|
kak_assert(not vm.exec(""));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -591,10 +607,10 @@ auto test_regex = UnitTest{[]{
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(vm.match("afoob"));
|
kak_assert(vm.exec("afoob"));
|
||||||
kak_assert(vm.match("ab"));
|
kak_assert(vm.exec("ab"));
|
||||||
kak_assert(not vm.match("bab"));
|
kak_assert(not vm.exec("bab"));
|
||||||
kak_assert(not vm.match(""));
|
kak_assert(not vm.exec(""));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -602,13 +618,13 @@ auto test_regex = UnitTest{[]{
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(vm.match("fooquxbarbaz"));
|
kak_assert(vm.exec("fooquxbarbaz"));
|
||||||
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "qux");
|
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "qux");
|
||||||
kak_assert(not vm.match("fooquxbarbaze"));
|
kak_assert(not vm.exec("fooquxbarbaze"));
|
||||||
kak_assert(not vm.match("quxbar"));
|
kak_assert(not vm.exec("quxbar"));
|
||||||
kak_assert(not vm.match("blahblah"));
|
kak_assert(not vm.exec("blahblah"));
|
||||||
kak_assert(vm.match("bazbaz"));
|
kak_assert(vm.exec("bazbaz"));
|
||||||
kak_assert(vm.match("quxbaz"));
|
kak_assert(vm.exec("quxbaz"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -616,20 +632,20 @@ auto test_regex = UnitTest{[]{
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(vm.match("qux foo baz"));
|
kak_assert(vm.exec("qux foo baz"));
|
||||||
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "foo");
|
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "foo");
|
||||||
kak_assert(not vm.match("quxfoobaz"));
|
kak_assert(not vm.exec("quxfoobaz"));
|
||||||
kak_assert(vm.match("bar"));
|
kak_assert(vm.exec("bar"));
|
||||||
kak_assert(not vm.match("foobar"));
|
kak_assert(not vm.exec("foobar"));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
StringView re = R"(\`(foo|bar)\')";
|
StringView re = R"(\`(foo|bar)\')";
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(vm.match("foo"));
|
kak_assert(vm.exec("foo"));
|
||||||
kak_assert(vm.match("bar"));
|
kak_assert(vm.exec("bar"));
|
||||||
kak_assert(not vm.match("foobar"));
|
kak_assert(not vm.exec("foobar"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -637,10 +653,10 @@ auto test_regex = UnitTest{[]{
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(not vm.match("aab"));
|
kak_assert(not vm.exec("aab"));
|
||||||
kak_assert(vm.match("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
kak_assert(not vm.match("aaaaaab"));
|
kak_assert(not vm.exec("aaaaaab"));
|
||||||
kak_assert(vm.match("aaaaab"));
|
kak_assert(vm.exec("aaaaab"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -648,9 +664,9 @@ auto test_regex = UnitTest{[]{
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(not vm.match("aab"));
|
kak_assert(not vm.exec("aab"));
|
||||||
kak_assert(vm.match("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
kak_assert(vm.match("aaaaab"));
|
kak_assert(vm.exec("aaaaab"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -658,10 +674,21 @@ auto test_regex = UnitTest{[]{
|
||||||
auto program = RegexCompiler::compile(re.begin(), re.end());
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
dump(program);
|
dump(program);
|
||||||
ThreadedRegexVM vm{program};
|
ThreadedRegexVM vm{program};
|
||||||
kak_assert(vm.match("b"));
|
kak_assert(vm.exec("b"));
|
||||||
kak_assert(vm.match("ab"));
|
kak_assert(vm.exec("ab"));
|
||||||
kak_assert(vm.match("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
kak_assert(not vm.match("aaaab"));
|
kak_assert(not vm.exec("aaaab"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
StringView re = R"(f.*a)";
|
||||||
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
|
dump(program);
|
||||||
|
ThreadedRegexVM vm{program};
|
||||||
|
kak_assert(vm.exec("blahfoobarfoobaz", false));
|
||||||
|
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fooba"); // TODO: leftmost, longest
|
||||||
|
kak_assert(vm.exec("mais que fais la police", false));
|
||||||
|
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fa");
|
||||||
}
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user