Regex: Add support for searching

Always compile a `.*` as the first instructions of a regex's bytecode.
Depending on whether it runs in match or search mode, the RegexVM will
either skip that prefix and start directly at the matching bytecode, or
execute it first.
This commit is contained in:
Maxime Coste 2017-09-19 18:26:24 +09:00
parent f7b8c1c79d
commit 52678fafa1

View File

@ -27,7 +27,7 @@ struct CompiledRegex
SubjectEnd, SubjectEnd,
}; };
using Offset = size_t; using Offset = unsigned;
Vector<char> bytecode; Vector<char> bytecode;
size_t save_count; size_t save_count;
@ -35,6 +35,7 @@ struct CompiledRegex
namespace RegexCompiler namespace RegexCompiler
{ {
struct Quantifier struct Quantifier
{ {
enum Type enum Type
@ -353,9 +354,23 @@ CompiledRegex::Offset compile_node(CompiledRegex& program, const AstNodePtr& nod
return pos; return pos;
} }
// Size in bytes of the search prefix emitted by write_search_prefix: three
// one-byte opcodes (Split, AnyChar, Split) plus the two Offset operands that
// follow the Split opcodes. It is also the bytecode position at which the VM
// starts its first thread when running in match mode, i.e. when the prefix
// must be skipped.
// NOTE(review): relies on alloc_offset reserving exactly sizeof(Offset) bytes
// per operand — confirm against its definition.
constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offset);
// Emit the implicit leading '.*' used for the search use case.
//
// Bytecode written, in order:
//   Split   <prefix_size>
//   AnyChar
//   Split   <1 + sizeof(Offset)>
//
// The program's bytecode must still be empty when this is called, since the
// operands written here are absolute positions from the start of the bytecode.
void write_search_prefix(CompiledRegex& program)
{
    kak_assert(program.bytecode.empty());

    // First Split: its operand is prefix_size, the position right after this
    // prefix (presumably where the compiled pattern itself begins).
    program.bytecode.push_back(CompiledRegex::Split);
    const auto skip_operand = alloc_offset(program);
    get_offset(program, skip_operand) = prefix_size;

    program.bytecode.push_back(CompiledRegex::AnyChar);

    // Second Split: its operand is one opcode byte plus one Offset operand
    // from the start, which should be the position of the AnyChar above,
    // assuming alloc_offset reserves sizeof(Offset) bytes — TODO confirm.
    program.bytecode.push_back(CompiledRegex::Split);
    const auto loop_operand = alloc_offset(program);
    get_offset(program, loop_operand) = 1 + sizeof(CompiledRegex::Offset);
}
CompiledRegex compile(const AstNodePtr& node, size_t capture_count) CompiledRegex compile(const AstNodePtr& node, size_t capture_count)
{ {
CompiledRegex res; CompiledRegex res;
write_search_prefix(res);
compile_node(res, node); compile_node(res, node);
res.bytecode.push_back(CompiledRegex::Match); res.bytecode.push_back(CompiledRegex::Match);
res.save_count = capture_count * 2; res.save_count = capture_count * 2;
@ -386,12 +401,12 @@ void dump(const CompiledRegex& program)
printf("any char\n"); printf("any char\n");
break; break;
case CompiledRegex::Jump: case CompiledRegex::Jump:
printf("jump %zd\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos)); printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
pos += sizeof(CompiledRegex::Offset); pos += sizeof(CompiledRegex::Offset);
break; break;
case CompiledRegex::Split: case CompiledRegex::Split:
{ {
printf("split %zd\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos)); printf("split %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
pos += sizeof(CompiledRegex::Offset); pos += sizeof(CompiledRegex::Offset);
break; break;
} }
@ -451,7 +466,7 @@ struct ThreadedRegexVM
case CompiledRegex::Jump: case CompiledRegex::Jump:
{ {
auto inst = m_program.bytecode.data() + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst); auto inst = m_program.bytecode.data() + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
// if instruction is already going to be executed, drop this thread // if instruction is already going to be executed by another thread, drop this thread
if (std::find_if(m_threads.begin(), m_threads.end(), if (std::find_if(m_threads.begin(), m_threads.end(),
[inst](const Thread& t) { return t.inst == inst; }) != m_threads.end()) [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
return StepResult::Failed; return StepResult::Failed;
@ -502,10 +517,11 @@ struct ThreadedRegexVM
return StepResult::Failed; return StepResult::Failed;
} }
bool match(StringView data) bool exec(StringView data, bool match = true)
{ {
m_threads.clear(); m_threads.clear();
add_thread(0, Vector<const char*>(m_program.save_count, nullptr)); add_thread(match ? RegexCompiler::prefix_size : 0,
Vector<const char*>(m_program.save_count, nullptr));
m_subject = data; m_subject = data;
m_pos = data.begin(); m_pos = data.begin();
@ -579,11 +595,11 @@ auto test_regex = UnitTest{[]{
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(vm.match("b")); kak_assert(vm.exec("b"));
kak_assert(vm.match("ab")); kak_assert(vm.exec("ab"));
kak_assert(vm.match("aaab")); kak_assert(vm.exec("aaab"));
kak_assert(not vm.match("acb")); kak_assert(not vm.exec("acb"));
kak_assert(not vm.match("")); kak_assert(not vm.exec(""));
} }
{ {
@ -591,10 +607,10 @@ auto test_regex = UnitTest{[]{
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(vm.match("afoob")); kak_assert(vm.exec("afoob"));
kak_assert(vm.match("ab")); kak_assert(vm.exec("ab"));
kak_assert(not vm.match("bab")); kak_assert(not vm.exec("bab"));
kak_assert(not vm.match("")); kak_assert(not vm.exec(""));
} }
{ {
@ -602,13 +618,13 @@ auto test_regex = UnitTest{[]{
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(vm.match("fooquxbarbaz")); kak_assert(vm.exec("fooquxbarbaz"));
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "qux"); kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "qux");
kak_assert(not vm.match("fooquxbarbaze")); kak_assert(not vm.exec("fooquxbarbaze"));
kak_assert(not vm.match("quxbar")); kak_assert(not vm.exec("quxbar"));
kak_assert(not vm.match("blahblah")); kak_assert(not vm.exec("blahblah"));
kak_assert(vm.match("bazbaz")); kak_assert(vm.exec("bazbaz"));
kak_assert(vm.match("quxbaz")); kak_assert(vm.exec("quxbaz"));
} }
{ {
@ -616,20 +632,20 @@ auto test_regex = UnitTest{[]{
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(vm.match("qux foo baz")); kak_assert(vm.exec("qux foo baz"));
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "foo"); kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "foo");
kak_assert(not vm.match("quxfoobaz")); kak_assert(not vm.exec("quxfoobaz"));
kak_assert(vm.match("bar")); kak_assert(vm.exec("bar"));
kak_assert(not vm.match("foobar")); kak_assert(not vm.exec("foobar"));
} }
{ {
StringView re = R"(\`(foo|bar)\')"; StringView re = R"(\`(foo|bar)\')";
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(vm.match("foo")); kak_assert(vm.exec("foo"));
kak_assert(vm.match("bar")); kak_assert(vm.exec("bar"));
kak_assert(not vm.match("foobar")); kak_assert(not vm.exec("foobar"));
} }
{ {
@ -637,10 +653,10 @@ auto test_regex = UnitTest{[]{
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(not vm.match("aab")); kak_assert(not vm.exec("aab"));
kak_assert(vm.match("aaab")); kak_assert(vm.exec("aaab"));
kak_assert(not vm.match("aaaaaab")); kak_assert(not vm.exec("aaaaaab"));
kak_assert(vm.match("aaaaab")); kak_assert(vm.exec("aaaaab"));
} }
{ {
@ -648,9 +664,9 @@ auto test_regex = UnitTest{[]{
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(not vm.match("aab")); kak_assert(not vm.exec("aab"));
kak_assert(vm.match("aaab")); kak_assert(vm.exec("aaab"));
kak_assert(vm.match("aaaaab")); kak_assert(vm.exec("aaaaab"));
} }
{ {
@ -658,10 +674,21 @@ auto test_regex = UnitTest{[]{
auto program = RegexCompiler::compile(re.begin(), re.end()); auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program); dump(program);
ThreadedRegexVM vm{program}; ThreadedRegexVM vm{program};
kak_assert(vm.match("b")); kak_assert(vm.exec("b"));
kak_assert(vm.match("ab")); kak_assert(vm.exec("ab"));
kak_assert(vm.match("aaab")); kak_assert(vm.exec("aaab"));
kak_assert(not vm.match("aaaab")); kak_assert(not vm.exec("aaaab"));
}
{
StringView re = R"(f.*a)";
auto program = RegexCompiler::compile(re.begin(), re.end());
dump(program);
ThreadedRegexVM vm{program};
kak_assert(vm.exec("blahfoobarfoobaz", false));
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fooba"); // TODO: leftmost, longest
kak_assert(vm.exec("mais que fais la police", false));
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "fa");
} }
}}; }};