Regex: Add support for backward matching
A regex can now be compiled for backward matching instead of forward matching, and the ThreadedRegexVM can iterate in reverse over the subject string to find the last match instead of the first.
This commit is contained in:
parent
071b897e00
commit
c47cdc06a7
|
@ -501,13 +501,14 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8]
|
||||||
|
|
||||||
struct RegexCompiler
|
struct RegexCompiler
|
||||||
{
|
{
|
||||||
RegexCompiler(const ParsedRegex& parsed_regex)
|
RegexCompiler(const ParsedRegex& parsed_regex, MatchDirection direction)
|
||||||
: m_parsed_regex{parsed_regex}
|
: m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward}
|
||||||
{
|
{
|
||||||
compile_node(m_parsed_regex.ast);
|
compile_node(m_parsed_regex.ast);
|
||||||
push_op(CompiledRegex::Match);
|
push_op(CompiledRegex::Match);
|
||||||
m_program.matchers = m_parsed_regex.matchers;
|
m_program.matchers = m_parsed_regex.matchers;
|
||||||
m_program.save_count = m_parsed_regex.capture_count * 2;
|
m_program.save_count = m_parsed_regex.capture_count * 2;
|
||||||
|
m_program.direction = direction;
|
||||||
m_program.start_chars = compute_start_chars();
|
m_program.start_chars = compute_start_chars();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -524,7 +525,7 @@ private:
|
||||||
if (capture != -1)
|
if (capture != -1)
|
||||||
{
|
{
|
||||||
push_op(CompiledRegex::Save);
|
push_op(CompiledRegex::Save);
|
||||||
push_byte(capture * 2);
|
push_byte(capture * 2 + (m_forward ? 0 : 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
Vector<Offset> goto_inner_end_offsets;
|
Vector<Offset> goto_inner_end_offsets;
|
||||||
|
@ -543,9 +544,15 @@ private:
|
||||||
push_op(CompiledRegex::Matcher);
|
push_op(CompiledRegex::Matcher);
|
||||||
push_byte(node->value);
|
push_byte(node->value);
|
||||||
case ParsedRegex::Sequence:
|
case ParsedRegex::Sequence:
|
||||||
|
{
|
||||||
|
if (m_forward)
|
||||||
for (auto& child : node->children)
|
for (auto& child : node->children)
|
||||||
compile_node(child);
|
compile_node(child);
|
||||||
|
else
|
||||||
|
for (auto& child : node->children | reverse())
|
||||||
|
compile_node(child);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case ParsedRegex::Alternation:
|
case ParsedRegex::Alternation:
|
||||||
{
|
{
|
||||||
auto& children = node->children;
|
auto& children = node->children;
|
||||||
|
@ -554,36 +561,42 @@ private:
|
||||||
push_op(CompiledRegex::Split_PrioritizeParent);
|
push_op(CompiledRegex::Split_PrioritizeParent);
|
||||||
auto offset = alloc_offset();
|
auto offset = alloc_offset();
|
||||||
|
|
||||||
compile_node(children[0]);
|
compile_node(children[m_forward ? 0 : 1]);
|
||||||
push_op(CompiledRegex::Jump);
|
push_op(CompiledRegex::Jump);
|
||||||
goto_inner_end_offsets.push_back(alloc_offset());
|
goto_inner_end_offsets.push_back(alloc_offset());
|
||||||
|
|
||||||
auto right_pos = compile_node(children[1]);
|
auto right_pos = compile_node(children[m_forward ? 1 : 0]);
|
||||||
set_offset(offset, right_pos);
|
set_offset(offset, right_pos);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ParsedRegex::LookAhead:
|
case ParsedRegex::LookAhead:
|
||||||
push_op(CompiledRegex::LookAhead);
|
push_op(m_forward ? CompiledRegex::LookAhead
|
||||||
push_string(node->children);
|
: CompiledRegex::LookBehind);
|
||||||
|
push_string(node->children, false);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookAhead:
|
case ParsedRegex::NegativeLookAhead:
|
||||||
push_op(CompiledRegex::NegativeLookAhead);
|
push_op(m_forward ? CompiledRegex::NegativeLookAhead
|
||||||
push_string(node->children);
|
: CompiledRegex::NegativeLookBehind);
|
||||||
|
push_string(node->children, false);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LookBehind:
|
case ParsedRegex::LookBehind:
|
||||||
push_op(CompiledRegex::LookBehind);
|
push_op(m_forward ? CompiledRegex::LookBehind
|
||||||
|
: CompiledRegex::LookAhead);
|
||||||
push_string(node->children, true);
|
push_string(node->children, true);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookBehind:
|
case ParsedRegex::NegativeLookBehind:
|
||||||
push_op(CompiledRegex::NegativeLookBehind);
|
push_op(m_forward ? CompiledRegex::NegativeLookBehind
|
||||||
|
: CompiledRegex::NegativeLookAhead);
|
||||||
push_string(node->children, true);
|
push_string(node->children, true);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineStart:
|
case ParsedRegex::LineStart:
|
||||||
push_op(CompiledRegex::LineStart);
|
push_op(m_forward ? CompiledRegex::LineStart
|
||||||
|
: CompiledRegex::LineEnd);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineEnd:
|
case ParsedRegex::LineEnd:
|
||||||
push_op(CompiledRegex::LineEnd);
|
push_op(m_forward ? CompiledRegex::LineEnd
|
||||||
|
: CompiledRegex::LineStart);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::WordBoundary:
|
case ParsedRegex::WordBoundary:
|
||||||
push_op(CompiledRegex::WordBoundary);
|
push_op(CompiledRegex::WordBoundary);
|
||||||
|
@ -592,10 +605,12 @@ private:
|
||||||
push_op(CompiledRegex::NotWordBoundary);
|
push_op(CompiledRegex::NotWordBoundary);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectBegin:
|
case ParsedRegex::SubjectBegin:
|
||||||
push_op(CompiledRegex::SubjectBegin);
|
push_op(m_forward ? CompiledRegex::SubjectBegin
|
||||||
|
: CompiledRegex::SubjectEnd);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectEnd:
|
case ParsedRegex::SubjectEnd:
|
||||||
push_op(CompiledRegex::SubjectEnd);
|
push_op(m_forward ? CompiledRegex::SubjectEnd
|
||||||
|
: CompiledRegex::SubjectBegin);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::ResetStart:
|
case ParsedRegex::ResetStart:
|
||||||
push_op(CompiledRegex::Save);
|
push_op(CompiledRegex::Save);
|
||||||
|
@ -609,7 +624,7 @@ private:
|
||||||
if (capture != -1)
|
if (capture != -1)
|
||||||
{
|
{
|
||||||
push_op(CompiledRegex::Save);
|
push_op(CompiledRegex::Save);
|
||||||
push_byte(capture * 2 + 1);
|
push_byte(capture * 2 + (m_forward ? 1 : 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
return start_pos;
|
return start_pos;
|
||||||
|
@ -622,6 +637,8 @@ private:
|
||||||
|
|
||||||
auto& quantifier = node->quantifier;
|
auto& quantifier = node->quantifier;
|
||||||
|
|
||||||
|
// TODO reverse, invert the way we write optional quantifiers ?
|
||||||
|
|
||||||
if (quantifier.allows_none())
|
if (quantifier.allows_none())
|
||||||
{
|
{
|
||||||
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
||||||
|
@ -720,14 +737,14 @@ private:
|
||||||
case ParsedRegex::Sequence:
|
case ParsedRegex::Sequence:
|
||||||
{
|
{
|
||||||
bool consumed = false;
|
bool consumed = false;
|
||||||
for (auto& child : node->children)
|
auto consumes = [&, this](auto& child) {
|
||||||
{
|
return not this->compute_start_chars(child, accepted, rejected);
|
||||||
if (not compute_start_chars(child, accepted, rejected))
|
};
|
||||||
{
|
if (m_forward)
|
||||||
consumed = true;
|
consumed = contains_that(node->children, consumes);
|
||||||
break;
|
else
|
||||||
}
|
consumed = contains_that(node->children | reverse(), consumes);
|
||||||
}
|
|
||||||
return not consumed or node->quantifier.allows_none();
|
return not consumed or node->quantifier.allows_none();
|
||||||
}
|
}
|
||||||
case ParsedRegex::Alternation:
|
case ParsedRegex::Alternation:
|
||||||
|
@ -750,11 +767,13 @@ private:
|
||||||
return true;
|
return true;
|
||||||
case ParsedRegex::LookAhead:
|
case ParsedRegex::LookAhead:
|
||||||
if (node->children.empty())
|
if (node->children.empty())
|
||||||
compute_start_chars(node->children.front(), accepted, rejected);
|
compute_start_chars(m_forward ? node->children.front() : node->children.back(),
|
||||||
|
accepted, rejected);
|
||||||
return true;
|
return true;
|
||||||
case ParsedRegex::NegativeLookAhead:
|
case ParsedRegex::NegativeLookAhead:
|
||||||
if (node->children.empty())
|
if (node->children.empty())
|
||||||
compute_start_chars(node->children.front(), rejected, accepted);
|
compute_start_chars(m_forward ? node->children.front() : node->children.back(),
|
||||||
|
rejected, accepted);
|
||||||
return true;
|
return true;
|
||||||
case ParsedRegex::LookBehind:
|
case ParsedRegex::LookBehind:
|
||||||
return true;
|
return true;
|
||||||
|
@ -780,6 +799,7 @@ private:
|
||||||
|
|
||||||
CompiledRegex m_program;
|
CompiledRegex m_program;
|
||||||
const ParsedRegex& m_parsed_regex;
|
const ParsedRegex& m_parsed_regex;
|
||||||
|
const bool m_forward;
|
||||||
};
|
};
|
||||||
|
|
||||||
void dump_regex(const CompiledRegex& program)
|
void dump_regex(const CompiledRegex& program)
|
||||||
|
@ -864,27 +884,34 @@ void dump_regex(const CompiledRegex& program)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
CompiledRegex compile_regex(StringView re)
|
CompiledRegex compile_regex(StringView re, MatchDirection direction)
|
||||||
{
|
{
|
||||||
return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex();
|
return RegexCompiler{RegexParser::parse(re), direction}.get_compiled_regex();
|
||||||
}
|
}
|
||||||
|
|
||||||
auto test_regex = UnitTest{[]{
|
namespace
|
||||||
struct TestVM : CompiledRegex, ThreadedRegexVM<const char*>
|
|
||||||
{
|
{
|
||||||
|
template<MatchDirection dir = MatchDirection::Forward>
|
||||||
|
struct TestVM : CompiledRegex, ThreadedRegexVM<const char*, dir>
|
||||||
|
{
|
||||||
|
using VMType = ThreadedRegexVM<const char*, dir>;
|
||||||
|
|
||||||
TestVM(StringView re, bool dump = false)
|
TestVM(StringView re, bool dump = false)
|
||||||
: CompiledRegex{compile_regex(re)},
|
: CompiledRegex{compile_regex(re, dir)},
|
||||||
ThreadedRegexVM{(const CompiledRegex&)*this}
|
VMType{(const CompiledRegex&)*this}
|
||||||
{ if (dump) dump_regex(*this); }
|
{ if (dump) dump_regex(*this); }
|
||||||
|
|
||||||
bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch)
|
bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch)
|
||||||
{
|
{
|
||||||
return ThreadedRegexVM::exec(re.begin(), re.end(), flags);
|
return VMType::exec(re.begin(), re.end(), flags);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
auto test_regex = UnitTest{[]{
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(a*b)"};
|
TestVM<> vm{R"(a*b)"};
|
||||||
kak_assert(vm.exec("b"));
|
kak_assert(vm.exec("b"));
|
||||||
kak_assert(vm.exec("ab"));
|
kak_assert(vm.exec("ab"));
|
||||||
kak_assert(vm.exec("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
|
@ -894,7 +921,7 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(^a.*b$)"};
|
TestVM<> vm{R"(^a.*b$)"};
|
||||||
kak_assert(vm.exec("afoob"));
|
kak_assert(vm.exec("afoob"));
|
||||||
kak_assert(vm.exec("ab"));
|
kak_assert(vm.exec("ab"));
|
||||||
kak_assert(not vm.exec("bab"));
|
kak_assert(not vm.exec("bab"));
|
||||||
|
@ -902,7 +929,7 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(^(foo|qux|baz)+(bar)?baz$)"};
|
TestVM<> vm{R"(^(foo|qux|baz)+(bar)?baz$)"};
|
||||||
kak_assert(vm.exec("fooquxbarbaz"));
|
kak_assert(vm.exec("fooquxbarbaz"));
|
||||||
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "qux");
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "qux");
|
||||||
kak_assert(not vm.exec("fooquxbarbaze"));
|
kak_assert(not vm.exec("fooquxbarbaze"));
|
||||||
|
@ -913,7 +940,7 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(.*\b(foo|bar)\b.*)"};
|
TestVM<> vm{R"(.*\b(foo|bar)\b.*)"};
|
||||||
kak_assert(vm.exec("qux foo baz"));
|
kak_assert(vm.exec("qux foo baz"));
|
||||||
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "foo");
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "foo");
|
||||||
kak_assert(not vm.exec("quxfoobaz"));
|
kak_assert(not vm.exec("quxfoobaz"));
|
||||||
|
@ -922,14 +949,14 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((foo|bar))"};
|
TestVM<> vm{R"((foo|bar))"};
|
||||||
kak_assert(vm.exec("foo"));
|
kak_assert(vm.exec("foo"));
|
||||||
kak_assert(vm.exec("bar"));
|
kak_assert(vm.exec("bar"));
|
||||||
kak_assert(not vm.exec("foobar"));
|
kak_assert(not vm.exec("foobar"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(a{3,5}b)"};
|
TestVM<> vm{R"(a{3,5}b)"};
|
||||||
kak_assert(not vm.exec("aab"));
|
kak_assert(not vm.exec("aab"));
|
||||||
kak_assert(vm.exec("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
kak_assert(not vm.exec("aaaaaab"));
|
kak_assert(not vm.exec("aaaaaab"));
|
||||||
|
@ -937,21 +964,21 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(a{3}b)"};
|
TestVM<> vm{R"(a{3}b)"};
|
||||||
kak_assert(not vm.exec("aab"));
|
kak_assert(not vm.exec("aab"));
|
||||||
kak_assert(vm.exec("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
kak_assert(not vm.exec("aaaab"));
|
kak_assert(not vm.exec("aaaab"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(a{3,}b)"};
|
TestVM<> vm{R"(a{3,}b)"};
|
||||||
kak_assert(not vm.exec("aab"));
|
kak_assert(not vm.exec("aab"));
|
||||||
kak_assert(vm.exec("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
kak_assert(vm.exec("aaaaab"));
|
kak_assert(vm.exec("aaaaab"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(a{,3}b)"};
|
TestVM<> vm{R"(a{,3}b)"};
|
||||||
kak_assert(vm.exec("b"));
|
kak_assert(vm.exec("b"));
|
||||||
kak_assert(vm.exec("ab"));
|
kak_assert(vm.exec("ab"));
|
||||||
kak_assert(vm.exec("aaab"));
|
kak_assert(vm.exec("aaab"));
|
||||||
|
@ -959,7 +986,7 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(f.*a(.*o))"};
|
TestVM<> vm{R"(f.*a(.*o))"};
|
||||||
kak_assert(vm.exec("blahfoobarfoobaz", RegexExecFlags::Search));
|
kak_assert(vm.exec("blahfoobarfoobaz", RegexExecFlags::Search));
|
||||||
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foobarfoo");
|
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foobarfoo");
|
||||||
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "rfoo");
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "rfoo");
|
||||||
|
@ -969,7 +996,7 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"([àb-dX-Z-]{3,5})"};
|
TestVM<> vm{R"([àb-dX-Z-]{3,5})"};
|
||||||
kak_assert(vm.exec("cà-Y"));
|
kak_assert(vm.exec("cà-Y"));
|
||||||
kak_assert(not vm.exec("àeY"));
|
kak_assert(not vm.exec("àeY"));
|
||||||
kak_assert(vm.exec("dcbàX"));
|
kak_assert(vm.exec("dcbàX"));
|
||||||
|
@ -977,115 +1004,128 @@ auto test_regex = UnitTest{[]{
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((a{3,5})a+)"};
|
TestVM<> vm{R"((a{3,5})a+)"};
|
||||||
kak_assert(vm.exec("aaaaaa", RegexExecFlags::None));
|
kak_assert(vm.exec("aaaaaa", RegexExecFlags::None));
|
||||||
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaaaa");
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaaaa");
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((a{3,5}?)a+)"};
|
TestVM<> vm{R"((a{3,5}?)a+)"};
|
||||||
kak_assert(vm.exec("aaaaaa", RegexExecFlags::None));
|
kak_assert(vm.exec("aaaaaa", RegexExecFlags::None));
|
||||||
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaa");
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaa");
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((a{3,5}?)a)"};
|
TestVM<> vm{R"((a{3,5}?)a)"};
|
||||||
kak_assert(vm.exec("aaaa"));
|
kak_assert(vm.exec("aaaa"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(\d{3})"};
|
TestVM<> vm{R"(\d{3})"};
|
||||||
kak_assert(vm.exec("123"));
|
kak_assert(vm.exec("123"));
|
||||||
kak_assert(not vm.exec("1x3"));
|
kak_assert(not vm.exec("1x3"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"([-\d]+)"};
|
TestVM<> vm{R"([-\d]+)"};
|
||||||
kak_assert(vm.exec("123-456"));
|
kak_assert(vm.exec("123-456"));
|
||||||
kak_assert(not vm.exec("123_456"));
|
kak_assert(not vm.exec("123_456"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"([ \H]+)"};
|
TestVM<> vm{R"([ \H]+)"};
|
||||||
kak_assert(vm.exec("abc "));
|
kak_assert(vm.exec("abc "));
|
||||||
kak_assert(not vm.exec("a \t"));
|
kak_assert(not vm.exec("a \t"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(\Q{}[]*+?\Ea+)"};
|
TestVM<> vm{R"(\Q{}[]*+?\Ea+)"};
|
||||||
kak_assert(vm.exec("{}[]*+?aa"));
|
kak_assert(vm.exec("{}[]*+?aa"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(\Q...)"};
|
TestVM<> vm{R"(\Q...)"};
|
||||||
kak_assert(vm.exec("..."));
|
kak_assert(vm.exec("..."));
|
||||||
kak_assert(not vm.exec("bla"));
|
kak_assert(not vm.exec("bla"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(foo\Kbar)"};
|
TestVM<> vm{R"(foo\Kbar)"};
|
||||||
kak_assert(vm.exec("foobar", RegexExecFlags::None));
|
kak_assert(vm.exec("foobar", RegexExecFlags::None));
|
||||||
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "bar");
|
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "bar");
|
||||||
kak_assert(not vm.exec("bar", RegexExecFlags::None));
|
kak_assert(not vm.exec("bar", RegexExecFlags::None));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((fo+?).*)"};
|
TestVM<> vm{R"((fo+?).*)"};
|
||||||
kak_assert(vm.exec("foooo", RegexExecFlags::None));
|
kak_assert(vm.exec("foooo", RegexExecFlags::None));
|
||||||
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "fo");
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "fo");
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((?=foo).)"};
|
TestVM<> vm{R"((?=foo).)"};
|
||||||
kak_assert(vm.exec("barfoo", RegexExecFlags::Search));
|
kak_assert(vm.exec("barfoo", RegexExecFlags::Search));
|
||||||
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "f");
|
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "f");
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((?!foo)...)"};
|
TestVM<> vm{R"((?!foo)...)"};
|
||||||
kak_assert(not vm.exec("foo"));
|
kak_assert(not vm.exec("foo"));
|
||||||
kak_assert(vm.exec("qux"));
|
kak_assert(vm.exec("qux"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(...(?<=foo))"};
|
TestVM<> vm{R"(...(?<=foo))"};
|
||||||
kak_assert(vm.exec("foo"));
|
kak_assert(vm.exec("foo"));
|
||||||
kak_assert(not vm.exec("qux"));
|
kak_assert(not vm.exec("qux"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(...(?<!foo))"};
|
TestVM<> vm{R"(...(?<!foo))"};
|
||||||
kak_assert(not vm.exec("foo"));
|
kak_assert(not vm.exec("foo"));
|
||||||
kak_assert(vm.exec("qux"));
|
kak_assert(vm.exec("qux"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(Foo(?i)f[oB]+)"};
|
TestVM<> vm{R"(Foo(?i)f[oB]+)"};
|
||||||
kak_assert(vm.exec("FooFOoBb"));
|
kak_assert(vm.exec("FooFOoBb"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"([^\]]+)"};
|
TestVM<> vm{R"([^\]]+)"};
|
||||||
kak_assert(not vm.exec("a]c"));
|
kak_assert(not vm.exec("a]c"));
|
||||||
kak_assert(vm.exec("abc"));
|
kak_assert(vm.exec("abc"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((?:foo)+)"};
|
TestVM<> vm{R"((?:foo)+)"};
|
||||||
kak_assert(vm.exec("foofoofoo"));
|
kak_assert(vm.exec("foofoofoo"));
|
||||||
kak_assert(not vm.exec("barbarbar"));
|
kak_assert(not vm.exec("barbarbar"));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"((?<!\\)(?:\\\\)*")"};
|
TestVM<> vm{R"((?<!\\)(?:\\\\)*")"};
|
||||||
kak_assert(vm.exec("foo\"", RegexExecFlags::Search));
|
kak_assert(vm.exec("foo\"", RegexExecFlags::Search));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"($)"};
|
TestVM<> vm{R"($)"};
|
||||||
kak_assert(vm.exec("foo\n", RegexExecFlags::Search));
|
kak_assert(vm.exec("foo\n", RegexExecFlags::Search));
|
||||||
kak_assert(*vm.captures()[0] == '\n');
|
kak_assert(*vm.captures()[0] == '\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<MatchDirection::Backward> vm{R"(fo{1,})"};
|
||||||
|
kak_assert(vm.exec("foo1fooo2", RegexExecFlags::Search));
|
||||||
|
kak_assert(*vm.captures()[1] == '2');
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<MatchDirection::Backward> vm{R"((?<=f)oo(b[ae]r)?(?=baz))"};
|
||||||
|
kak_assert(vm.exec("foobarbazfoobazfooberbaz", RegexExecFlags::Search));
|
||||||
|
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "oober");
|
||||||
|
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "ber");
|
||||||
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#ifndef regex_impl_hh_INCLUDED
|
#ifndef regex_impl_hh_INCLUDED
|
||||||
#define regex_impl_hh_INCLUDED
|
#define regex_impl_hh_INCLUDED
|
||||||
|
|
||||||
|
#include "exception.hh"
|
||||||
#include "flags.hh"
|
#include "flags.hh"
|
||||||
#include "ref_ptr.hh"
|
#include "ref_ptr.hh"
|
||||||
#include "unicode.hh"
|
#include "unicode.hh"
|
||||||
|
@ -13,6 +14,12 @@
|
||||||
namespace Kakoune
|
namespace Kakoune
|
||||||
{
|
{
|
||||||
|
|
||||||
|
enum class MatchDirection
|
||||||
|
{
|
||||||
|
Forward,
|
||||||
|
Backward
|
||||||
|
};
|
||||||
|
|
||||||
struct CompiledRegex : RefCountable
|
struct CompiledRegex : RefCountable
|
||||||
{
|
{
|
||||||
enum Op : char
|
enum Op : char
|
||||||
|
@ -43,13 +50,14 @@ struct CompiledRegex : RefCountable
|
||||||
|
|
||||||
Vector<char> bytecode;
|
Vector<char> bytecode;
|
||||||
Vector<std::function<bool (Codepoint)>> matchers;
|
Vector<std::function<bool (Codepoint)>> matchers;
|
||||||
|
MatchDirection direction;
|
||||||
size_t save_count;
|
size_t save_count;
|
||||||
|
|
||||||
struct StartChars { bool map[256]; };
|
struct StartChars { bool map[256]; };
|
||||||
std::unique_ptr<StartChars> start_chars;
|
std::unique_ptr<StartChars> start_chars;
|
||||||
};
|
};
|
||||||
|
|
||||||
CompiledRegex compile_regex(StringView re);
|
CompiledRegex compile_regex(StringView re, MatchDirection direction = MatchDirection::Forward);
|
||||||
|
|
||||||
enum class RegexExecFlags
|
enum class RegexExecFlags
|
||||||
{
|
{
|
||||||
|
@ -67,12 +75,29 @@ enum class RegexExecFlags
|
||||||
|
|
||||||
constexpr bool with_bit_ops(Meta::Type<RegexExecFlags>) { return true; }
|
constexpr bool with_bit_ops(Meta::Type<RegexExecFlags>) { return true; }
|
||||||
|
|
||||||
|
template<typename Iterator, MatchDirection direction>
|
||||||
|
struct ChooseUtf8It
|
||||||
|
{
|
||||||
|
using Type = utf8::iterator<Iterator>;
|
||||||
|
};
|
||||||
|
|
||||||
template<typename Iterator>
|
template<typename Iterator>
|
||||||
|
struct ChooseUtf8It<Iterator, MatchDirection::Backward>
|
||||||
|
{
|
||||||
|
using Type = std::reverse_iterator<utf8::iterator<Iterator>>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename Iterator, MatchDirection direction>
|
||||||
class ThreadedRegexVM
|
class ThreadedRegexVM
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
ThreadedRegexVM(const CompiledRegex& program)
|
ThreadedRegexVM(const CompiledRegex& program)
|
||||||
: m_program{program} { kak_assert(m_program); }
|
: m_program{program}
|
||||||
|
{
|
||||||
|
kak_assert(m_program);
|
||||||
|
if (direction != program.direction)
|
||||||
|
throw runtime_error{"Regex and VM direction mismatch"};
|
||||||
|
}
|
||||||
|
|
||||||
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
|
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
|
||||||
ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete;
|
ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete;
|
||||||
|
@ -89,8 +114,9 @@ public:
|
||||||
|
|
||||||
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
|
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
|
||||||
{
|
{
|
||||||
m_begin = begin;
|
const bool forward = direction == MatchDirection::Forward;
|
||||||
m_end = end;
|
m_begin = Utf8It{utf8::iterator<Iterator>{forward ? begin : end, begin, end}};
|
||||||
|
m_end = Utf8It{utf8::iterator<Iterator>{forward ? end : begin, begin, end}};
|
||||||
m_flags = flags;
|
m_flags = flags;
|
||||||
|
|
||||||
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
|
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
|
||||||
|
@ -99,12 +125,12 @@ public:
|
||||||
Vector<Thread> current_threads, next_threads;
|
Vector<Thread> current_threads, next_threads;
|
||||||
|
|
||||||
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
|
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
|
||||||
Utf8It start{m_begin, m_begin, m_end};
|
Utf8It start{m_begin};
|
||||||
|
|
||||||
const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
|
const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
|
||||||
|
|
||||||
if (flags & RegexExecFlags::Search)
|
if (flags & RegexExecFlags::Search)
|
||||||
to_next_start(start, end, start_chars);
|
to_next_start(start, m_end, start_chars);
|
||||||
|
|
||||||
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
||||||
current_threads, next_threads))
|
current_threads, next_threads))
|
||||||
|
@ -115,12 +141,12 @@ public:
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
to_next_start(++start, end, start_chars);
|
to_next_start(++start, m_end, start_chars);
|
||||||
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
||||||
current_threads, next_threads))
|
current_threads, next_threads))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
while (start != end);
|
while (start != m_end);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -177,7 +203,7 @@ private:
|
||||||
Saves* saves;
|
Saves* saves;
|
||||||
};
|
};
|
||||||
|
|
||||||
using Utf8It = utf8::iterator<Iterator>;
|
using Utf8It = typename ChooseUtf8It<Iterator, direction>::Type;
|
||||||
|
|
||||||
enum class StepResult { Consumed, Matched, Failed };
|
enum class StepResult { Consumed, Matched, Failed };
|
||||||
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads)
|
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads)
|
||||||
|
@ -233,7 +259,7 @@ private:
|
||||||
--thread.saves->refcount;
|
--thread.saves->refcount;
|
||||||
thread.saves = new_saves<true>(thread.saves->pos);
|
thread.saves = new_saves<true>(thread.saves->pos);
|
||||||
}
|
}
|
||||||
thread.saves->pos[index] = pos.base();
|
thread.saves->pos[index] = get_base(pos);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Matcher:
|
case CompiledRegex::Matcher:
|
||||||
|
@ -350,12 +376,11 @@ private:
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
// Step remaining threads to see if they match without consuming anything else
|
// Step remaining threads to see if they match without consuming anything else
|
||||||
const Utf8It end{m_end, m_begin, m_end};
|
|
||||||
while (not current_threads.empty())
|
while (not current_threads.empty())
|
||||||
{
|
{
|
||||||
auto thread = current_threads.back();
|
auto thread = current_threads.back();
|
||||||
current_threads.pop_back();
|
current_threads.pop_back();
|
||||||
if (step(end, thread, current_threads) == StepResult::Matched)
|
if (step(m_end, thread, current_threads) == StepResult::Matched)
|
||||||
{
|
{
|
||||||
release_saves(m_captures);
|
release_saves(m_captures);
|
||||||
m_captures = thread.saves;
|
m_captures = thread.saves;
|
||||||
|
@ -365,7 +390,7 @@ private:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars)
|
void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars)
|
||||||
{
|
{
|
||||||
if (not start_chars)
|
if (not start_chars)
|
||||||
return;
|
return;
|
||||||
|
@ -401,10 +426,13 @@ private:
|
||||||
is_word(*(pos-1)) != is_word(*pos);
|
is_word(*(pos-1)) != is_word(*pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const Iterator& get_base(const utf8::iterator<Iterator>& it) { return it.base(); }
|
||||||
|
static const Iterator& get_base(const std::reverse_iterator<utf8::iterator<Iterator>>& it) { return it.base().base(); }
|
||||||
|
|
||||||
const CompiledRegex& m_program;
|
const CompiledRegex& m_program;
|
||||||
|
|
||||||
Iterator m_begin;
|
Utf8It m_begin;
|
||||||
Iterator m_end;
|
Utf8It m_end;
|
||||||
RegexExecFlags m_flags;
|
RegexExecFlags m_flags;
|
||||||
|
|
||||||
Vector<Saves*> m_saves;
|
Vector<Saves*> m_saves;
|
||||||
|
@ -413,19 +441,19 @@ private:
|
||||||
Saves* m_captures = nullptr;
|
Saves* m_captures = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename It>
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
||||||
bool regex_match(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None)
|
bool regex_match(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It, direction> vm{re};
|
||||||
return vm.exec(begin, end, (RegexExecFlags)(flags & ~(RegexExecFlags::Search)) |
|
return vm.exec(begin, end, (RegexExecFlags)(flags & ~(RegexExecFlags::Search)) |
|
||||||
RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
|
RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename It>
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
||||||
bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
||||||
RegexExecFlags flags = RegexExecFlags::None)
|
RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It, direction> vm{re};
|
||||||
if (vm.exec(begin, end, flags & ~(RegexExecFlags::Search)))
|
if (vm.exec(begin, end, flags & ~(RegexExecFlags::Search)))
|
||||||
{
|
{
|
||||||
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
|
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
|
||||||
|
@ -434,19 +462,19 @@ bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename It>
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
||||||
bool regex_search(It begin, It end, const CompiledRegex& re,
|
bool regex_search(It begin, It end, const CompiledRegex& re,
|
||||||
RegexExecFlags flags = RegexExecFlags::None)
|
RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It, direction> vm{re};
|
||||||
return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
|
return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename It>
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
||||||
bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
||||||
RegexExecFlags flags = RegexExecFlags::None)
|
RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It, direction> vm{re};
|
||||||
if (vm.exec(begin, end, flags | RegexExecFlags::Search))
|
if (vm.exec(begin, end, flags | RegexExecFlags::Search))
|
||||||
{
|
{
|
||||||
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
|
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
|
||||||
|
|
Loading…
Reference in New Issue
Block a user