Regex: Add support for backward matching

Regex can be compiled for backward matching instead of forward matching
and the ThreadedRegexVM is able to iterate in reverse on the subject
string to find the last match instead of the first.
This commit is contained in:
Maxime Coste 2017-10-07 12:46:27 +08:00
parent 071b897e00
commit c47cdc06a7
2 changed files with 163 additions and 95 deletions

View File

@ -501,13 +501,14 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8]
struct RegexCompiler
{
RegexCompiler(const ParsedRegex& parsed_regex)
: m_parsed_regex{parsed_regex}
RegexCompiler(const ParsedRegex& parsed_regex, MatchDirection direction)
: m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward}
{
compile_node(m_parsed_regex.ast);
push_op(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
m_program.save_count = m_parsed_regex.capture_count * 2;
m_program.direction = direction;
m_program.start_chars = compute_start_chars();
}
@ -524,7 +525,7 @@ private:
if (capture != -1)
{
push_op(CompiledRegex::Save);
push_byte(capture * 2);
push_byte(capture * 2 + (m_forward ? 0 : 1));
}
Vector<Offset> goto_inner_end_offsets;
@ -543,9 +544,15 @@ private:
push_op(CompiledRegex::Matcher);
push_byte(node->value);
case ParsedRegex::Sequence:
for (auto& child : node->children)
compile_node(child);
{
if (m_forward)
for (auto& child : node->children)
compile_node(child);
else
for (auto& child : node->children | reverse())
compile_node(child);
break;
}
case ParsedRegex::Alternation:
{
auto& children = node->children;
@ -554,36 +561,42 @@ private:
push_op(CompiledRegex::Split_PrioritizeParent);
auto offset = alloc_offset();
compile_node(children[0]);
compile_node(children[m_forward ? 0 : 1]);
push_op(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(alloc_offset());
auto right_pos = compile_node(children[1]);
auto right_pos = compile_node(children[m_forward ? 1 : 0]);
set_offset(offset, right_pos);
break;
}
case ParsedRegex::LookAhead:
push_op(CompiledRegex::LookAhead);
push_string(node->children);
push_op(m_forward ? CompiledRegex::LookAhead
: CompiledRegex::LookBehind);
push_string(node->children, false);
break;
case ParsedRegex::NegativeLookAhead:
push_op(CompiledRegex::NegativeLookAhead);
push_string(node->children);
push_op(m_forward ? CompiledRegex::NegativeLookAhead
: CompiledRegex::NegativeLookBehind);
push_string(node->children, false);
break;
case ParsedRegex::LookBehind:
push_op(CompiledRegex::LookBehind);
push_op(m_forward ? CompiledRegex::LookBehind
: CompiledRegex::LookAhead);
push_string(node->children, true);
break;
case ParsedRegex::NegativeLookBehind:
push_op(CompiledRegex::NegativeLookBehind);
push_op(m_forward ? CompiledRegex::NegativeLookBehind
: CompiledRegex::NegativeLookAhead);
push_string(node->children, true);
break;
case ParsedRegex::LineStart:
push_op(CompiledRegex::LineStart);
push_op(m_forward ? CompiledRegex::LineStart
: CompiledRegex::LineEnd);
break;
case ParsedRegex::LineEnd:
push_op(CompiledRegex::LineEnd);
push_op(m_forward ? CompiledRegex::LineEnd
: CompiledRegex::LineStart);
break;
case ParsedRegex::WordBoundary:
push_op(CompiledRegex::WordBoundary);
@ -592,10 +605,12 @@ private:
push_op(CompiledRegex::NotWordBoundary);
break;
case ParsedRegex::SubjectBegin:
push_op(CompiledRegex::SubjectBegin);
push_op(m_forward ? CompiledRegex::SubjectBegin
: CompiledRegex::SubjectEnd);
break;
case ParsedRegex::SubjectEnd:
push_op(CompiledRegex::SubjectEnd);
push_op(m_forward ? CompiledRegex::SubjectEnd
: CompiledRegex::SubjectBegin);
break;
case ParsedRegex::ResetStart:
push_op(CompiledRegex::Save);
@ -609,7 +624,7 @@ private:
if (capture != -1)
{
push_op(CompiledRegex::Save);
push_byte(capture * 2 + 1);
push_byte(capture * 2 + (m_forward ? 1 : 0));
}
return start_pos;
@ -622,6 +637,8 @@ private:
auto& quantifier = node->quantifier;
// TODO reverse, invert the way we write optional quantifiers ?
if (quantifier.allows_none())
{
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
@ -720,14 +737,14 @@ private:
case ParsedRegex::Sequence:
{
bool consumed = false;
for (auto& child : node->children)
{
if (not compute_start_chars(child, accepted, rejected))
{
consumed = true;
break;
}
}
auto consumes = [&, this](auto& child) {
return not this->compute_start_chars(child, accepted, rejected);
};
if (m_forward)
consumed = contains_that(node->children, consumes);
else
consumed = contains_that(node->children | reverse(), consumes);
return not consumed or node->quantifier.allows_none();
}
case ParsedRegex::Alternation:
@ -750,11 +767,13 @@ private:
return true;
case ParsedRegex::LookAhead:
if (node->children.empty())
compute_start_chars(node->children.front(), accepted, rejected);
compute_start_chars(m_forward ? node->children.front() : node->children.back(),
accepted, rejected);
return true;
case ParsedRegex::NegativeLookAhead:
if (node->children.empty())
compute_start_chars(node->children.front(), rejected, accepted);
compute_start_chars(m_forward ? node->children.front() : node->children.back(),
rejected, accepted);
return true;
case ParsedRegex::LookBehind:
return true;
@ -780,6 +799,7 @@ private:
CompiledRegex m_program;
const ParsedRegex& m_parsed_regex;
const bool m_forward;
};
void dump_regex(const CompiledRegex& program)
@ -864,27 +884,34 @@ void dump_regex(const CompiledRegex& program)
}
}
CompiledRegex compile_regex(StringView re)
CompiledRegex compile_regex(StringView re, MatchDirection direction)
{
return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex();
return RegexCompiler{RegexParser::parse(re), direction}.get_compiled_regex();
}
namespace
{
template<MatchDirection dir = MatchDirection::Forward>
struct TestVM : CompiledRegex, ThreadedRegexVM<const char*, dir>
{
using VMType = ThreadedRegexVM<const char*, dir>;
TestVM(StringView re, bool dump = false)
: CompiledRegex{compile_regex(re, dir)},
VMType{(const CompiledRegex&)*this}
{ if (dump) dump_regex(*this); }
bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch)
{
return VMType::exec(re.begin(), re.end(), flags);
}
};
}
auto test_regex = UnitTest{[]{
struct TestVM : CompiledRegex, ThreadedRegexVM<const char*>
{
TestVM(StringView re, bool dump = false)
: CompiledRegex{compile_regex(re)},
ThreadedRegexVM{(const CompiledRegex&)*this}
{ if (dump) dump_regex(*this); }
bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch)
{
return ThreadedRegexVM::exec(re.begin(), re.end(), flags);
}
};
{
TestVM vm{R"(a*b)"};
TestVM<> vm{R"(a*b)"};
kak_assert(vm.exec("b"));
kak_assert(vm.exec("ab"));
kak_assert(vm.exec("aaab"));
@ -894,7 +921,7 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"(^a.*b$)"};
TestVM<> vm{R"(^a.*b$)"};
kak_assert(vm.exec("afoob"));
kak_assert(vm.exec("ab"));
kak_assert(not vm.exec("bab"));
@ -902,7 +929,7 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"(^(foo|qux|baz)+(bar)?baz$)"};
TestVM<> vm{R"(^(foo|qux|baz)+(bar)?baz$)"};
kak_assert(vm.exec("fooquxbarbaz"));
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "qux");
kak_assert(not vm.exec("fooquxbarbaze"));
@ -913,7 +940,7 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"(.*\b(foo|bar)\b.*)"};
TestVM<> vm{R"(.*\b(foo|bar)\b.*)"};
kak_assert(vm.exec("qux foo baz"));
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "foo");
kak_assert(not vm.exec("quxfoobaz"));
@ -922,14 +949,14 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"((foo|bar))"};
TestVM<> vm{R"((foo|bar))"};
kak_assert(vm.exec("foo"));
kak_assert(vm.exec("bar"));
kak_assert(not vm.exec("foobar"));
}
{
TestVM vm{R"(a{3,5}b)"};
TestVM<> vm{R"(a{3,5}b)"};
kak_assert(not vm.exec("aab"));
kak_assert(vm.exec("aaab"));
kak_assert(not vm.exec("aaaaaab"));
@ -937,21 +964,21 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"(a{3}b)"};
TestVM<> vm{R"(a{3}b)"};
kak_assert(not vm.exec("aab"));
kak_assert(vm.exec("aaab"));
kak_assert(not vm.exec("aaaab"));
}
{
TestVM vm{R"(a{3,}b)"};
TestVM<> vm{R"(a{3,}b)"};
kak_assert(not vm.exec("aab"));
kak_assert(vm.exec("aaab"));
kak_assert(vm.exec("aaaaab"));
}
{
TestVM vm{R"(a{,3}b)"};
TestVM<> vm{R"(a{,3}b)"};
kak_assert(vm.exec("b"));
kak_assert(vm.exec("ab"));
kak_assert(vm.exec("aaab"));
@ -959,7 +986,7 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"(f.*a(.*o))"};
TestVM<> vm{R"(f.*a(.*o))"};
kak_assert(vm.exec("blahfoobarfoobaz", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foobarfoo");
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "rfoo");
@ -969,7 +996,7 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"([àb-dX-Z-]{3,5})"};
TestVM<> vm{R"([àb-dX-Z-]{3,5})"};
kak_assert(vm.exec("cà-Y"));
kak_assert(not vm.exec("àeY"));
kak_assert(vm.exec("dcbàX"));
@ -977,115 +1004,128 @@ auto test_regex = UnitTest{[]{
}
{
TestVM vm{R"((a{3,5})a+)"};
TestVM<> vm{R"((a{3,5})a+)"};
kak_assert(vm.exec("aaaaaa", RegexExecFlags::None));
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaaaa");
}
{
TestVM vm{R"((a{3,5}?)a+)"};
TestVM<> vm{R"((a{3,5}?)a+)"};
kak_assert(vm.exec("aaaaaa", RegexExecFlags::None));
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaa");
}
{
TestVM vm{R"((a{3,5}?)a)"};
TestVM<> vm{R"((a{3,5}?)a)"};
kak_assert(vm.exec("aaaa"));
}
{
TestVM vm{R"(\d{3})"};
TestVM<> vm{R"(\d{3})"};
kak_assert(vm.exec("123"));
kak_assert(not vm.exec("1x3"));
}
{
TestVM vm{R"([-\d]+)"};
TestVM<> vm{R"([-\d]+)"};
kak_assert(vm.exec("123-456"));
kak_assert(not vm.exec("123_456"));
}
{
TestVM vm{R"([ \H]+)"};
TestVM<> vm{R"([ \H]+)"};
kak_assert(vm.exec("abc "));
kak_assert(not vm.exec("a \t"));
}
{
TestVM vm{R"(\Q{}[]*+?\Ea+)"};
TestVM<> vm{R"(\Q{}[]*+?\Ea+)"};
kak_assert(vm.exec("{}[]*+?aa"));
}
{
TestVM vm{R"(\Q...)"};
TestVM<> vm{R"(\Q...)"};
kak_assert(vm.exec("..."));
kak_assert(not vm.exec("bla"));
}
{
TestVM vm{R"(foo\Kbar)"};
TestVM<> vm{R"(foo\Kbar)"};
kak_assert(vm.exec("foobar", RegexExecFlags::None));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "bar");
kak_assert(not vm.exec("bar", RegexExecFlags::None));
}
{
TestVM vm{R"((fo+?).*)"};
TestVM<> vm{R"((fo+?).*)"};
kak_assert(vm.exec("foooo", RegexExecFlags::None));
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "fo");
}
{
TestVM vm{R"((?=foo).)"};
TestVM<> vm{R"((?=foo).)"};
kak_assert(vm.exec("barfoo", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "f");
}
{
TestVM vm{R"((?!foo)...)"};
TestVM<> vm{R"((?!foo)...)"};
kak_assert(not vm.exec("foo"));
kak_assert(vm.exec("qux"));
}
{
TestVM vm{R"(...(?<=foo))"};
TestVM<> vm{R"(...(?<=foo))"};
kak_assert(vm.exec("foo"));
kak_assert(not vm.exec("qux"));
}
{
TestVM vm{R"(...(?<!foo))"};
TestVM<> vm{R"(...(?<!foo))"};
kak_assert(not vm.exec("foo"));
kak_assert(vm.exec("qux"));
}
{
TestVM vm{R"(Foo(?i)f[oB]+)"};
TestVM<> vm{R"(Foo(?i)f[oB]+)"};
kak_assert(vm.exec("FooFOoBb"));
}
{
TestVM vm{R"([^\]]+)"};
TestVM<> vm{R"([^\]]+)"};
kak_assert(not vm.exec("a]c"));
kak_assert(vm.exec("abc"));
}
{
TestVM vm{R"((?:foo)+)"};
TestVM<> vm{R"((?:foo)+)"};
kak_assert(vm.exec("foofoofoo"));
kak_assert(not vm.exec("barbarbar"));
}
{
TestVM vm{R"((?<!\\)(?:\\\\)*")"};
TestVM<> vm{R"((?<!\\)(?:\\\\)*")"};
kak_assert(vm.exec("foo\"", RegexExecFlags::Search));
}
{
TestVM vm{R"($)"};
TestVM<> vm{R"($)"};
kak_assert(vm.exec("foo\n", RegexExecFlags::Search));
kak_assert(*vm.captures()[0] == '\n');
}
{
TestVM<MatchDirection::Backward> vm{R"(fo{1,})"};
kak_assert(vm.exec("foo1fooo2", RegexExecFlags::Search));
kak_assert(*vm.captures()[1] == '2');
}
{
TestVM<MatchDirection::Backward> vm{R"((?<=f)oo(b[ae]r)?(?=baz))"};
kak_assert(vm.exec("foobarbazfoobazfooberbaz", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "oober");
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "ber");
}
}};
}

View File

@ -1,6 +1,7 @@
#ifndef regex_impl_hh_INCLUDED
#define regex_impl_hh_INCLUDED
#include "exception.hh"
#include "flags.hh"
#include "ref_ptr.hh"
#include "unicode.hh"
@ -13,6 +14,12 @@
namespace Kakoune
{
enum class MatchDirection
{
Forward,
Backward
};
struct CompiledRegex : RefCountable
{
enum Op : char
@ -43,13 +50,14 @@ struct CompiledRegex : RefCountable
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
MatchDirection direction;
size_t save_count;
struct StartChars { bool map[256]; };
std::unique_ptr<StartChars> start_chars;
};
CompiledRegex compile_regex(StringView re);
CompiledRegex compile_regex(StringView re, MatchDirection direction = MatchDirection::Forward);
enum class RegexExecFlags
{
@ -67,12 +75,29 @@ enum class RegexExecFlags
constexpr bool with_bit_ops(Meta::Type<RegexExecFlags>) { return true; }
template<typename Iterator, MatchDirection direction>
struct ChooseUtf8It
{
using Type = utf8::iterator<Iterator>;
};
template<typename Iterator>
struct ChooseUtf8It<Iterator, MatchDirection::Backward>
{
using Type = std::reverse_iterator<utf8::iterator<Iterator>>;
};
template<typename Iterator, MatchDirection direction>
class ThreadedRegexVM
{
public:
ThreadedRegexVM(const CompiledRegex& program)
: m_program{program} { kak_assert(m_program); }
: m_program{program}
{
kak_assert(m_program);
if (direction != program.direction)
throw runtime_error{"Regex and VM direction mismatch"};
}
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete;
@ -89,8 +114,9 @@ public:
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
{
m_begin = begin;
m_end = end;
const bool forward = direction == MatchDirection::Forward;
m_begin = Utf8It{utf8::iterator<Iterator>{forward ? begin : end, begin, end}};
m_end = Utf8It{utf8::iterator<Iterator>{forward ? end : begin, begin, end}};
m_flags = flags;
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
@ -99,12 +125,12 @@ public:
Vector<Thread> current_threads, next_threads;
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
Utf8It start{m_begin, m_begin, m_end};
Utf8It start{m_begin};
const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
if (flags & RegexExecFlags::Search)
to_next_start(start, end, start_chars);
to_next_start(start, m_end, start_chars);
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads))
@ -115,12 +141,12 @@ public:
do
{
to_next_start(++start, end, start_chars);
to_next_start(++start, m_end, start_chars);
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads))
return true;
}
while (start != end);
while (start != m_end);
return false;
}
@ -177,7 +203,7 @@ private:
Saves* saves;
};
using Utf8It = utf8::iterator<Iterator>;
using Utf8It = typename ChooseUtf8It<Iterator, direction>::Type;
enum class StepResult { Consumed, Matched, Failed };
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads)
@ -233,7 +259,7 @@ private:
--thread.saves->refcount;
thread.saves = new_saves<true>(thread.saves->pos);
}
thread.saves->pos[index] = pos.base();
thread.saves->pos[index] = get_base(pos);
break;
}
case CompiledRegex::Matcher:
@ -350,12 +376,11 @@ private:
return true;
// Step remaining threads to see if they match without consuming anything else
const Utf8It end{m_end, m_begin, m_end};
while (not current_threads.empty())
{
auto thread = current_threads.back();
current_threads.pop_back();
if (step(end, thread, current_threads) == StepResult::Matched)
if (step(m_end, thread, current_threads) == StepResult::Matched)
{
release_saves(m_captures);
m_captures = thread.saves;
@ -365,7 +390,7 @@ private:
return false;
}
void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars)
void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars)
{
if (not start_chars)
return;
@ -401,10 +426,13 @@ private:
is_word(*(pos-1)) != is_word(*pos);
}
static const Iterator& get_base(const utf8::iterator<Iterator>& it) { return it.base(); }
static const Iterator& get_base(const std::reverse_iterator<utf8::iterator<Iterator>>& it) { return it.base().base(); }
const CompiledRegex& m_program;
Iterator m_begin;
Iterator m_end;
Utf8It m_begin;
Utf8It m_end;
RegexExecFlags m_flags;
Vector<Saves*> m_saves;
@ -413,19 +441,19 @@ private:
Saves* m_captures = nullptr;
};
template<typename It>
template<typename It, MatchDirection direction = MatchDirection::Forward>
bool regex_match(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None)
{
ThreadedRegexVM<It> vm{re};
ThreadedRegexVM<It, direction> vm{re};
return vm.exec(begin, end, (RegexExecFlags)(flags & ~(RegexExecFlags::Search)) |
RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
}
template<typename It>
template<typename It, MatchDirection direction = MatchDirection::Forward>
bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
RegexExecFlags flags = RegexExecFlags::None)
{
ThreadedRegexVM<It> vm{re};
ThreadedRegexVM<It, direction> vm{re};
if (vm.exec(begin, end, flags & ~(RegexExecFlags::Search)))
{
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
@ -434,19 +462,19 @@ bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re
return false;
}
template<typename It>
template<typename It, MatchDirection direction = MatchDirection::Forward>
bool regex_search(It begin, It end, const CompiledRegex& re,
RegexExecFlags flags = RegexExecFlags::None)
{
ThreadedRegexVM<It> vm{re};
ThreadedRegexVM<It, direction> vm{re};
return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
}
template<typename It>
template<typename It, MatchDirection direction = MatchDirection::Forward>
bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
RegexExecFlags flags = RegexExecFlags::None)
{
ThreadedRegexVM<It> vm{re};
ThreadedRegexVM<It, direction> vm{re};
if (vm.exec(begin, end, flags | RegexExecFlags::Search))
{
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));