From c47cdc06a7af0539c6ec0091f13790b6c1005688 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Sat, 7 Oct 2017 12:46:27 +0800 Subject: [PATCH] Regex: Add support for backward matching Regex can be compiled for backward matching instead of forward matching and the ThreadedRegexVM is able to iterate in reverse on the subject string to find the last match instead of the first. --- src/regex_impl.cc | 184 ++++++++++++++++++++++++++++------------------ src/regex_impl.hh | 74 +++++++++++++------ 2 files changed, 163 insertions(+), 95 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index c0a9371a..7e122c81 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -501,13 +501,14 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8] struct RegexCompiler { - RegexCompiler(const ParsedRegex& parsed_regex) - : m_parsed_regex{parsed_regex} + RegexCompiler(const ParsedRegex& parsed_regex, MatchDirection direction) + : m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward} { compile_node(m_parsed_regex.ast); push_op(CompiledRegex::Match); m_program.matchers = m_parsed_regex.matchers; m_program.save_count = m_parsed_regex.capture_count * 2; + m_program.direction = direction; m_program.start_chars = compute_start_chars(); } @@ -524,7 +525,7 @@ private: if (capture != -1) { push_op(CompiledRegex::Save); - push_byte(capture * 2); + push_byte(capture * 2 + (m_forward ? 0 : 1)); } Vector goto_inner_end_offsets; @@ -543,9 +544,15 @@ private: push_op(CompiledRegex::Matcher); push_byte(node->value); case ParsedRegex::Sequence: - for (auto& child : node->children) - compile_node(child); + { + if (m_forward) + for (auto& child : node->children) + compile_node(child); + else + for (auto& child : node->children | reverse()) + compile_node(child); break; + } case ParsedRegex::Alternation: { auto& children = node->children; @@ -554,36 +561,42 @@ private: push_op(CompiledRegex::Split_PrioritizeParent); auto offset = alloc_offset(); - compile_node(children[0]); + compile_node(children[m_forward ? 0 : 1]); push_op(CompiledRegex::Jump); goto_inner_end_offsets.push_back(alloc_offset()); - auto right_pos = compile_node(children[1]); + auto right_pos = compile_node(children[m_forward ? 1 : 0]); set_offset(offset, right_pos); break; } case ParsedRegex::LookAhead: - push_op(CompiledRegex::LookAhead); - push_string(node->children); + push_op(m_forward ? CompiledRegex::LookAhead + : CompiledRegex::LookBehind); + push_string(node->children, false); break; case ParsedRegex::NegativeLookAhead: - push_op(CompiledRegex::NegativeLookAhead); - push_string(node->children); + push_op(m_forward ? CompiledRegex::NegativeLookAhead + : CompiledRegex::NegativeLookBehind); + push_string(node->children, false); break; case ParsedRegex::LookBehind: - push_op(CompiledRegex::LookBehind); + push_op(m_forward ? CompiledRegex::LookBehind + : CompiledRegex::LookAhead); push_string(node->children, true); break; case ParsedRegex::NegativeLookBehind: - push_op(CompiledRegex::NegativeLookBehind); + push_op(m_forward ? CompiledRegex::NegativeLookBehind + : CompiledRegex::NegativeLookAhead); push_string(node->children, true); break; case ParsedRegex::LineStart: - push_op(CompiledRegex::LineStart); + push_op(m_forward ? CompiledRegex::LineStart + : CompiledRegex::LineEnd); break; case ParsedRegex::LineEnd: - push_op(CompiledRegex::LineEnd); + push_op(m_forward ? CompiledRegex::LineEnd + : CompiledRegex::LineStart); break; case ParsedRegex::WordBoundary: push_op(CompiledRegex::WordBoundary); @@ -592,10 +605,12 @@ private: push_op(CompiledRegex::NotWordBoundary); break; case ParsedRegex::SubjectBegin: - push_op(CompiledRegex::SubjectBegin); + push_op(m_forward ? CompiledRegex::SubjectBegin + : CompiledRegex::SubjectEnd); break; case ParsedRegex::SubjectEnd: - push_op(CompiledRegex::SubjectEnd); + push_op(m_forward ? CompiledRegex::SubjectEnd + : CompiledRegex::SubjectBegin); break; case ParsedRegex::ResetStart: push_op(CompiledRegex::Save); @@ -609,7 +624,7 @@ private: if (capture != -1) { push_op(CompiledRegex::Save); - push_byte(capture * 2 + 1); + push_byte(capture * 2 + (m_forward ? 1 : 0)); } return start_pos; @@ -622,6 +637,8 @@ private: auto& quantifier = node->quantifier; + // TODO reverse, invert the way we write optional quantifiers ? + if (quantifier.allows_none()) { push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent @@ -720,14 +737,14 @@ private: case ParsedRegex::Sequence: { bool consumed = false; - for (auto& child : node->children) - { - if (not compute_start_chars(child, accepted, rejected)) - { - consumed = true; - break; - } - } + auto consumes = [&, this](auto& child) { + return not this->compute_start_chars(child, accepted, rejected); + }; + if (m_forward) + consumed = contains_that(node->children, consumes); + else + consumed = contains_that(node->children | reverse(), consumes); + return not consumed or node->quantifier.allows_none(); } case ParsedRegex::Alternation: @@ -750,11 +767,13 @@ private: return true; case ParsedRegex::LookAhead: if (node->children.empty()) - compute_start_chars(node->children.front(), accepted, rejected); + compute_start_chars(m_forward ? node->children.front() : node->children.back(), + accepted, rejected); return true; case ParsedRegex::NegativeLookAhead: if (node->children.empty()) - compute_start_chars(node->children.front(), rejected, accepted); + compute_start_chars(m_forward ? node->children.front() : node->children.back(), + rejected, accepted); return true; case ParsedRegex::LookBehind: return true; @@ -780,6 +799,7 @@ private: CompiledRegex m_program; const ParsedRegex& m_parsed_regex; + const bool m_forward; }; void dump_regex(const CompiledRegex& program) @@ -864,27 +884,34 @@ void dump_regex(const CompiledRegex& program) } } -CompiledRegex compile_regex(StringView re) +CompiledRegex compile_regex(StringView re, MatchDirection direction) { - return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex(); + return RegexCompiler{RegexParser::parse(re), direction}.get_compiled_regex(); +} + +namespace +{ +template +struct TestVM : CompiledRegex, ThreadedRegexVM +{ + using VMType = ThreadedRegexVM; + + TestVM(StringView re, bool dump = false) + : CompiledRegex{compile_regex(re, dir)}, + VMType{(const CompiledRegex&)*this} + { if (dump) dump_regex(*this); } + + bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch) + { + return VMType::exec(re.begin(), re.end(), flags); + } +}; } auto test_regex = UnitTest{[]{ - struct TestVM : CompiledRegex, ThreadedRegexVM - { - TestVM(StringView re, bool dump = false) - : CompiledRegex{compile_regex(re)}, - ThreadedRegexVM{(const CompiledRegex&)*this} - { if (dump) dump_regex(*this); } - - bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch) - { - return ThreadedRegexVM::exec(re.begin(), re.end(), flags); - } - }; { - TestVM vm{R"(a*b)"}; + TestVM<> vm{R"(a*b)"}; kak_assert(vm.exec("b")); kak_assert(vm.exec("ab")); kak_assert(vm.exec("aaab")); @@ -894,7 +921,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(^a.*b$)"}; + TestVM<> vm{R"(^a.*b$)"}; kak_assert(vm.exec("afoob")); kak_assert(vm.exec("ab")); kak_assert(not vm.exec("bab")); @@ -902,7 +929,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(^(foo|qux|baz)+(bar)?baz$)"}; + TestVM<> vm{R"(^(foo|qux|baz)+(bar)?baz$)"}; kak_assert(vm.exec("fooquxbarbaz")); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "qux"); kak_assert(not vm.exec("fooquxbarbaze")); @@ -913,7 +940,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(.*\b(foo|bar)\b.*)"}; + TestVM<> vm{R"(.*\b(foo|bar)\b.*)"}; kak_assert(vm.exec("qux foo baz")); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "foo"); kak_assert(not vm.exec("quxfoobaz")); @@ -922,14 +949,14 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"((foo|bar))"}; + TestVM<> vm{R"((foo|bar))"}; kak_assert(vm.exec("foo")); kak_assert(vm.exec("bar")); kak_assert(not vm.exec("foobar")); } { - TestVM vm{R"(a{3,5}b)"}; + TestVM<> vm{R"(a{3,5}b)"}; kak_assert(not vm.exec("aab")); kak_assert(vm.exec("aaab")); kak_assert(not vm.exec("aaaaaab")); @@ -937,21 +964,21 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(a{3}b)"}; + TestVM<> vm{R"(a{3}b)"}; kak_assert(not vm.exec("aab")); kak_assert(vm.exec("aaab")); kak_assert(not vm.exec("aaaab")); } { - TestVM vm{R"(a{3,}b)"}; + TestVM<> vm{R"(a{3,}b)"}; kak_assert(not vm.exec("aab")); kak_assert(vm.exec("aaab")); kak_assert(vm.exec("aaaaab")); } { - TestVM vm{R"(a{,3}b)"}; + TestVM<> vm{R"(a{,3}b)"}; kak_assert(vm.exec("b")); kak_assert(vm.exec("ab")); kak_assert(vm.exec("aaab")); @@ -959,7 +986,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(f.*a(.*o))"}; + TestVM<> vm{R"(f.*a(.*o))"}; kak_assert(vm.exec("blahfoobarfoobaz", RegexExecFlags::Search)); kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foobarfoo"); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "rfoo"); @@ -969,7 +996,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"([àb-dX-Z-]{3,5})"}; + TestVM<> vm{R"([àb-dX-Z-]{3,5})"}; kak_assert(vm.exec("cà-Y")); kak_assert(not vm.exec("àeY")); kak_assert(vm.exec("dcbàX")); @@ -977,115 +1004,128 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"((a{3,5})a+)"}; + TestVM<> vm{R"((a{3,5})a+)"}; kak_assert(vm.exec("aaaaaa", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaaaa"); } { - TestVM vm{R"((a{3,5}?)a+)"}; + TestVM<> vm{R"((a{3,5}?)a+)"}; kak_assert(vm.exec("aaaaaa", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaa"); } { - TestVM vm{R"((a{3,5}?)a)"}; + TestVM<> vm{R"((a{3,5}?)a)"}; kak_assert(vm.exec("aaaa")); } { - TestVM vm{R"(\d{3})"}; + TestVM<> vm{R"(\d{3})"}; kak_assert(vm.exec("123")); kak_assert(not vm.exec("1x3")); } { - TestVM vm{R"([-\d]+)"}; + TestVM<> vm{R"([-\d]+)"}; kak_assert(vm.exec("123-456")); kak_assert(not vm.exec("123_456")); } { - TestVM vm{R"([ \H]+)"}; + TestVM<> vm{R"([ \H]+)"}; kak_assert(vm.exec("abc ")); kak_assert(not vm.exec("a \t")); } { - TestVM vm{R"(\Q{}[]*+?\Ea+)"}; + TestVM<> vm{R"(\Q{}[]*+?\Ea+)"}; kak_assert(vm.exec("{}[]*+?aa")); } { - TestVM vm{R"(\Q...)"}; + TestVM<> vm{R"(\Q...)"}; kak_assert(vm.exec("...")); kak_assert(not vm.exec("bla")); } { - TestVM vm{R"(foo\Kbar)"}; + TestVM<> vm{R"(foo\Kbar)"}; kak_assert(vm.exec("foobar", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "bar"); kak_assert(not vm.exec("bar", RegexExecFlags::None)); } { - TestVM vm{R"((fo+?).*)"}; + TestVM<> vm{R"((fo+?).*)"}; kak_assert(vm.exec("foooo", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "fo"); } { - TestVM vm{R"((?=foo).)"}; + TestVM<> vm{R"((?=foo).)"}; kak_assert(vm.exec("barfoo", RegexExecFlags::Search)); kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "f"); } { - TestVM vm{R"((?!foo)...)"}; + TestVM<> vm{R"((?!foo)...)"}; kak_assert(not vm.exec("foo")); kak_assert(vm.exec("qux")); } { - TestVM vm{R"(...(?<=foo))"}; + TestVM<> vm{R"(...(?<=foo))"}; kak_assert(vm.exec("foo")); kak_assert(not vm.exec("qux")); } { - TestVM vm{R"(...(? vm{R"(...(? vm{R"(Foo(?i)f[oB]+)"}; kak_assert(vm.exec("FooFOoBb")); } { - TestVM vm{R"([^\]]+)"}; + TestVM<> vm{R"([^\]]+)"}; kak_assert(not vm.exec("a]c")); kak_assert(vm.exec("abc")); } { - TestVM vm{R"((?:foo)+)"}; + TestVM<> vm{R"((?:foo)+)"}; kak_assert(vm.exec("foofoofoo")); kak_assert(not vm.exec("barbarbar")); } { - TestVM vm{R"((? vm{R"((? vm{R"($)"}; kak_assert(vm.exec("foo\n", RegexExecFlags::Search)); kak_assert(*vm.captures()[0] == '\n'); } + + { + TestVM vm{R"(fo{1,})"}; + kak_assert(vm.exec("foo1fooo2", RegexExecFlags::Search)); + kak_assert(*vm.captures()[1] == '2'); + } + + { + TestVM vm{R"((?<=f)oo(b[ae]r)?(?=baz))"}; + kak_assert(vm.exec("foobarbazfoobazfooberbaz", RegexExecFlags::Search)); + kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "oober"); + kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "ber"); + } }}; } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 16daba22..6b4d1369 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -1,6 +1,7 @@ #ifndef regex_impl_hh_INCLUDED #define regex_impl_hh_INCLUDED +#include "exception.hh" #include "flags.hh" #include "ref_ptr.hh" #include "unicode.hh" @@ -13,6 +14,12 @@ namespace Kakoune { +enum class MatchDirection +{ + Forward, + Backward +}; + struct CompiledRegex : RefCountable { enum Op : char @@ -43,13 +50,14 @@ struct CompiledRegex : RefCountable Vector bytecode; Vector> matchers; + MatchDirection direction; size_t save_count; struct StartChars { bool map[256]; }; std::unique_ptr start_chars; }; -CompiledRegex compile_regex(StringView re); +CompiledRegex compile_regex(StringView re, MatchDirection direction = MatchDirection::Forward); enum class RegexExecFlags { @@ -67,12 +75,29 @@ enum class RegexExecFlags constexpr bool with_bit_ops(Meta::Type) { return true; } +template +struct ChooseUtf8It +{ + using Type = utf8::iterator; +}; + template +struct ChooseUtf8It +{ + using Type = std::reverse_iterator>; +}; + +template class ThreadedRegexVM { public: ThreadedRegexVM(const CompiledRegex& program) - : m_program{program} { kak_assert(m_program); } + : m_program{program} + { + kak_assert(m_program); + if (direction != program.direction) + throw runtime_error{"Regex and VM direction mismatch"}; + } ThreadedRegexVM(const ThreadedRegexVM&) = delete; ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete; @@ -89,8 +114,9 @@ public: bool exec(Iterator begin, Iterator end, RegexExecFlags flags) { - m_begin = begin; - m_end = end; + const bool forward = direction == MatchDirection::Forward; + m_begin = Utf8It{utf8::iterator{forward ? begin : end, begin, end}}; + m_end = Utf8It{utf8::iterator{forward ? end : begin, begin, end}}; m_flags = flags; if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end) @@ -99,12 +125,12 @@ public: Vector current_threads, next_threads; const bool no_saves = (m_flags & RegexExecFlags::NoSaves); - Utf8It start{m_begin, m_begin, m_end}; + Utf8It start{m_begin}; const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr; if (flags & RegexExecFlags::Search) - to_next_start(start, end, start_chars); + to_next_start(start, m_end, start_chars); if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), current_threads, next_threads)) @@ -115,12 +141,12 @@ public: do { - to_next_start(++start, end, start_chars); + to_next_start(++start, m_end, start_chars); if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), current_threads, next_threads)) return true; } - while (start != end); + while (start != m_end); return false; } @@ -177,7 +203,7 @@ private: Saves* saves; }; - using Utf8It = utf8::iterator; + using Utf8It = typename ChooseUtf8It::Type; enum class StepResult { Consumed, Matched, Failed }; StepResult step(const Utf8It& pos, Thread& thread, Vector& threads) @@ -233,7 +259,7 @@ private: --thread.saves->refcount; thread.saves = new_saves(thread.saves->pos); } - thread.saves->pos[index] = pos.base(); + thread.saves->pos[index] = get_base(pos); break; } case CompiledRegex::Matcher: @@ -350,12 +376,11 @@ private: return true; // Step remaining threads to see if they match without consuming anything else - const Utf8It end{m_end, m_begin, m_end}; while (not current_threads.empty()) { auto thread = current_threads.back(); current_threads.pop_back(); - if (step(end, thread, current_threads) == StepResult::Matched) + if (step(m_end, thread, current_threads) == StepResult::Matched) { release_saves(m_captures); m_captures = thread.saves; @@ -365,7 +390,7 @@ private: return false; } - void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars) + void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars) { if (not start_chars) return; @@ -401,10 +426,13 @@ private: is_word(*(pos-1)) != is_word(*pos); } + static const Iterator& get_base(const utf8::iterator& it) { return it.base(); } + static const Iterator& get_base(const std::reverse_iterator>& it) { return it.base().base(); } + const CompiledRegex& m_program; - Iterator m_begin; - Iterator m_end; + Utf8It m_begin; + Utf8It m_end; RegexExecFlags m_flags; Vector m_saves; @@ -413,19 +441,19 @@ private: Saves* m_captures = nullptr; }; -template +template bool regex_match(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; return vm.exec(begin, end, (RegexExecFlags)(flags & ~(RegexExecFlags::Search)) | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); } -template +template bool regex_match(It begin, It end, Vector& captures, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; if (vm.exec(begin, end, flags & ~(RegexExecFlags::Search))) { std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures)); @@ -434,19 +462,19 @@ bool regex_match(It begin, It end, Vector& captures, const CompiledRegex& re return false; } -template +template bool regex_search(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); } -template +template bool regex_search(It begin, It end, Vector& captures, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; if (vm.exec(begin, end, flags | RegexExecFlags::Search)) { std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));