diff --git a/src/regex_impl.cc b/src/regex_impl.cc index c0a9371a..7e122c81 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -501,13 +501,14 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8] struct RegexCompiler { - RegexCompiler(const ParsedRegex& parsed_regex) - : m_parsed_regex{parsed_regex} + RegexCompiler(const ParsedRegex& parsed_regex, MatchDirection direction) + : m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward} { compile_node(m_parsed_regex.ast); push_op(CompiledRegex::Match); m_program.matchers = m_parsed_regex.matchers; m_program.save_count = m_parsed_regex.capture_count * 2; + m_program.direction = direction; m_program.start_chars = compute_start_chars(); } @@ -524,7 +525,7 @@ private: if (capture != -1) { push_op(CompiledRegex::Save); - push_byte(capture * 2); + push_byte(capture * 2 + (m_forward ? 0 : 1)); } Vector goto_inner_end_offsets; @@ -543,9 +544,15 @@ private: push_op(CompiledRegex::Matcher); push_byte(node->value); case ParsedRegex::Sequence: - for (auto& child : node->children) - compile_node(child); + { + if (m_forward) + for (auto& child : node->children) + compile_node(child); + else + for (auto& child : node->children | reverse()) + compile_node(child); break; + } case ParsedRegex::Alternation: { auto& children = node->children; @@ -554,36 +561,42 @@ private: push_op(CompiledRegex::Split_PrioritizeParent); auto offset = alloc_offset(); - compile_node(children[0]); + compile_node(children[m_forward ? 0 : 1]); push_op(CompiledRegex::Jump); goto_inner_end_offsets.push_back(alloc_offset()); - auto right_pos = compile_node(children[1]); + auto right_pos = compile_node(children[m_forward ? 1 : 0]); set_offset(offset, right_pos); break; } case ParsedRegex::LookAhead: - push_op(CompiledRegex::LookAhead); - push_string(node->children); + push_op(m_forward ? CompiledRegex::LookAhead + : CompiledRegex::LookBehind); + push_string(node->children, false); break; case ParsedRegex::NegativeLookAhead: - push_op(CompiledRegex::NegativeLookAhead); - push_string(node->children); + push_op(m_forward ? CompiledRegex::NegativeLookAhead + : CompiledRegex::NegativeLookBehind); + push_string(node->children, false); break; case ParsedRegex::LookBehind: - push_op(CompiledRegex::LookBehind); + push_op(m_forward ? CompiledRegex::LookBehind + : CompiledRegex::LookAhead); push_string(node->children, true); break; case ParsedRegex::NegativeLookBehind: - push_op(CompiledRegex::NegativeLookBehind); + push_op(m_forward ? CompiledRegex::NegativeLookBehind + : CompiledRegex::NegativeLookAhead); push_string(node->children, true); break; case ParsedRegex::LineStart: - push_op(CompiledRegex::LineStart); + push_op(m_forward ? CompiledRegex::LineStart + : CompiledRegex::LineEnd); break; case ParsedRegex::LineEnd: - push_op(CompiledRegex::LineEnd); + push_op(m_forward ? CompiledRegex::LineEnd + : CompiledRegex::LineStart); break; case ParsedRegex::WordBoundary: push_op(CompiledRegex::WordBoundary); @@ -592,10 +605,12 @@ private: push_op(CompiledRegex::NotWordBoundary); break; case ParsedRegex::SubjectBegin: - push_op(CompiledRegex::SubjectBegin); + push_op(m_forward ? CompiledRegex::SubjectBegin + : CompiledRegex::SubjectEnd); break; case ParsedRegex::SubjectEnd: - push_op(CompiledRegex::SubjectEnd); + push_op(m_forward ? CompiledRegex::SubjectEnd + : CompiledRegex::SubjectBegin); break; case ParsedRegex::ResetStart: push_op(CompiledRegex::Save); @@ -609,7 +624,7 @@ private: if (capture != -1) { push_op(CompiledRegex::Save); - push_byte(capture * 2 + 1); + push_byte(capture * 2 + (m_forward ? 1 : 0)); } return start_pos; @@ -622,6 +637,8 @@ private: auto& quantifier = node->quantifier; + // TODO reverse, invert the way we write optional quantifiers ? + if (quantifier.allows_none()) { push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent @@ -720,14 +737,14 @@ private: case ParsedRegex::Sequence: { bool consumed = false; - for (auto& child : node->children) - { - if (not compute_start_chars(child, accepted, rejected)) - { - consumed = true; - break; - } - } + auto consumes = [&, this](auto& child) { + return not this->compute_start_chars(child, accepted, rejected); + }; + if (m_forward) + consumed = contains_that(node->children, consumes); + else + consumed = contains_that(node->children | reverse(), consumes); + return not consumed or node->quantifier.allows_none(); } case ParsedRegex::Alternation: @@ -750,11 +767,13 @@ private: return true; case ParsedRegex::LookAhead: if (node->children.empty()) - compute_start_chars(node->children.front(), accepted, rejected); + compute_start_chars(m_forward ? node->children.front() : node->children.back(), + accepted, rejected); return true; case ParsedRegex::NegativeLookAhead: if (node->children.empty()) - compute_start_chars(node->children.front(), rejected, accepted); + compute_start_chars(m_forward ? node->children.front() : node->children.back(), + rejected, accepted); return true; case ParsedRegex::LookBehind: return true; @@ -780,6 +799,7 @@ private: CompiledRegex m_program; const ParsedRegex& m_parsed_regex; + const bool m_forward; }; void dump_regex(const CompiledRegex& program) @@ -864,27 +884,34 @@ void dump_regex(const CompiledRegex& program) } } -CompiledRegex compile_regex(StringView re) +CompiledRegex compile_regex(StringView re, MatchDirection direction) { - return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex(); + return RegexCompiler{RegexParser::parse(re), direction}.get_compiled_regex(); +} + +namespace +{ +template +struct TestVM : CompiledRegex, ThreadedRegexVM +{ + using VMType = ThreadedRegexVM; + + TestVM(StringView re, bool dump = false) + : CompiledRegex{compile_regex(re, dir)}, + VMType{(const CompiledRegex&)*this} + { if (dump) dump_regex(*this); } + + bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch) + { + return VMType::exec(re.begin(), re.end(), flags); + } +}; } auto test_regex = UnitTest{[]{ - struct TestVM : CompiledRegex, ThreadedRegexVM - { - TestVM(StringView re, bool dump = false) - : CompiledRegex{compile_regex(re)}, - ThreadedRegexVM{(const CompiledRegex&)*this} - { if (dump) dump_regex(*this); } - - bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch) - { - return ThreadedRegexVM::exec(re.begin(), re.end(), flags); - } - }; { - TestVM vm{R"(a*b)"}; + TestVM<> vm{R"(a*b)"}; kak_assert(vm.exec("b")); kak_assert(vm.exec("ab")); kak_assert(vm.exec("aaab")); @@ -894,7 +921,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(^a.*b$)"}; + TestVM<> vm{R"(^a.*b$)"}; kak_assert(vm.exec("afoob")); kak_assert(vm.exec("ab")); kak_assert(not vm.exec("bab")); @@ -902,7 +929,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(^(foo|qux|baz)+(bar)?baz$)"}; + TestVM<> vm{R"(^(foo|qux|baz)+(bar)?baz$)"}; kak_assert(vm.exec("fooquxbarbaz")); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "qux"); kak_assert(not vm.exec("fooquxbarbaze")); @@ -913,7 +940,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(.*\b(foo|bar)\b.*)"}; + TestVM<> vm{R"(.*\b(foo|bar)\b.*)"}; kak_assert(vm.exec("qux foo baz")); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "foo"); kak_assert(not vm.exec("quxfoobaz")); @@ -922,14 +949,14 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"((foo|bar))"}; + TestVM<> vm{R"((foo|bar))"}; kak_assert(vm.exec("foo")); kak_assert(vm.exec("bar")); kak_assert(not vm.exec("foobar")); } { - TestVM vm{R"(a{3,5}b)"}; + TestVM<> vm{R"(a{3,5}b)"}; kak_assert(not vm.exec("aab")); kak_assert(vm.exec("aaab")); kak_assert(not vm.exec("aaaaaab")); @@ -937,21 +964,21 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(a{3}b)"}; + TestVM<> vm{R"(a{3}b)"}; kak_assert(not vm.exec("aab")); kak_assert(vm.exec("aaab")); kak_assert(not vm.exec("aaaab")); } { - TestVM vm{R"(a{3,}b)"}; + TestVM<> vm{R"(a{3,}b)"}; kak_assert(not vm.exec("aab")); kak_assert(vm.exec("aaab")); kak_assert(vm.exec("aaaaab")); } { - TestVM vm{R"(a{,3}b)"}; + TestVM<> vm{R"(a{,3}b)"}; kak_assert(vm.exec("b")); kak_assert(vm.exec("ab")); kak_assert(vm.exec("aaab")); @@ -959,7 +986,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"(f.*a(.*o))"}; + TestVM<> vm{R"(f.*a(.*o))"}; kak_assert(vm.exec("blahfoobarfoobaz", RegexExecFlags::Search)); kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foobarfoo"); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "rfoo"); @@ -969,7 +996,7 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"([àb-dX-Z-]{3,5})"}; + TestVM<> vm{R"([àb-dX-Z-]{3,5})"}; kak_assert(vm.exec("cà-Y")); kak_assert(not vm.exec("àeY")); kak_assert(vm.exec("dcbàX")); @@ -977,115 +1004,128 @@ auto test_regex = UnitTest{[]{ } { - TestVM vm{R"((a{3,5})a+)"}; + TestVM<> vm{R"((a{3,5})a+)"}; kak_assert(vm.exec("aaaaaa", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaaaa"); } { - TestVM vm{R"((a{3,5}?)a+)"}; + TestVM<> vm{R"((a{3,5}?)a+)"}; kak_assert(vm.exec("aaaaaa", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "aaa"); } { - TestVM vm{R"((a{3,5}?)a)"}; + TestVM<> vm{R"((a{3,5}?)a)"}; kak_assert(vm.exec("aaaa")); } { - TestVM vm{R"(\d{3})"}; + TestVM<> vm{R"(\d{3})"}; kak_assert(vm.exec("123")); kak_assert(not vm.exec("1x3")); } { - TestVM vm{R"([-\d]+)"}; + TestVM<> vm{R"([-\d]+)"}; kak_assert(vm.exec("123-456")); kak_assert(not vm.exec("123_456")); } { - TestVM vm{R"([ \H]+)"}; + TestVM<> vm{R"([ \H]+)"}; kak_assert(vm.exec("abc ")); kak_assert(not vm.exec("a \t")); } { - TestVM vm{R"(\Q{}[]*+?\Ea+)"}; + TestVM<> vm{R"(\Q{}[]*+?\Ea+)"}; kak_assert(vm.exec("{}[]*+?aa")); } { - TestVM vm{R"(\Q...)"}; + TestVM<> vm{R"(\Q...)"}; kak_assert(vm.exec("...")); kak_assert(not vm.exec("bla")); } { - TestVM vm{R"(foo\Kbar)"}; + TestVM<> vm{R"(foo\Kbar)"}; kak_assert(vm.exec("foobar", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "bar"); kak_assert(not vm.exec("bar", RegexExecFlags::None)); } { - TestVM vm{R"((fo+?).*)"}; + TestVM<> vm{R"((fo+?).*)"}; kak_assert(vm.exec("foooo", RegexExecFlags::None)); kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "fo"); } { - TestVM vm{R"((?=foo).)"}; + TestVM<> vm{R"((?=foo).)"}; kak_assert(vm.exec("barfoo", RegexExecFlags::Search)); kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "f"); } { - TestVM vm{R"((?!foo)...)"}; + TestVM<> vm{R"((?!foo)...)"}; kak_assert(not vm.exec("foo")); kak_assert(vm.exec("qux")); } { - TestVM vm{R"(...(?<=foo))"}; + TestVM<> vm{R"(...(?<=foo))"}; kak_assert(vm.exec("foo")); kak_assert(not vm.exec("qux")); } { - TestVM vm{R"(...(? vm{R"(...(? vm{R"(Foo(?i)f[oB]+)"}; kak_assert(vm.exec("FooFOoBb")); } { - TestVM vm{R"([^\]]+)"}; + TestVM<> vm{R"([^\]]+)"}; kak_assert(not vm.exec("a]c")); kak_assert(vm.exec("abc")); } { - TestVM vm{R"((?:foo)+)"}; + TestVM<> vm{R"((?:foo)+)"}; kak_assert(vm.exec("foofoofoo")); kak_assert(not vm.exec("barbarbar")); } { - TestVM vm{R"((? vm{R"((? vm{R"($)"}; kak_assert(vm.exec("foo\n", RegexExecFlags::Search)); kak_assert(*vm.captures()[0] == '\n'); } + + { + TestVM vm{R"(fo{1,})"}; + kak_assert(vm.exec("foo1fooo2", RegexExecFlags::Search)); + kak_assert(*vm.captures()[1] == '2'); + } + + { + TestVM vm{R"((?<=f)oo(b[ae]r)?(?=baz))"}; + kak_assert(vm.exec("foobarbazfoobazfooberbaz", RegexExecFlags::Search)); + kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "oober"); + kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "ber"); + } }}; } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 16daba22..6b4d1369 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -1,6 +1,7 @@ #ifndef regex_impl_hh_INCLUDED #define regex_impl_hh_INCLUDED +#include "exception.hh" #include "flags.hh" #include "ref_ptr.hh" #include "unicode.hh" @@ -13,6 +14,12 @@ namespace Kakoune { +enum class MatchDirection +{ + Forward, + Backward +}; + struct CompiledRegex : RefCountable { enum Op : char @@ -43,13 +50,14 @@ struct CompiledRegex : RefCountable Vector bytecode; Vector> matchers; + MatchDirection direction; size_t save_count; struct StartChars { bool map[256]; }; std::unique_ptr start_chars; }; -CompiledRegex compile_regex(StringView re); +CompiledRegex compile_regex(StringView re, MatchDirection direction = MatchDirection::Forward); enum class RegexExecFlags { @@ -67,12 +75,29 @@ enum class RegexExecFlags constexpr bool with_bit_ops(Meta::Type) { return true; } +template +struct ChooseUtf8It +{ + using Type = utf8::iterator; +}; + template +struct ChooseUtf8It +{ + using Type = std::reverse_iterator>; +}; + +template class ThreadedRegexVM { public: ThreadedRegexVM(const CompiledRegex& program) - : m_program{program} { kak_assert(m_program); } + : m_program{program} + { + kak_assert(m_program); + if (direction != program.direction) + throw runtime_error{"Regex and VM direction mismatch"}; + } ThreadedRegexVM(const ThreadedRegexVM&) = delete; ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete; @@ -89,8 +114,9 @@ public: bool exec(Iterator begin, Iterator end, RegexExecFlags flags) { - m_begin = begin; - m_end = end; + const bool forward = direction == MatchDirection::Forward; + m_begin = Utf8It{utf8::iterator{forward ? begin : end, begin, end}}; + m_end = Utf8It{utf8::iterator{forward ? end : begin, begin, end}}; m_flags = flags; if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end) @@ -99,12 +125,12 @@ public: Vector current_threads, next_threads; const bool no_saves = (m_flags & RegexExecFlags::NoSaves); - Utf8It start{m_begin, m_begin, m_end}; + Utf8It start{m_begin}; const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr; if (flags & RegexExecFlags::Search) - to_next_start(start, end, start_chars); + to_next_start(start, m_end, start_chars); if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), current_threads, next_threads)) @@ -115,12 +141,12 @@ public: do { - to_next_start(++start, end, start_chars); + to_next_start(++start, m_end, start_chars); if (exec_from(start, no_saves ? nullptr : new_saves(nullptr), current_threads, next_threads)) return true; } - while (start != end); + while (start != m_end); return false; } @@ -177,7 +203,7 @@ private: Saves* saves; }; - using Utf8It = utf8::iterator; + using Utf8It = typename ChooseUtf8It::Type; enum class StepResult { Consumed, Matched, Failed }; StepResult step(const Utf8It& pos, Thread& thread, Vector& threads) @@ -233,7 +259,7 @@ private: --thread.saves->refcount; thread.saves = new_saves(thread.saves->pos); } - thread.saves->pos[index] = pos.base(); + thread.saves->pos[index] = get_base(pos); break; } case CompiledRegex::Matcher: @@ -350,12 +376,11 @@ private: return true; // Step remaining threads to see if they match without consuming anything else - const Utf8It end{m_end, m_begin, m_end}; while (not current_threads.empty()) { auto thread = current_threads.back(); current_threads.pop_back(); - if (step(end, thread, current_threads) == StepResult::Matched) + if (step(m_end, thread, current_threads) == StepResult::Matched) { release_saves(m_captures); m_captures = thread.saves; @@ -365,7 +390,7 @@ private: return false; } - void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars) + void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars) { if (not start_chars) return; @@ -401,10 +426,13 @@ private: is_word(*(pos-1)) != is_word(*pos); } + static const Iterator& get_base(const utf8::iterator& it) { return it.base(); } + static const Iterator& get_base(const std::reverse_iterator>& it) { return it.base().base(); } + const CompiledRegex& m_program; - Iterator m_begin; - Iterator m_end; + Utf8It m_begin; + Utf8It m_end; RegexExecFlags m_flags; Vector m_saves; @@ -413,19 +441,19 @@ private: Saves* m_captures = nullptr; }; -template +template bool regex_match(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; return vm.exec(begin, end, (RegexExecFlags)(flags & ~(RegexExecFlags::Search)) | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); } -template +template bool regex_match(It begin, It end, Vector& captures, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; if (vm.exec(begin, end, flags & ~(RegexExecFlags::Search))) { std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures)); @@ -434,19 +462,19 @@ bool regex_match(It begin, It end, Vector& captures, const CompiledRegex& re return false; } -template +template bool regex_search(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); } -template +template bool regex_search(It begin, It end, Vector& captures, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None) { - ThreadedRegexVM vm{re}; + ThreadedRegexVM vm{re}; if (vm.exec(begin, end, flags | RegexExecFlags::Search)) { std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));