From 4ac7df3842ada4401e72f48a5a5a9c0f70b7fe24 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Sat, 3 Nov 2018 13:52:40 +1100 Subject: [PATCH] Remove most regex impl special casing for backwards matching --- src/regex_impl.cc | 58 +++++++++++++++++++++++++++-------------------- src/regex_impl.hh | 53 +++++++++++++++++++++---------------------- 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 883d7b72..73c9d074 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -755,40 +755,30 @@ private: break; } case ParsedRegex::LookAhead: - push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase - : CompiledRegex::LookAhead) - : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase - : CompiledRegex::LookBehind), + push_inst(ignore_case ? CompiledRegex::LookAhead_IgnoreCase + : CompiledRegex::LookAhead, push_lookaround(index, ignore_case)); break; case ParsedRegex::NegativeLookAhead: - push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase - : CompiledRegex::NegativeLookAhead) - : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase - : CompiledRegex::NegativeLookBehind), + push_inst(ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase + : CompiledRegex::NegativeLookAhead, push_lookaround(index, ignore_case)); break; case ParsedRegex::LookBehind: - push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase - : CompiledRegex::LookBehind) - : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase - : CompiledRegex::LookAhead), + push_inst(ignore_case ? CompiledRegex::LookBehind_IgnoreCase + : CompiledRegex::LookBehind, push_lookaround(index, ignore_case)); break; case ParsedRegex::NegativeLookBehind: - push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase - : CompiledRegex::NegativeLookBehind) - : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase - : CompiledRegex::NegativeLookAhead), + push_inst(ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase + : CompiledRegex::NegativeLookBehind, push_lookaround(index, ignore_case)); break; case ParsedRegex::LineStart: - push_inst(forward ? CompiledRegex::LineStart - : CompiledRegex::LineEnd); + push_inst(CompiledRegex::LineStart); break; case ParsedRegex::LineEnd: - push_inst(forward ? CompiledRegex::LineEnd - : CompiledRegex::LineStart); + push_inst(CompiledRegex::LineEnd); break; case ParsedRegex::WordBoundary: push_inst(CompiledRegex::WordBoundary); @@ -797,12 +787,10 @@ private: push_inst(CompiledRegex::NotWordBoundary); break; case ParsedRegex::SubjectBegin: - push_inst(forward ? CompiledRegex::SubjectBegin - : CompiledRegex::SubjectEnd); + push_inst(CompiledRegex::SubjectBegin); break; case ParsedRegex::SubjectEnd: - push_inst(forward ? CompiledRegex::SubjectEnd - : CompiledRegex::SubjectBegin); + push_inst(CompiledRegex::SubjectEnd); break; case ParsedRegex::ResetStart: push_inst(CompiledRegex::Save, 0); @@ -1443,6 +1431,28 @@ auto test_regex = UnitTest{[]{ TestVM vm{R"($)"}; kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search | RegexExecFlags::NotEndOfLine)); kak_assert(StringView{vm.captures()[0]} == "\nqux"); + kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search)); + kak_assert(StringView{vm.captures()[0]} == ""); + } + + { + TestVM vm{R"(^)"}; + kak_assert(not vm.exec("foo", RegexExecFlags::Search | RegexExecFlags::NotBeginOfLine)); + kak_assert(vm.exec("foo", RegexExecFlags::Search)); + kak_assert(vm.exec("foo\nbar", RegexExecFlags::Search)); + kak_assert(StringView{vm.captures()[0]} == "bar"); + } + + { + TestVM vm{R"(\A\w+)"}; + kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search)); + kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foo"); + } + + { + TestVM vm{R"(\b\w+\z)"}; + kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search)); + kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "baz"); } { diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 4b80cd55..d7abaec9 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -192,14 +192,6 @@ public: if (flags & RegexExecFlags::NotInitialNull and begin == end) return false; - constexpr bool forward = direction == MatchDirection::Forward; - - - if (not forward) // Flip line begin/end flags as we flipped the instructions on compilation. - flags = (RegexExecFlags)(flags & ~(RegexExecFlags::NotEndOfLine | RegexExecFlags::NotBeginOfLine)) | - ((flags & RegexExecFlags::NotEndOfLine) ? RegexExecFlags::NotBeginOfLine : RegexExecFlags::None) | - ((flags & RegexExecFlags::NotBeginOfLine) ? RegexExecFlags::NotEndOfLine : RegexExecFlags::None); - const bool search = (flags & RegexExecFlags::Search); ConstArrayView instructions{m_program.instructions}; @@ -210,12 +202,13 @@ public: if (not search) instructions = instructions.subrange(CompiledRegex::search_prefix_size); + constexpr bool forward = direction == MatchDirection::Forward; const ExecConfig config{ Sentinel{forward ? begin : end}, Sentinel{forward ? end : begin}, - Sentinel{forward ? subject_begin : subject_end}, - Sentinel{forward ? subject_end : subject_begin}, + Sentinel{subject_begin}, + Sentinel{subject_end}, flags, instructions }; @@ -226,8 +219,7 @@ public: Sentinel{subject_end} }}; - if (const auto& start_desc = direction == MatchDirection::Forward ? - m_program.forward_start_desc : m_program.backward_start_desc) + if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc) { if (search) { @@ -525,7 +517,7 @@ private: } m_threads.swap_next(); - next(pos); + (direction == MatchDirection::Forward) ? ++pos : --pos; if (find_next_start and start_desc) to_next_start(pos, config.end, *start_desc); @@ -536,10 +528,10 @@ private: { while (start != end) { - const Codepoint cp = read(start); + const Codepoint cp = read_codepoint(start); if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other]) { - prev(start); + (direction == MatchDirection::Forward) ? --start : ++start; return; } } @@ -550,12 +542,19 @@ private: { using Lookaround = CompiledRegex::Lookaround; - const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin); + if (look_direction == MatchDirection::Backward) + { + if (pos == config.subject_begin) + return m_program.lookarounds[index] == Lookaround::EndOfLookaround; + --pos; + } + for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it) { - if (pos == end) + if (look_direction == MatchDirection::Forward and pos == config.subject_end) return false; - Codepoint cp = (look_direction == MatchDirection::Forward ? codepoint(pos) : prev_codepoint(pos)); + + Codepoint cp = *pos; if (ignore_case) cp = to_lower(cp); @@ -582,7 +581,10 @@ private: else if (static_cast(op) != cp) return false; - (look_direction == MatchDirection::Forward) ? next(pos) : prev(pos); + if (look_direction == MatchDirection::Backward and pos == config.subject_begin) + return *++it == Lookaround::EndOfLookaround; + + (look_direction == MatchDirection::Forward) ? ++pos : --pos; } return true; } @@ -591,14 +593,14 @@ private: { if (pos == config.subject_begin) return not (config.flags & RegexExecFlags::NotBeginOfLine); - return prev_codepoint(pos) == '\n'; + return *(pos-1) == '\n'; } static bool is_line_end(const Utf8It& pos, const ExecConfig& config) { if (pos == config.subject_end) return not (config.flags & RegexExecFlags::NotEndOfLine); - return codepoint(pos) == '\n'; + return *pos == '\n'; } static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config) @@ -607,10 +609,10 @@ private: return not (config.flags & RegexExecFlags::NotBeginOfWord); if (pos == config.subject_end) return not (config.flags & RegexExecFlags::NotEndOfWord); - return is_word(prev_codepoint(pos)) != is_word(codepoint(pos)); + return is_word(*(pos-1)) != is_word(*pos); } - static Codepoint read(Utf8It& it) + static Codepoint read_codepoint(Utf8It& it) { if (direction == MatchDirection::Forward) return it.read(); @@ -618,10 +620,7 @@ private: return *--it; } - static constexpr Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); } - static constexpr Utf8It& next(Utf8It& it) { return (direction == MatchDirection::Forward) ? ++it : --it; } - static constexpr Utf8It& prev(Utf8It& it) { return (direction == MatchDirection::Forward) ? --it : ++it; } - static constexpr Codepoint prev_codepoint(Utf8It it) { return codepoint(prev(it)); } + static Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); } const CompiledRegex& m_program;