From fb65fa60f87957b2b4449dd5659d148d00427048 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Mon, 5 Mar 2018 05:48:10 +1100 Subject: [PATCH] Regex: take the full subject range as a parameter To allow more general look arounds out of the actual search range, pass a second range (the actual subject). This allows us to remove various flags such as PrevAvailable or NotBeginOfSubject, which are now easy to check from the subject range. Fixes #1902 --- src/highlighters.cc | 7 ++--- src/normal.cc | 5 ++-- src/regex.hh | 52 +++++++++++++++++++-------------- src/regex_impl.cc | 2 +- src/regex_impl.hh | 71 +++++++++++++++++++++++---------------------- src/selectors.cc | 68 +++++++++++++++++++++++-------------------- 6 files changed, 109 insertions(+), 96 deletions(-) diff --git a/src/highlighters.cc b/src/highlighters.cc index 544ddc5f..41953aa4 100644 --- a/src/highlighters.cc +++ b/src/highlighters.cc @@ -363,13 +363,12 @@ private: kak_assert(matches.size() % m_faces.size() == 0); using RegexIt = RegexIterator; RegexIt re_it{get_iterator(buffer, range.begin), - get_iterator(buffer, range.end), m_regex, + get_iterator(buffer, range.end), + buffer.begin(), buffer.end(), m_regex, match_flags(is_bol(range.begin), is_eol(buffer, range.end), is_bow(buffer, range.begin), - is_eow(buffer, range.end), - range.begin == BufferCoord{0,0}, - buffer.is_end(range.end))}; + is_eow(buffer, range.end))}; RegexIt re_end; for (; re_it != re_end; ++re_it) { diff --git a/src/normal.cc b/src/normal.cc index 882c3986..0a36d059 100644 --- a/src/normal.cc +++ b/src/normal.cc @@ -1041,10 +1041,9 @@ void keep(Context& context, NormalParams params) // give more intuitive behaviours in keep use cases. const auto flags = match_flags(is_bol(begin.coord()), false, is_bow(buffer, begin.coord()), - is_eow(buffer, end.coord()), - true, true) | + is_eow(buffer, end.coord())) | RegexExecFlags::AnyMatch; - if (regex_search(begin, end, regex, flags) == matching) + if (regex_search(begin, end, begin, end, regex, flags) == matching) keep.push_back(sel); } if (keep.empty()) diff --git a/src/regex.hh b/src/regex.hh index 4d8fa95c..ed6fb1d7 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -100,21 +100,19 @@ private: Vector m_values; }; -inline RegexExecFlags match_flags(bool bol, bool eol, bool bow, bool eow, bool bos, bool eos) +inline RegexExecFlags match_flags(bool bol, bool eol, bool bow, bool eow) { return (bol ? RegexExecFlags::None : RegexExecFlags::NotBeginOfLine) | (eol ? RegexExecFlags::None : RegexExecFlags::NotEndOfLine) | (bow ? RegexExecFlags::None : RegexExecFlags::NotBeginOfWord) | - (eow ? RegexExecFlags::None : RegexExecFlags::NotEndOfWord) | - (bos ? RegexExecFlags::None : RegexExecFlags::NotBeginOfSubject) | - (eos ? RegexExecFlags::None : RegexExecFlags::NotEndOfSubject); + (eow ? RegexExecFlags::None : RegexExecFlags::NotEndOfWord); } template bool regex_match(It begin, It end, const Regex& re) { ThreadedRegexVM vm{*re.impl()}; - return vm.exec(begin, end, RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); + return vm.exec(begin, end, begin, end, RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); } template @@ -122,7 +120,7 @@ bool regex_match(It begin, It end, MatchResults& res, const Regex& re) { res.values().clear(); ThreadedRegexVM vm{*re.impl()}; - if (vm.exec(begin, end, RegexExecFlags::None)) + if (vm.exec(begin, end, begin, end, RegexExecFlags::None)) { std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(res.values())); return true; @@ -131,20 +129,22 @@ bool regex_match(It begin, It end, MatchResults& res, const Regex& re) } template -bool regex_search(It begin, It end, const Regex& re, +bool regex_search(It begin, It end, It subject_begin, It subject_end, const Regex& re, RegexExecFlags flags = RegexExecFlags::None) { ThreadedRegexVM vm{*re.impl()}; - return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); + return vm.exec(begin, end, subject_begin, subject_end, + flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves); } template -bool regex_search(It begin, It end, MatchResults& res, const Regex& re, +bool regex_search(It begin, It end, It subject_begin, It subject_end, + MatchResults& res, const Regex& re, RegexExecFlags flags = RegexExecFlags::None) { res.values().clear(); ThreadedRegexVM vm{*re.impl()}; - if (vm.exec(begin, end, flags | RegexExecFlags::Search)) + if (vm.exec(begin, end, subject_begin, subject_end, flags | RegexExecFlags::Search)) { std::move(vm.captures().begin(), vm.captures().end(), std::back_inserter(res.values())); return true; @@ -153,10 +153,11 @@ bool regex_search(It begin, It end, MatchResults& res, const Regex& re, } template -bool backward_regex_search(It begin, It end, MatchResults& res, const Regex& re, - RegexExecFlags flags = RegexExecFlags::None) +bool backward_regex_search(It begin, It end, It subject_begin, It subject_end, + MatchResults& res, const Regex& re, + RegexExecFlags flags = RegexExecFlags::None) { - return regex_search(std::move(begin), std::move(end), res, re, flags); + return regex_search(begin, end, subject_begin, subject_end, res, re, flags); } String option_to_string(const Regex& re); @@ -168,14 +169,22 @@ struct RegexIterator using ValueType = MatchResults; RegexIterator() = default; - RegexIterator(Iterator begin, Iterator end, const Regex& re, + RegexIterator(Iterator begin, Iterator end, + Iterator subject_begin, Iterator subject_end, + const Regex& re, RegexExecFlags flags = RegexExecFlags::None) : m_regex{&re}, m_next_pos{direction == MatchDirection::Forward ? begin : end}, - m_begin{begin}, m_end{end}, m_flags{flags} + m_begin{begin}, m_end{end}, + m_subject_begin{subject_begin}, m_subject_end{subject_end}, + m_flags{flags} { next(); } + RegexIterator(Iterator begin, Iterator end, const Regex& re, + RegexExecFlags flags = RegexExecFlags::None) + : RegexIterator{begin, end, begin, end, re, flags} {} + const ValueType& operator*() const { kak_assert(m_regex); return m_results; } const ValueType* operator->() const { kak_assert(m_regex); return &m_results; } @@ -216,19 +225,16 @@ private: if (direction == MatchDirection::Forward) { - if (m_begin != m_next_pos) - additional_flags |= RegexExecFlags::NotBeginOfSubject | RegexExecFlags::PrevAvailable; - - if (not regex_search(m_next_pos, m_end, m_results, *m_regex, - m_flags | additional_flags)) + if (not regex_search(m_next_pos, m_end, m_subject_begin, m_subject_end, + m_results, *m_regex, m_flags | additional_flags)) m_regex = nullptr; else m_next_pos = m_results[0].second; } else { - if (not backward_regex_search(m_begin, m_next_pos, m_results, *m_regex, - m_flags | additional_flags)) + if (not backward_regex_search(m_begin, m_next_pos, m_subject_begin, m_subject_end, + m_results, *m_regex, m_flags | additional_flags)) m_regex = nullptr; else m_next_pos = m_results[0].first; @@ -240,6 +246,8 @@ private: Iterator m_next_pos{}; const Iterator m_begin{}; const Iterator m_end{}; + const Iterator m_subject_begin{}; + const Iterator m_subject_end{}; const RegexExecFlags m_flags = RegexExecFlags::None; }; diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 8b215673..7870fb56 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -1141,7 +1141,7 @@ struct TestVM : CompiledRegex, ThreadedRegexVM bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch) { - return VMType::exec(re.begin(), re.end(), flags); + return VMType::exec(re.begin(), re.end(), re.begin(), re.end(), flags); } }; } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index ff9c8c0d..c969d719 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -132,12 +132,9 @@ enum class RegexExecFlags NotEndOfLine = 1 << 2, NotBeginOfWord = 1 << 3, NotEndOfWord = 1 << 4, - NotBeginOfSubject = 1 << 5, - NotEndOfSubject = 1 << 6, - NotInitialNull = 1 << 7, - AnyMatch = 1 << 8, - NoSaves = 1 << 9, - PrevAvailable = 1 << 10, + NotInitialNull = 1 << 5, + AnyMatch = 1 << 6, + NoSaves = 1 << 7, }; constexpr bool with_bit_ops(Meta::Type) { return true; } @@ -167,18 +164,21 @@ public: } } - bool exec(Iterator begin, Iterator end, RegexExecFlags flags) + bool exec(Iterator begin, Iterator end, + Iterator subject_begin, Iterator subject_end, + RegexExecFlags flags) { if (flags & RegexExecFlags::NotInitialNull and begin == end) return false; constexpr bool forward = direction == MatchDirection::Forward; - const bool prev_avail = flags & RegexExecFlags::PrevAvailable; - m_begin = Utf8It{utf8::iterator{forward ? begin : end, - prev_avail ? begin-1 : begin, end}}; - m_end = Utf8It{utf8::iterator{forward ? end : begin, - prev_avail ? begin-1 : begin, end}}; + m_begin = EffectiveIt{Utf8It{forward ? begin : end, subject_begin, subject_end}}; + m_end = EffectiveIt{Utf8It{forward ? end : begin, subject_begin, subject_end}}; + + m_subject_begin = EffectiveIt{Utf8It{forward ? subject_begin : subject_end, subject_begin, subject_end}}; + m_subject_end = EffectiveIt{Utf8It{forward ? subject_end : subject_begin, subject_begin, subject_end}}; + if (forward) m_flags = flags; else // Flip line begin/end flags as we flipped the instructions on compilation. @@ -187,7 +187,7 @@ public: ((flags & RegexExecFlags::NotBeginOfLine) ? RegexExecFlags::NotEndOfLine : RegexExecFlags::None); const bool search = (flags & RegexExecFlags::Search); - Utf8It start{m_begin}; + EffectiveIt start{m_begin}; const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc : m_program.backward_start_desc; if (start_desc) @@ -273,9 +273,9 @@ private: Saves* saves; }; - using Utf8It = std::conditional_t, - std::reverse_iterator>>; + using Utf8It = utf8::iterator; + using EffectiveIt = std::conditional_t>; struct ExecState { @@ -287,7 +287,7 @@ private: enum class StepResult { Consumed, Matched, Failed, FindNextStart }; // Steps a thread until it consumes the current character, matches or fail - StepResult step(Utf8It& pos, Thread& thread, ExecState& state) + StepResult step(EffectiveIt& pos, Thread& thread, ExecState& state) { const bool no_saves = (m_flags & RegexExecFlags::NoSaves); auto* instructions = m_program.instructions.data(); @@ -371,11 +371,11 @@ private: return StepResult::Failed; break; case CompiledRegex::SubjectBegin: - if (pos != m_begin or (m_flags & RegexExecFlags::NotBeginOfSubject)) + if (pos != m_subject_begin) return StepResult::Failed; break; case CompiledRegex::SubjectEnd: - if (pos != m_end or (m_flags & RegexExecFlags::NotEndOfSubject)) + if (pos != m_subject_end) return StepResult::Failed; break; case CompiledRegex::LookAhead: @@ -414,7 +414,7 @@ private: return StepResult::Failed; } - bool exec_program(Utf8It pos, ConstArrayView instructions) + bool exec_program(EffectiveIt pos, ConstArrayView instructions) { ExecState state; state.current_threads.push_back({instructions.begin(), nullptr}); @@ -495,7 +495,7 @@ private: } } - void to_next_start(Utf8It& start, const Utf8It& end, + void to_next_start(EffectiveIt& start, const EffectiveIt& end, const CompiledRegex::StartDesc& start_desc) { while (start != end and *start >= 0 and @@ -504,11 +504,12 @@ private: } template - bool lookaround(uint32_t index, Utf8It pos) const + bool lookaround(uint32_t index, EffectiveIt pos) const { + const auto end = (look_direction == MatchDirection::Forward ? m_subject_end : m_subject_begin); for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it) { - if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin)) + if (pos == end) return false; Codepoint cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)); if (ignore_case) @@ -535,36 +536,38 @@ private: return true; } - bool is_line_start(const Utf8It& pos) const + bool is_line_start(const EffectiveIt& pos) const { - if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin) + if (pos == m_subject_begin) return not (m_flags & RegexExecFlags::NotBeginOfLine); return *(pos-1) == '\n'; } - bool is_line_end(const Utf8It& pos) const + bool is_line_end(const EffectiveIt& pos) const { - if (pos == m_end) + if (pos == m_subject_end) return not (m_flags & RegexExecFlags::NotEndOfLine); return *pos == '\n'; } - bool is_word_boundary(const Utf8It& pos) const + bool is_word_boundary(const EffectiveIt& pos) const { - if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin) + if (pos == m_subject_begin) return not (m_flags & RegexExecFlags::NotBeginOfWord); - if (pos == m_end) + if (pos == m_subject_end) return not (m_flags & RegexExecFlags::NotEndOfWord); return is_word(*(pos-1)) != is_word(*pos); } - static const Iterator& get_base(const utf8::iterator& it) { return it.base(); } - static Iterator get_base(const std::reverse_iterator>& it) { return it.base().base(); } + static const Iterator& get_base(const Utf8It& it) { return it.base(); } + static Iterator get_base(const std::reverse_iterator& it) { return it.base().base(); } const CompiledRegex& m_program; - Utf8It m_begin; - Utf8It m_end; + EffectiveIt m_begin; + EffectiveIt m_end; + EffectiveIt m_subject_begin; + EffectiveIt m_subject_end; RegexExecFlags m_flags; Vector m_saves; diff --git a/src/selectors.cc b/src/selectors.cc index a53bbafa..378692cc 100644 --- a/src/selectors.cc +++ b/src/selectors.cc @@ -271,22 +271,24 @@ select_matching(const Context& context, const Selection& selection) return {}; } -template +template Optional> -find_opening(const Iterator& begin, Iterator pos, +find_opening(Iterator pos, const Container& container, const Regex& opening, const Regex& closing, int level, bool nestable) { MatchResults res; - if (backward_regex_search(begin, pos, res, closing) and + if (backward_regex_search(container.begin(), pos, + container.begin(), container.end(), res, closing) and res[0].second == pos) pos = res[0].first; - for (auto match : RegexIterator{begin, pos, opening}) + using RegexIt = RegexIterator; + for (auto match : RegexIt{container.begin(), pos, container.begin(), container.end(), opening}) { if (nestable) { - for (auto m : RegexIterator{match[0].second, pos, closing}) + for (auto m : RegexIt{match[0].second, pos, container.begin(), container.end(), closing}) ++level; } @@ -298,22 +300,23 @@ find_opening(const Iterator& begin, Iterator pos, return {}; } -template +template Optional> -find_closing(Iterator pos, const Iterator& end, +find_closing(Iterator pos, const Container& container, const Regex& opening, const Regex& closing, int level, bool nestable) { MatchResults res; - if (regex_search(pos, end, res, opening) and - res[0].first == pos) + if (regex_search(pos, container.end(), container.begin(), container.end(), + res, opening) and res[0].first == pos) pos = res[0].second; - for (auto match : RegexIterator{pos, end, closing}) + using RegexIt = RegexIterator; + for (auto match : RegexIt{pos, container.end(), container.begin(), container.end(), closing}) { if (nestable) { - for (auto m : RegexIterator{pos, match[0].first, opening}) + for (auto m : RegexIt{pos, match[0].first, container.begin(), container.end(), opening}) ++level; } @@ -335,7 +338,8 @@ find_surrounding(const Container& container, Iterator pos, // When onto the token of a non nestable block, consider it as an opening. MatchResults matches; - if (not nestable and regex_search(pos, container.end(), matches, opening) and + if (not nestable and regex_search(pos, container.end(), container.begin(), + container.end(), matches, opening) and matches[0].first == pos) pos = matches[0].second; @@ -344,10 +348,11 @@ find_surrounding(const Container& container, Iterator pos, { // When positionned onto opening and searching to opening, search the parent one if (nestable and first != container.begin() and not (flags & ObjectFlags::ToEnd) and - regex_search(first, container.end(), matches, opening) and matches[0].first == first) + regex_search(first, container.end(), container.begin(), container.end(), + matches, opening) and matches[0].first == first) first = utf8::previous(first, container.begin()); - if (auto res = find_opening(container.begin(), first+1, opening, closing, level, nestable)) + if (auto res = find_opening(first+1, container, opening, closing, level, nestable)) first = (flags & ObjectFlags::Inner) ? res->second : res->first; else return {}; @@ -359,10 +364,11 @@ find_surrounding(const Container& container, Iterator pos, // When positionned onto closing and searching to closing, search the parent one auto next = utf8::next(last, container.end()); if (nestable and next != container.end() and not (flags & ObjectFlags::ToBegin) and - backward_regex_search(container.begin(), next, matches, closing) and matches[0].second == next) + backward_regex_search(container.begin(), next, container.begin(), container.end(), + matches, closing) and matches[0].second == next) last = next; - if (auto res = find_closing(last, container.end(), opening, closing, level, nestable)) + if (auto res = find_closing(last, container, opening, closing, level, nestable)) last = (flags & ObjectFlags::Inner) ? utf8::previous(res->first, container.begin()) : utf8::previous(res->second, container.begin()); else @@ -835,12 +841,10 @@ void select_buffer(SelectionList& selections) } static RegexExecFlags -match_flags(const Buffer& buf, const BufferIterator& begin, const BufferIterator& end, - bool bos, bool eos) +match_flags(const Buffer& buf, const BufferIterator& begin, const BufferIterator& end) { return match_flags(is_bol(begin.coord()), is_eol(buf, end.coord()), - is_bow(buf, begin.coord()), is_eow(buf, end.coord()), - bos, eos); + is_bow(buf, begin.coord()), is_eow(buf, end.coord())); } static bool find_next(const Buffer& buffer, const BufferIterator& pos, @@ -848,12 +852,12 @@ static bool find_next(const Buffer& buffer, const BufferIterator& pos, const Regex& ex, bool& wrapped) { if (pos != buffer.end() and - regex_search(pos, buffer.end(), matches, ex, - match_flags(buffer, pos, buffer.end(), pos.coord() == BufferCoord{0,0}, true))) + regex_search(pos, buffer.end(), buffer.begin(), buffer.end(), + matches, ex, match_flags(buffer, pos, buffer.end()))) return true; wrapped = true; - return regex_search(buffer.begin(), buffer.end(), matches, ex, - match_flags(buffer, buffer.begin(), buffer.end(), true, true)); + return regex_search(buffer.begin(), buffer.end(), buffer.begin(), buffer.end(), + matches, ex, match_flags(buffer, buffer.begin(), buffer.end())); } static bool find_prev(const Buffer& buffer, const BufferIterator& pos, @@ -861,13 +865,15 @@ static bool find_prev(const Buffer& buffer, const BufferIterator& pos, const Regex& ex, bool& wrapped) { if (pos != buffer.begin() and - backward_regex_search(buffer.begin(), pos, matches, ex, - match_flags(buffer, buffer.begin(), pos, true, buffer.is_end(pos.coord())) | + backward_regex_search(buffer.begin(), pos, buffer.begin(), buffer.end(), + matches, ex, + match_flags(buffer, buffer.begin(), pos) | RegexExecFlags::NotInitialNull)) return true; wrapped = true; - return backward_regex_search(buffer.begin(), buffer.end(), matches, ex, - match_flags(buffer, buffer.begin(), buffer.end(), true, true) | + return backward_regex_search(buffer.begin(), buffer.end(), buffer.begin(), buffer.end(), + matches, ex, + match_flags(buffer, buffer.begin(), buffer.end()) | RegexExecFlags::NotInitialNull); } @@ -913,8 +919,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex, int captu { auto sel_beg = buffer.iterator_at(sel.min()); auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end()); - RegexIt re_it(sel_beg, sel_end, regex, - match_flags(buffer, sel_beg, sel_end, true, true)); + RegexIt re_it(sel_beg, sel_end, regex, match_flags(buffer, sel_beg, sel_end)); RegexIt re_end; for (; re_it != re_end; ++re_it) @@ -958,8 +963,7 @@ void split_selections(SelectionList& selections, const Regex& regex, int capture auto begin = buffer.iterator_at(sel.min()); auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end()); - RegexIt re_it(begin, sel_end, regex, - match_flags(buffer, begin, sel_end, true, true)); + RegexIt re_it(begin, sel_end, regex, match_flags(buffer, begin, sel_end)); RegexIt re_end; for (; re_it != re_end; ++re_it)