From c8c8051bd0341e3c9df933ee7c942d227ae13f00 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Wed, 10 Aug 2022 20:58:31 +0100 Subject: [PATCH] Refactor RegionsHighlighter to share regexes Instead of storing regexes in each region, move them to the core highlighter in a hash map so that shared regexes between different regions are only applied once per update instead of once per region. Also change iteration logic to apply all regexes together to each changed line to improve memory locality on big buffers. For the big_markdown.md file described in #4685 this reduces initial display time from 3.55s to 2.41s on my machine. --- src/hash_map.hh | 17 +- src/highlighters.cc | 515 +++++++++++++++++++++++--------------------- src/regex_impl.hh | 7 + 3 files changed, 296 insertions(+), 243 deletions(-) diff --git a/src/hash_map.hh b/src/hash_map.hh index 0bf36f7d..5fad3235 100644 --- a/src/hash_map.hh +++ b/src/hash_map.hh @@ -246,6 +246,21 @@ struct HashMap return item_value(m_items.back()); } + template requires IsHashCompatible> + constexpr const EffectiveValue& get(KeyType&& key) const + { + return const_cast(*this).get(key); + } + + template requires IsHashCompatible> + constexpr EffectiveValue& get(KeyType&& key) + { + const auto hash = hash_value(key); + auto index = find_index(key, hash); + kak_assert(index >= 0); + return item_value(m_items[index]); + } + template requires IsHashCompatible constexpr void remove(const KeyType& key) { @@ -337,7 +352,7 @@ struct HashMap } private: - static EffectiveValue& item_value(Item& item) + static auto& item_value(auto& item) { if constexpr (has_value) { return item.value; } else { return item; } } diff --git a/src/highlighters.cc b/src/highlighters.cc index cfb24f6c..527e33fa 100644 --- a/src/highlighters.cc +++ b/src/highlighters.cc @@ -5,6 +5,7 @@ #include "changes.hh" #include "command_manager.hh" #include "context.hh" +#include "clock.hh" #include "display_buffer.hh" #include "face_registry.hh" #include 
"highlighter_group.hh" @@ -1838,121 +1839,11 @@ struct RegexMatch using RegexMatchList = Vector; -void insert_matches(const Buffer& buffer, RegexMatchList& matches, const Regex& regex, bool capture, LineRange range) -{ - size_t pivot = matches.size(); - capture = capture and regex.mark_count() > 0; - ThreadedRegexVM vm{*regex.impl()}; - for (auto line = range.begin; line < range.end; ++line) - { - const StringView l = buffer[line]; - const auto flags = RegexExecFlags::NotEndOfLine; // buffer line already ends with \n - for (auto&& m : RegexIterator{l.begin(), l.end(), vm, flags}) - { - const bool with_capture = capture and m[1].matched and - m[0].second - m[0].first < std::numeric_limits::max(); - matches.push_back({ - line, - (int)(m[0].first - l.begin()), - (int)(m[0].second - l.begin()), - (uint16_t)(with_capture ? m[1].first - m[0].first : 0), - (uint16_t)(with_capture ? m[1].second - m[1].first : 0) - }); - } - } - - auto pos = std::lower_bound(matches.begin(), matches.begin() + pivot, range.begin, - [](const RegexMatch& m, LineCount l) { return m.line < l; }); - kak_assert(pos == matches.begin() + pivot or pos->line >= range.end); // We should not have had matches for range - - // Move new matches into position. 
- std::rotate(pos, matches.begin() + pivot, matches.end()); -} - -void update_matches(const Buffer& buffer, ConstArrayView modifs, - RegexMatchList& matches) -{ - // remove out of date matches and update line for others - auto ins_pos = matches.begin(); - for (auto it = ins_pos; it != matches.end(); ++it) - { - auto modif_it = std::upper_bound(modifs.begin(), modifs.end(), it->line, - [](const LineCount& l, const LineModification& c) - { return l < c.old_line; }); - - if (modif_it != modifs.begin()) - { - auto& prev = *(modif_it-1); - if (it->line < prev.old_line + prev.num_removed) - continue; // match removed - - it->line += prev.diff(); - } - - kak_assert(buffer.is_valid(it->begin_coord()) or - buffer[it->line].length() == it->begin); - kak_assert(buffer.is_valid(it->end_coord()) or - buffer[it->line].length() == it->end); - - if (ins_pos != it) - *ins_pos = std::move(*it); - ++ins_pos; - } - matches.erase(ins_pos, matches.end()); -} - struct RegionMatches : UseMemoryDomain { RegexMatchList begin_matches; RegexMatchList end_matches; RegexMatchList recurse_matches; - - static bool compare_to_begin(const RegexMatch& lhs, BufferCoord rhs) - { - return lhs.begin_coord() < rhs; - } - - RegexMatchList::const_iterator find_next_begin(BufferCoord pos) const - { - return std::lower_bound(begin_matches.begin(), begin_matches.end(), - pos, compare_to_begin); - } - - RegexMatchList::const_iterator find_matching_end(const Buffer& buffer, BufferCoord beg_pos, Optional capture) const - { - auto end_it = end_matches.begin(); - auto rec_it = recurse_matches.begin(); - int recurse_level = 0; - while (true) - { - end_it = std::lower_bound(end_it, end_matches.end(), beg_pos, - compare_to_begin); - rec_it = std::lower_bound(rec_it, recurse_matches.end(), beg_pos, - compare_to_begin); - - if (end_it == end_matches.end()) - return end_it; - - while (rec_it != recurse_matches.end() and - rec_it->end_coord() <= end_it->end_coord()) - { - if (not capture or rec_it->capture(buffer) == 
*capture) - ++recurse_level; - ++rec_it; - } - - if (not capture or *capture == end_it->capture(buffer)) - { - if (recurse_level == 0) - return end_it; - --recurse_level; - } - - if (beg_pos != end_it->end_coord()) - beg_pos = end_it->end_coord(); - ++end_it; - } - } }; const HighlighterDesc default_region_desc = { @@ -2120,21 +2011,20 @@ public: if (parser[0].empty() or parser[1].empty()) throw runtime_error("begin and end must not be empty"); - const RegexCompileFlags flags = match_capture ? - RegexCompileFlags::Optimize : RegexCompileFlags::NoSubs | RegexCompileFlags::Optimize; - const auto& type = parser[2]; auto& registry = HighlighterRegistry::instance(); auto it = registry.find(type); if (it == registry.end()) throw runtime_error(format("no such highlighter type: '{}'", type)); - Regex recurse; + // validate regexes, TODO: less costly + Regex{parser[0]}; + Regex{parser[1]}; if (auto recurse_switch = parser.get_switch("recurse")) - recurse = Regex{*recurse_switch, flags}; + Regex{*recurse_switch}; auto delegate = it->value.factory(parser.positionals_from(3), nullptr); - return std::make_unique(std::move(delegate), Regex{parser[0], flags}, Regex{parser[1], flags}, recurse, match_capture); + return std::make_unique(std::move(delegate), parser[0], parser[1], parser.get_switch("recurse").value_or("").str(), match_capture); } static std::unique_ptr create_default_region(HighlighterParameters params, Highlighter* parent) @@ -2158,129 +2048,28 @@ private: }; using RegionList = Vector; + struct RegexKey + { + StringView regex; + bool match_captures; + + friend size_t hash_value(const RegexKey& key) { return hash_values(key.regex, key.match_captures); } + friend bool operator==(const RegexKey&, const RegexKey&) = default; + }; + struct Cache { size_t buffer_timestamp = 0; size_t regions_timestamp = 0; LineRangeSet ranges; - std::unique_ptr matches; + HashMap matches; HashMap regions; }; - using RegionAndMatch = std::pair; - - // find the begin closest to pos in all 
matches - RegionAndMatch find_next_begin(const Cache& cache, BufferCoord pos) const - { - RegionAndMatch res{0, cache.matches[0].find_next_begin(pos)}; - for (size_t i = 1; i < m_regions.size(); ++i) - { - const auto& matches = cache.matches[i]; - auto it = matches.find_next_begin(pos); - if (it != matches.begin_matches.end() and - (res.second == cache.matches[res.first].begin_matches.end() or - it->begin_coord() < res.second->begin_coord())) - res = RegionAndMatch{i, it}; - } - return res; - } - - bool update_matches(Cache& cache, const Buffer& buffer, LineRange range) - { - const size_t buffer_timestamp = buffer.timestamp(); - if (cache.buffer_timestamp == 0 or - cache.regions_timestamp != m_regions_timestamp) - { - cache.matches.reset(new RegionMatches[m_regions.size()]); - for (size_t i = 0; i < m_regions.size(); ++i) - { - cache.matches[i] = RegionMatches{}; - m_regions.item(i).value->add_matches(buffer, range, cache.matches[i]); - } - cache.ranges.reset(range); - cache.buffer_timestamp = buffer_timestamp; - cache.regions_timestamp = m_regions_timestamp; - return true; - } - else - { - bool modified = false; - if (cache.buffer_timestamp != buffer_timestamp) - { - auto modifs = compute_line_modifications(buffer, cache.buffer_timestamp); - for (size_t i = 0; i < m_regions.size(); ++i) - { - Kakoune::update_matches(buffer, modifs, cache.matches[i].begin_matches); - Kakoune::update_matches(buffer, modifs, cache.matches[i].end_matches); - Kakoune::update_matches(buffer, modifs, cache.matches[i].recurse_matches); - } - - cache.ranges.update(modifs); - cache.buffer_timestamp = buffer_timestamp; - modified = true; - } - - cache.ranges.add_range(range, [&, this](const LineRange& range) { - if (range.begin == range.end) - return; - for (size_t i = 0; i < m_regions.size(); ++i) - m_regions.item(i).value->add_matches(buffer, range, cache.matches[i]); - modified = true; - }); - return modified; - } - } - - const RegionList& get_regions_for_range(const Buffer& buffer, 
BufferRange range) - { - Cache& cache = m_cache.get(buffer); - if (update_matches(cache, buffer, {range.begin.line, std::min(buffer.line_count(), range.end.line + 1)})) - cache.regions.clear(); - - auto it = cache.regions.find(range); - if (it != cache.regions.end()) - return it->value; - - RegionList& regions = cache.regions[range]; - - for (auto begin = find_next_begin(cache, range.begin), - end = RegionAndMatch{ 0, cache.matches[0].begin_matches.end() }; - begin != end; ) - { - const RegionMatches& matches = cache.matches[begin.first]; - auto& region = m_regions.item(begin.first); - auto beg_it = begin.second; - auto end_it = matches.find_matching_end(buffer, beg_it->end_coord(), - region.value->match_capture() ? beg_it->capture(buffer) : Optional{}); - - if (end_it == matches.end_matches.end() or end_it->end_coord() >= range.end) // region continue past range end - { - auto begin_coord = beg_it->begin_coord(); - if (begin_coord < range.end) - regions.push_back({begin_coord, range.end, region.key}); - break; - } - - auto end_coord = end_it->end_coord(); - regions.push_back({beg_it->begin_coord(), end_coord, region.key}); - - // With empty begin and end matches (for example if the regexes - // are /"\K/ and /(?=")/), that case can happen, and would - // result in an infinite loop. 
- if (end_coord == beg_it->begin_coord()) - { - kak_assert(beg_it->empty() and end_it->empty()); - ++end_coord.column; - } - begin = find_next_begin(cache, end_coord); - } - return regions; - } - struct RegionHighlighter : public Highlighter { RegionHighlighter(std::unique_ptr&& delegate, - Regex begin, Regex end, Regex recurse, + String begin, String end, String recurse, bool match_capture) : Highlighter{delegate->passes()}, m_delegate{std::move(delegate)}, @@ -2329,33 +2118,275 @@ private: return m_delegate->highlight(context, display_buffer, range); } - void add_matches(const Buffer& buffer, LineRange range, RegionMatches& matches) const - { - if (m_default) - return; - - Kakoune::insert_matches(buffer, matches.begin_matches, m_begin, m_match_capture, range); - Kakoune::insert_matches(buffer, matches.end_matches, m_end, m_match_capture, range); - if (not m_recurse.empty()) - Kakoune::insert_matches(buffer, matches.recurse_matches, m_recurse, m_match_capture, range); - } bool match_capture() const { return m_match_capture; } bool is_default() const { return m_default; } Highlighter& delegate() { return *m_delegate; } - private: + // private: std::unique_ptr m_delegate; - Regex m_begin; - Regex m_end; - Regex m_recurse; + String m_begin; + String m_end; + String m_recurse; bool m_match_capture = false; bool m_default = false; }; + + using RegionAndMatch = std::pair; + + static bool compare_to_begin(const RegexMatch& lhs, BufferCoord rhs) + { + return lhs.begin_coord() < rhs; + } + + RegexMatchList::const_iterator find_matching_end(const Buffer& buffer, BufferCoord beg_pos, const RegexMatchList& end_matches, const RegexMatchList& recurse_matches, Optional capture) const + { + auto end_it = end_matches.begin(); + auto rec_it = recurse_matches.begin(); + int recurse_level = 0; + while (true) + { + end_it = std::lower_bound(end_it, end_matches.end(), beg_pos, + compare_to_begin); + rec_it = std::lower_bound(rec_it, recurse_matches.end(), beg_pos, + compare_to_begin); 
+ + if (end_it == end_matches.end()) + return end_it; + + while (rec_it != recurse_matches.end() and + rec_it->end_coord() <= end_it->end_coord()) + { + if (not capture or rec_it->capture(buffer) == *capture) + ++recurse_level; + ++rec_it; + } + + if (not capture or *capture == end_it->capture(buffer)) + { + if (recurse_level == 0) + return end_it; + --recurse_level; + } + + if (beg_pos != end_it->end_coord()) + beg_pos = end_it->end_coord(); + ++end_it; + } + } + + // find the begin closest to pos in all matches + Optional find_next_begin(const Cache& cache, BufferCoord pos) const + { + Optional res; + + for (size_t i = 0; i < m_regions.size(); ++i) + { + const auto& [key, region] = m_regions.item(i); + if (region->is_default()) + continue; + + const auto& matches = cache.matches.get(RegexKey{region->m_begin, region->match_capture()}); + auto it = std::lower_bound(matches.begin(), matches.end(), pos, compare_to_begin); + if (it != matches.end() and (not res or it->begin_coord() < res->second->begin_coord())) + res = RegionAndMatch{i, it}; + } + return res; + } + + void add_regex(const String& str, bool captures) + { + const RegexKey key{str, captures}; + if (str.empty() or m_regexes.contains(key)) + return; + + auto flags = RegexCompileFlags::Optimize; + if (not captures) + flags |= RegexCompileFlags::NoSubs; + + m_regexes.insert({key, Regex{str, flags}}); + } + + void add_matches(const Buffer& buffer, LineRange range, Cache& cache) const + { + for (auto& [key, regex] : m_regexes) + cache.matches[key]; + + struct Matcher + { + RegexMatchList& matches; + const Regex& regex; + size_t pivot = matches.size(); + ThreadedRegexVM vm{*regex.impl()}; + }; + Vector matchers; + for (auto& [key, regex] : m_regexes) + matchers.push_back(Matcher{cache.matches.get(key), regex}); + + for (auto line = range.begin; line < range.end; ++line) + { + const StringView l = buffer[line]; + const auto flags = RegexExecFlags::NotEndOfLine; // buffer line already ends with \n + + for (auto& 
[matches, regex, pivot, vm] : matchers) + { + for (auto&& m : RegexIterator{l.begin(), l.end(), vm, flags}) + { + const bool with_capture = regex.mark_count() > 0 and m[1].matched and + m[0].second - m[0].first < std::numeric_limits::max(); + matches.push_back({ + line, + (int)(m[0].first - l.begin()), + (int)(m[0].second - l.begin()), + (uint16_t)(with_capture ? m[1].first - m[0].first : 0), + (uint16_t)(with_capture ? m[1].second - m[1].first : 0) + }); + } + } + } + + for (auto& [matches, regex, pivot, vm] : matchers) + { + auto pos = std::lower_bound(matches.begin(), matches.begin() + pivot, range.begin, + [](const RegexMatch& m, LineCount l) { return m.line < l; }); + kak_assert(pos == matches.begin() + pivot or pos->line >= range.end); // We should not have had matches for range + + // Move new matches into position. + std::rotate(pos, matches.begin() + pivot, matches.end()); + } + } + + void update_changed_lines(const Buffer& buffer, ConstArrayView modifs, Cache& cache) + { + for (auto& [key, matches] : cache.matches) + { + // remove out of date matches and update line for others + auto ins_pos = matches.begin(); + for (auto it = ins_pos; it != matches.end(); ++it) + { + auto modif_it = std::upper_bound(modifs.begin(), modifs.end(), it->line, + [](const LineCount& l, const LineModification& c) + { return l < c.old_line; }); + + if (modif_it != modifs.begin()) + { + auto& prev = *(modif_it-1); + if (it->line < prev.old_line + prev.num_removed) + continue; // match removed + + it->line += prev.diff(); + } + + kak_assert(buffer.is_valid(it->begin_coord()) or + buffer[it->line].length() == it->begin); + kak_assert(buffer.is_valid(it->end_coord()) or + buffer[it->line].length() == it->end); + + if (ins_pos != it) + *ins_pos = std::move(*it); + ++ins_pos; + } + matches.erase(ins_pos, matches.end()); + } + } + + + bool update_matches(Cache& cache, const Buffer& buffer, LineRange range) + { + const size_t buffer_timestamp = buffer.timestamp(); + if 
(cache.buffer_timestamp == 0 or + cache.regions_timestamp != m_regions_timestamp) + { + m_regexes.clear(); + cache.matches.clear(); + for (auto& [key, region] : m_regions) + { + add_regex(region->m_begin, region->match_capture()); + add_regex(region->m_end, region->match_capture()); + add_regex(region->m_recurse, region->match_capture()); + } + + add_matches(buffer, range, cache); + cache.ranges.reset(range); + cache.buffer_timestamp = buffer_timestamp; + cache.regions_timestamp = m_regions_timestamp; + return true; + } + else + { + bool modified = false; + if (cache.buffer_timestamp != buffer_timestamp) + { + auto modifs = compute_line_modifications(buffer, cache.buffer_timestamp); + update_changed_lines(buffer, modifs, cache); + cache.ranges.update(modifs); + cache.buffer_timestamp = buffer_timestamp; + modified = true; + } + + cache.ranges.add_range(range, [&, this](const LineRange& range) { + if (range.begin == range.end) + return; + add_matches(buffer, range, cache); + modified = true; + }); + return modified; + } + } + + const RegionList& get_regions_for_range(const Buffer& buffer, BufferRange range) + { + Cache& cache = m_cache.get(buffer); + if (update_matches(cache, buffer, {range.begin.line, std::min(buffer.line_count(), range.end.line + 1)})) + cache.regions.clear(); + + auto it = cache.regions.find(range); + if (it != cache.regions.end()) + return it->value; + + RegionList& regions = cache.regions[range]; + RegexMatchList empty_matches{}; + + for (auto begin = find_next_begin(cache, range.begin); begin; ) + { + auto& [index, beg_it] = *begin; + auto& [name, region] = m_regions.item(index); + auto& end_matches = cache.matches.get(RegexKey{region->m_end, region->match_capture()}); + auto& recurse_matches = region->m_recurse.empty() ? + empty_matches : cache.matches.get(RegexKey{region->m_recurse, region->match_capture()}); + + auto end_it = find_matching_end(buffer, beg_it->end_coord(), end_matches, recurse_matches, + region->match_capture() ? 
beg_it->capture(buffer) : Optional{}); + + if (end_it == end_matches.end() or end_it->end_coord() >= range.end) // region continue past range end + { + auto begin_coord = beg_it->begin_coord(); + if (begin_coord < range.end) + regions.push_back({begin_coord, range.end, name}); + break; + } + + auto end_coord = end_it->end_coord(); + regions.push_back({beg_it->begin_coord(), end_coord, name}); + + // With empty begin and end matches (for example if the regexes + // are /"\K/ and /(?=")/), that case can happen, and would + // result in an infinite loop. + if (end_coord == beg_it->begin_coord()) + { + kak_assert(beg_it->empty() and end_it->empty()); + ++end_coord.column; + } + begin = find_next_begin(cache, end_coord); + } + return regions; + } + HashMap, MemoryDomain::Highlight> m_regions; + HashMap m_regexes; String m_default_region; size_t m_regions_timestamp = 0; diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 73d69790..b0ceeecf 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -209,6 +209,8 @@ public: (not forward and program.first_backward_inst != -1)); } + ThreadedRegexVM(ThreadedRegexVM&&) = default; + ThreadedRegexVM& operator=(ThreadedRegexVM&&) = default; ThreadedRegexVM(const ThreadedRegexVM&) = delete; ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete; @@ -617,6 +619,11 @@ private: { DualThreadStack() = default; DualThreadStack(const DualThreadStack&) = delete; + DualThreadStack(DualThreadStack&& other) + : m_data{other.m_data}, m_capacity{other.m_capacity}, m_current{other.m_current}, m_next{other.m_next} + { + other.m_data = nullptr; + } ~DualThreadStack() { delete[] m_data; } bool current_is_empty() const { return m_current == 0; }