From c4df0fac52c83fe68b7f583de1f419c976645dc0 Mon Sep 17 00:00:00 2001
From: Maxime Coste
Date: Wed, 13 Mar 2024 17:29:05 +1100
Subject: [PATCH] Simplify and accelerate start desc map

Store values for all possible bytes and fill utf8 multi byte start
values when necessary. This includes AnyCharExceptNewLine, which can
match any non ascii codepoint.
---
 src/regex_impl.cc | 28 +++++++++++++++++-----------
 src/regex_impl.hh | 11 +++++------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 33d7a88a..b1630a1f 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -17,7 +17,6 @@
 namespace Kakoune
 {
 
-constexpr Codepoint CompiledRegex::StartDesc::other;
 constexpr Codepoint CompiledRegex::StartDesc::count;
 
 struct ParsedRegex
@@ -893,11 +892,17 @@ private:
     bool compute_start_desc(ParsedRegex::NodeIndex index,
                             CompiledRegex::StartDesc& start_desc) const
     {
+        // mark every byte that can start a utf8 multi byte sequence
+        auto add_multi_byte_utf8 = [&] {
+            std::fill(start_desc.map + 0b11000000, start_desc.map + 0b11111000, true);
+        };
+        static constexpr Codepoint single_byte_limit = 128;
+
         auto& node = get_node(index);
         switch (node.op)
         {
             case ParsedRegex::Literal:
-                if (node.value < CompiledRegex::StartDesc::count)
+                if (node.value < single_byte_limit)
                 {
                     if (node.ignore_case)
                     {
@@ -908,16 +913,17 @@ private:
                         start_desc.map[node.value] = true;
                 }
                 else
-                    start_desc.map[CompiledRegex::StartDesc::other] = true;
+                    add_multi_byte_utf8();
                 return node.quantifier.allows_none();
             case ParsedRegex::AnyChar:
                 for (auto& b : start_desc.map)
                     b = true;
                 return node.quantifier.allows_none();
             case ParsedRegex::AnyCharExceptNewLine:
-                for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+                for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
                 {
                     if (cp != '\n')
                         start_desc.map[cp] = true;
                 }
+                add_multi_byte_utf8(); // '.' matches non ascii codepoints as well
                 return node.quantifier.allows_none();
@@ -930,33 +936,33 @@ private:
                 {
                     for (auto& range : character_class.ranges)
                     {
-                        const auto clamp = [](Codepoint cp) { return std::min(CompiledRegex::StartDesc::count, cp); };
+                        const auto clamp = [](Codepoint cp) { return std::min(single_byte_limit, cp); };
                         for (auto cp = clamp(range.min), end = clamp(range.max + 1); cp < end; ++cp)
                             start_desc.map[cp] = true;
-                        if (range.max >= CompiledRegex::StartDesc::count)
-                            start_desc.map[CompiledRegex::StartDesc::other] = true;
+                        if (range.max >= single_byte_limit)
+                            add_multi_byte_utf8();
                     }
                 }
                 else
                 {
-                    for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+                    for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
                     {
                         if (start_desc.map[cp] or character_class.matches(cp))
                             start_desc.map[cp] = true;
                     }
                 }
-                start_desc.map[CompiledRegex::StartDesc::other] = true;
+                add_multi_byte_utf8();
                 return node.quantifier.allows_none();
             }
             case ParsedRegex::CharType:
             {
                 const CharacterType ctype = (CharacterType)node.value;
-                for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+                for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
                 {
                     if (is_ctype(ctype, cp))
                         start_desc.map[cp] = true;
                 }
-                start_desc.map[CompiledRegex::StartDesc::other] = true;
+                add_multi_byte_utf8();
                 return node.quantifier.allows_none();
             }
             case ParsedRegex::Sequence:
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 90b797c8..7997994c 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -152,8 +152,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain
 
     struct StartDesc : UseMemoryDomain
     {
-        static constexpr Codepoint count = 128;
-        static constexpr Codepoint other = 0;
+        static constexpr Codepoint count = 256;
         bool map[count];
     };
 
@@ -277,7 +276,7 @@ public:
             else if (start != config.end)
             {
                 const unsigned char c = forward ? *start : *utf8::previous(start, config.end);
-                if (not start_desc->map[(c < StartDesc::count) ? c : StartDesc::other])
+                if (not start_desc->map[c])
                     return false;
             }
         }
@@ -519,11 +518,11 @@ private:
     {
         while (start != config.end)
         {
-            static_assert(StartDesc::count <= 128, "start desc should be ascii only");
+            static_assert(StartDesc::count >= 256, "start desc must cover every possible byte value");
             if constexpr (forward)
             {
                 const unsigned char c = *start;
-                if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other])
+                if (start_desc.map[c])
                     return;
                 ++start;
             }
@@ -531,7 +530,7 @@ private:
             {
                 auto prev = utf8::previous(start, config.end);
                 const unsigned char c = *prev;
-                if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other])
+                if (start_desc.map[c])
                     return;
                 start = prev;
             }