Simplify and accelerate start desc map

Store a value for every possible byte, and set the entries for the
UTF-8 multi-byte start bytes when necessary.
This commit is contained in:
Maxime Coste 2024-03-13 17:29:05 +11:00
parent c956413046
commit c4df0fac52
2 changed files with 21 additions and 17 deletions

View File

@ -17,7 +17,6 @@
namespace Kakoune namespace Kakoune
{ {
constexpr Codepoint CompiledRegex::StartDesc::other;
constexpr Codepoint CompiledRegex::StartDesc::count; constexpr Codepoint CompiledRegex::StartDesc::count;
struct ParsedRegex struct ParsedRegex
@ -893,11 +892,17 @@ private:
bool compute_start_desc(ParsedRegex::NodeIndex index, bool compute_start_desc(ParsedRegex::NodeIndex index,
CompiledRegex::StartDesc& start_desc) const CompiledRegex::StartDesc& start_desc) const
{ {
// mark every byte that can start a UTF-8 multi-byte sequence (lead bytes 0xC0..0xF7)
auto add_multi_byte_utf8 = [&] {
std::fill(start_desc.map + 0b11000000, start_desc.map + 0b11111000, true);
};
static constexpr Codepoint single_byte_limit = 128;
auto& node = get_node(index); auto& node = get_node(index);
switch (node.op) switch (node.op)
{ {
case ParsedRegex::Literal: case ParsedRegex::Literal:
if (node.value < CompiledRegex::StartDesc::count) if (node.value < single_byte_limit)
{ {
if (node.ignore_case) if (node.ignore_case)
{ {
@ -908,14 +913,14 @@ private:
start_desc.map[node.value] = true; start_desc.map[node.value] = true;
} }
else else
start_desc.map[CompiledRegex::StartDesc::other] = true; add_multi_byte_utf8();
return node.quantifier.allows_none(); return node.quantifier.allows_none();
case ParsedRegex::AnyChar: case ParsedRegex::AnyChar:
for (auto& b : start_desc.map) for (auto& b : start_desc.map)
b = true; b = true;
return node.quantifier.allows_none(); return node.quantifier.allows_none();
case ParsedRegex::AnyCharExceptNewLine: case ParsedRegex::AnyCharExceptNewLine:
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp) for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{ {
if (cp != '\n') if (cp != '\n')
start_desc.map[cp] = true; start_desc.map[cp] = true;
@ -930,33 +935,33 @@ private:
{ {
for (auto& range : character_class.ranges) for (auto& range : character_class.ranges)
{ {
const auto clamp = [](Codepoint cp) { return std::min(CompiledRegex::StartDesc::count, cp); }; const auto clamp = [](Codepoint cp) { return std::min(single_byte_limit, cp); };
for (auto cp = clamp(range.min), end = clamp(range.max + 1); cp < end; ++cp) for (auto cp = clamp(range.min), end = clamp(range.max + 1); cp < end; ++cp)
start_desc.map[cp] = true; start_desc.map[cp] = true;
if (range.max >= CompiledRegex::StartDesc::count) if (range.max >= single_byte_limit)
start_desc.map[CompiledRegex::StartDesc::other] = true; add_multi_byte_utf8();
} }
} }
else else
{ {
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp) for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{ {
if (start_desc.map[cp] or character_class.matches(cp)) if (start_desc.map[cp] or character_class.matches(cp))
start_desc.map[cp] = true; start_desc.map[cp] = true;
} }
} }
start_desc.map[CompiledRegex::StartDesc::other] = true; add_multi_byte_utf8();
return node.quantifier.allows_none(); return node.quantifier.allows_none();
} }
case ParsedRegex::CharType: case ParsedRegex::CharType:
{ {
const CharacterType ctype = (CharacterType)node.value; const CharacterType ctype = (CharacterType)node.value;
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp) for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{ {
if (is_ctype(ctype, cp)) if (is_ctype(ctype, cp))
start_desc.map[cp] = true; start_desc.map[cp] = true;
} }
start_desc.map[CompiledRegex::StartDesc::other] = true; add_multi_byte_utf8();
return node.quantifier.allows_none(); return node.quantifier.allows_none();
} }
case ParsedRegex::Sequence: case ParsedRegex::Sequence:

View File

@ -152,8 +152,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
struct StartDesc : UseMemoryDomain<MemoryDomain::Regex> struct StartDesc : UseMemoryDomain<MemoryDomain::Regex>
{ {
static constexpr Codepoint count = 128; static constexpr Codepoint count = 256;
static constexpr Codepoint other = 0;
bool map[count]; bool map[count];
}; };
@ -277,7 +276,7 @@ public:
else if (start != config.end) else if (start != config.end)
{ {
const unsigned char c = forward ? *start : *utf8::previous(start, config.end); const unsigned char c = forward ? *start : *utf8::previous(start, config.end);
if (not start_desc->map[(c < StartDesc::count) ? c : StartDesc::other]) if (not start_desc->map[c])
return false; return false;
} }
} }
@ -519,11 +518,11 @@ private:
{ {
while (start != config.end) while (start != config.end)
{ {
static_assert(StartDesc::count <= 128, "start desc should be ascii only"); static_assert(StartDesc::count <= 256, "start desc should be ascii only");
if constexpr (forward) if constexpr (forward)
{ {
const unsigned char c = *start; const unsigned char c = *start;
if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other]) if (start_desc.map[c])
return; return;
++start; ++start;
} }
@ -531,7 +530,7 @@ private:
{ {
auto prev = utf8::previous(start, config.end); auto prev = utf8::previous(start, config.end);
const unsigned char c = *prev; const unsigned char c = *prev;
if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other]) if (start_desc.map[c])
return; return;
start = prev; start = prev;
} }