Optimize Regex CharacterClass matching

Take advantage of the ranges being sorted to exit early, and make the
logic inline.
This commit is contained in:
Maxime Coste 2023-02-19 11:15:31 +11:00
parent afaa47e93f
commit f115af7a57
2 changed files with 22 additions and 20 deletions

View File

@ -959,7 +959,7 @@ private:
{ {
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp) for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
{ {
if (start_desc.map[cp] or is_character_class(character_class, cp)) if (start_desc.map[cp] or character_class.matches(cp))
start_desc.map[cp] = true; start_desc.map[cp] = true;
} }
} }
@ -1165,20 +1165,6 @@ CompiledRegex compile_regex(StringView re, RegexCompileFlags flags)
return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex(); return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
} }
// Tell whether codepoint `cp` is matched by `character_class`.
//
// A codepoint is considered "found" when it lies inside one of the class's
// inclusive [min, max] ranges, or when the class carries character types
// (ctypes != None) and is_ctype() accepts it. The final result is inverted
// for negative classes.
bool is_character_class(const CharacterClass& character_class, Codepoint cp)
{
// Case-insensitive classes test the lowercased codepoint
// (presumably the stored ranges are lowercase as well; confirm in the compiler).
if (character_class.ignore_case)
cp = to_lower(cp);
// Linear scan over every range; both bounds are inclusive.
auto it = std::find_if(character_class.ranges.begin(),
character_class.ranges.end(),
[cp](auto& range) { return range.min <= cp and cp <= range.max; });
// is_ctype() is only consulted when no range matched and ctypes is not None.
bool found = it != character_class.ranges.end() or (character_class.ctypes != CharacterType::None and
is_ctype(character_class.ctypes, cp));
// `found != negative` flips the answer for negated classes.
return found != character_class.negative;
}
bool is_ctype(CharacterType ctype, Codepoint cp) bool is_ctype(CharacterType ctype, Codepoint cp)
{ {
auto check = [&](CharacterType bit, CharacterType not_bit, auto&& func) { auto check = [&](CharacterType bit, CharacterType not_bit, auto&& func) {

View File

@ -31,6 +31,8 @@ enum class CharacterType : unsigned char
}; };
constexpr bool with_bit_ops(Meta::Type<CharacterType>) { return true; } constexpr bool with_bit_ops(Meta::Type<CharacterType>) { return true; }
bool is_ctype(CharacterType ctype, Codepoint cp);
struct CharacterClass struct CharacterClass
{ {
struct Range struct Range
@ -45,10 +47,24 @@ struct CharacterClass
bool ignore_case = false; bool ignore_case = false;
friend bool operator==(const CharacterClass&, const CharacterClass&) = default; friend bool operator==(const CharacterClass&, const CharacterClass&) = default;
};
bool is_character_class(const CharacterClass& character_class, Codepoint cp); bool matches(Codepoint cp) const
bool is_ctype(CharacterType ctype, Codepoint cp); {
// Tell whether codepoint `cp` is matched by this character class:
// inside one of the inclusive ranges, or accepted by is_ctype() when
// ctypes is not None, with the answer inverted for negative classes.
// Case-insensitive classes test the lowercased codepoint.
if (ignore_case)
cp = to_lower(cp);
for (auto& range : ranges)
{
// Early out: assumes `ranges` is sorted in ascending order (as the commit
// message states), so once cp is below a range's min no later range can match.
if (cp < range.min)
break;
// cp >= range.min is known here, so this is the inclusive upper-bound check.
else if (cp <= range.max)
return not negative;
}
// No range matched: fall back to the character-type test, flipped when negative.
return (ctypes != CharacterType::None and is_ctype(ctypes, cp)) != negative;
}
};
struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex> struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
{ {
@ -418,7 +434,7 @@ private:
case CompiledRegex::CharClass: case CompiledRegex::CharClass:
if (pos == config.end) if (pos == config.end)
return failed(); return failed();
return is_character_class(m_program.character_classes[inst.param.character_class_index], codepoint(pos, config)) ? return m_program.character_classes[inst.param.character_class_index].matches(codepoint(pos, config)) ?
consumed() : failed(); consumed() : failed();
case CompiledRegex::CharType: case CompiledRegex::CharType:
if (pos == config.end) if (pos == config.end)
@ -552,7 +568,7 @@ private:
else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType) else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType)
{ {
auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass); auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass);
if (not is_character_class(m_program.character_classes[index], cp)) if (not m_program.character_classes[index].matches(cp))
return false; return false;
} }
else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd) else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)