Optimize Regex CharacterClass matching
Take advantage of the ranges being sorted to exit the search early, and make the matching logic inline.
This commit is contained in:
parent
afaa47e93f
commit
f115af7a57
|
@ -959,7 +959,7 @@ private:
|
||||||
{
|
{
|
||||||
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
|
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
|
||||||
{
|
{
|
||||||
if (start_desc.map[cp] or is_character_class(character_class, cp))
|
if (start_desc.map[cp] or character_class.matches(cp))
|
||||||
start_desc.map[cp] = true;
|
start_desc.map[cp] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1165,20 +1165,6 @@ CompiledRegex compile_regex(StringView re, RegexCompileFlags flags)
|
||||||
return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
|
return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_character_class(const CharacterClass& character_class, Codepoint cp)
|
|
||||||
{
|
|
||||||
if (character_class.ignore_case)
|
|
||||||
cp = to_lower(cp);
|
|
||||||
|
|
||||||
auto it = std::find_if(character_class.ranges.begin(),
|
|
||||||
character_class.ranges.end(),
|
|
||||||
[cp](auto& range) { return range.min <= cp and cp <= range.max; });
|
|
||||||
|
|
||||||
bool found = it != character_class.ranges.end() or (character_class.ctypes != CharacterType::None and
|
|
||||||
is_ctype(character_class.ctypes, cp));
|
|
||||||
return found != character_class.negative;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool is_ctype(CharacterType ctype, Codepoint cp)
|
bool is_ctype(CharacterType ctype, Codepoint cp)
|
||||||
{
|
{
|
||||||
auto check = [&](CharacterType bit, CharacterType not_bit, auto&& func) {
|
auto check = [&](CharacterType bit, CharacterType not_bit, auto&& func) {
|
||||||
|
|
|
@ -31,6 +31,8 @@ enum class CharacterType : unsigned char
|
||||||
};
|
};
|
||||||
constexpr bool with_bit_ops(Meta::Type<CharacterType>) { return true; }
|
constexpr bool with_bit_ops(Meta::Type<CharacterType>) { return true; }
|
||||||
|
|
||||||
|
bool is_ctype(CharacterType ctype, Codepoint cp);
|
||||||
|
|
||||||
struct CharacterClass
|
struct CharacterClass
|
||||||
{
|
{
|
||||||
struct Range
|
struct Range
|
||||||
|
@ -45,10 +47,24 @@ struct CharacterClass
|
||||||
bool ignore_case = false;
|
bool ignore_case = false;
|
||||||
|
|
||||||
friend bool operator==(const CharacterClass&, const CharacterClass&) = default;
|
friend bool operator==(const CharacterClass&, const CharacterClass&) = default;
|
||||||
};
|
|
||||||
|
|
||||||
bool is_character_class(const CharacterClass& character_class, Codepoint cp);
|
bool matches(Codepoint cp) const
|
||||||
bool is_ctype(CharacterType ctype, Codepoint cp);
|
{
|
||||||
|
if (ignore_case)
|
||||||
|
cp = to_lower(cp);
|
||||||
|
|
||||||
|
for (auto& range : ranges)
|
||||||
|
{
|
||||||
|
if (cp < range.min)
|
||||||
|
break;
|
||||||
|
else if (cp <= range.max)
|
||||||
|
return not negative;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (ctypes != CharacterType::None and is_ctype(ctypes, cp)) != negative;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
{
|
{
|
||||||
|
@ -418,7 +434,7 @@ private:
|
||||||
case CompiledRegex::CharClass:
|
case CompiledRegex::CharClass:
|
||||||
if (pos == config.end)
|
if (pos == config.end)
|
||||||
return failed();
|
return failed();
|
||||||
return is_character_class(m_program.character_classes[inst.param.character_class_index], codepoint(pos, config)) ?
|
return m_program.character_classes[inst.param.character_class_index].matches(codepoint(pos, config)) ?
|
||||||
consumed() : failed();
|
consumed() : failed();
|
||||||
case CompiledRegex::CharType:
|
case CompiledRegex::CharType:
|
||||||
if (pos == config.end)
|
if (pos == config.end)
|
||||||
|
@ -552,7 +568,7 @@ private:
|
||||||
else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType)
|
else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType)
|
||||||
{
|
{
|
||||||
auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass);
|
auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass);
|
||||||
if (not is_character_class(m_program.character_classes[index], cp))
|
if (not m_program.character_classes[index].matches(cp))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)
|
else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user