Optimize Regex CharacterClass matching
Take advantage of the ranges being sorted to exit early, and make the logic inline.
This commit is contained in:
parent
afaa47e93f
commit
f115af7a57
|
@ -959,7 +959,7 @@ private:
|
|||
{
|
||||
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
|
||||
{
|
||||
if (start_desc.map[cp] or is_character_class(character_class, cp))
|
||||
if (start_desc.map[cp] or character_class.matches(cp))
|
||||
start_desc.map[cp] = true;
|
||||
}
|
||||
}
|
||||
|
@ -1165,20 +1165,6 @@ CompiledRegex compile_regex(StringView re, RegexCompileFlags flags)
|
|||
return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
|
||||
}
|
||||
|
||||
bool is_character_class(const CharacterClass& character_class, Codepoint cp)
|
||||
{
|
||||
if (character_class.ignore_case)
|
||||
cp = to_lower(cp);
|
||||
|
||||
auto it = std::find_if(character_class.ranges.begin(),
|
||||
character_class.ranges.end(),
|
||||
[cp](auto& range) { return range.min <= cp and cp <= range.max; });
|
||||
|
||||
bool found = it != character_class.ranges.end() or (character_class.ctypes != CharacterType::None and
|
||||
is_ctype(character_class.ctypes, cp));
|
||||
return found != character_class.negative;
|
||||
}
|
||||
|
||||
bool is_ctype(CharacterType ctype, Codepoint cp)
|
||||
{
|
||||
auto check = [&](CharacterType bit, CharacterType not_bit, auto&& func) {
|
||||
|
|
|
@ -31,6 +31,8 @@ enum class CharacterType : unsigned char
|
|||
};
|
||||
constexpr bool with_bit_ops(Meta::Type<CharacterType>) { return true; }
|
||||
|
||||
bool is_ctype(CharacterType ctype, Codepoint cp);
|
||||
|
||||
struct CharacterClass
|
||||
{
|
||||
struct Range
|
||||
|
@ -45,10 +47,24 @@ struct CharacterClass
|
|||
bool ignore_case = false;
|
||||
|
||||
friend bool operator==(const CharacterClass&, const CharacterClass&) = default;
|
||||
};
|
||||
|
||||
bool is_character_class(const CharacterClass& character_class, Codepoint cp);
|
||||
bool is_ctype(CharacterType ctype, Codepoint cp);
|
||||
// Tells whether `cp` belongs to this character class.
//
// The codepoint is lower-cased first when `ignore_case` is set. The scan over
// `ranges` stops at the first range that starts past `cp`, which relies on the
// ranges being sorted by ascending `min`. When no range contains `cp`, the
// `ctypes` mask decides; `negative` inverts the outcome either way.
bool matches(Codepoint cp) const
{
    if (ignore_case)
        cp = to_lower(cp);

    for (const auto& candidate : ranges)
    {
        // Every remaining range starts after cp: nothing further can match.
        if (cp < candidate.min)
            break;

        // Here candidate.min <= cp, so only the upper bound is left to check.
        if (cp <= candidate.max)
            return not negative;
    }

    const bool in_ctypes = ctypes != CharacterType::None and is_ctype(ctypes, cp);
    return in_ctypes != negative;
}
|
||||
|
||||
};
|
||||
|
||||
struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||
{
|
||||
|
@ -418,7 +434,7 @@ private:
|
|||
case CompiledRegex::CharClass:
|
||||
if (pos == config.end)
|
||||
return failed();
|
||||
return is_character_class(m_program.character_classes[inst.param.character_class_index], codepoint(pos, config)) ?
|
||||
return m_program.character_classes[inst.param.character_class_index].matches(codepoint(pos, config)) ?
|
||||
consumed() : failed();
|
||||
case CompiledRegex::CharType:
|
||||
if (pos == config.end)
|
||||
|
@ -552,7 +568,7 @@ private:
|
|||
else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType)
|
||||
{
|
||||
auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass);
|
||||
if (not is_character_class(m_program.character_classes[index], cp))
|
||||
if (not m_program.character_classes[index].matches(cp))
|
||||
return false;
|
||||
}
|
||||
else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)
|
||||
|
|
Loading…
Reference in New Issue
Block a user