Reuse existing character classes when possible in regex

This commit is contained in:
Maxime Coste 2022-08-05 20:29:43 +10:00
parent 26d14d52bb
commit ca71d8997d
2 changed files with 18 additions and 3 deletions

View File

@ -119,6 +119,7 @@ struct Children
Index operator*() const { return m_pos; }
bool operator!=(Sentinel) const { return m_pos != m_end; }
private:
Index find_prev(Index parent, Index pos) const
{
Index child = parent+1;
@ -544,8 +545,10 @@ private:
character_class.ranges.empty())
return add_node(ParsedRegex::CharType, (Codepoint)character_class.ctypes);
auto class_id = m_parsed_regex.character_classes.size();
m_parsed_regex.character_classes.push_back(std::move(character_class));
auto it = std::find(m_parsed_regex.character_classes.begin(), m_parsed_regex.character_classes.end(), character_class);
auto class_id = it - m_parsed_regex.character_classes.begin();
if (it == m_parsed_regex.character_classes.end())
m_parsed_regex.character_classes.push_back(std::move(character_class));
return add_node(ParsedRegex::CharClass, class_id);
}
@ -1536,6 +1539,12 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("\t\n\v\f\r"));
}
{
TestVM<> vm{R"([\t-\r]\h+[\t-\r])"};
kak_assert(vm.character_classes.size() == 1);
kak_assert(vm.exec("\n \f"));
}
{
TestVM<> vm{R"([^\x00-\x7F]+)"};
kak_assert(not vm.exec("ascii"));

View File

@ -33,12 +33,18 @@ constexpr bool with_bit_ops(Meta::Type<CharacterType>) { return true; }
// Describes one character class of a regex: a list of codepoint ranges,
// a CharacterType mask, and two boolean modifiers.
struct CharacterClass
{
// NOTE(review): this one-line Range and the expanded definition right below
// look like the before/after sides of a diff hunk; a real translation unit
// can only keep one of them -- confirm which one is current.
struct Range { Codepoint min, max; };
// A pair of codepoints delimiting a range (presumably inclusive on both
// ends -- confirm against is_character_class).
struct Range
{
Codepoint min, max;
// Member-wise equality: two Ranges are equal when both min and max match.
friend bool operator==(const Range&, const Range&) = default;
};
// Explicit codepoint ranges belonging to the class.
Vector<Range, MemoryDomain::Regex> ranges;
// Character type mask; defaults to CharacterType::None.
CharacterType ctypes = CharacterType::None;
// Presumably marks a negated class such as [^...] -- confirm in the parser.
bool negative = false;
// Presumably requests case-insensitive matching -- confirm in is_character_class.
bool ignore_case = false;
// Member-wise equality over ranges, ctypes, negative and ignore_case.
// The parser relies on it to look up an already-registered identical class
// with std::find before appending a new one.
// NOTE(review): ranges are compared as stored, so classes holding the same
// ranges in a different order presumably compare unequal -- confirm.
friend bool operator==(const CharacterClass&, const CharacterClass&) = default;
};
bool is_character_class(const CharacterClass& character_class, Codepoint cp);