Regex: Add support for \h and \H "horizontal blank" character classes

This commit is contained in:
Maxime Coste 2017-09-27 14:04:05 +08:00
parent 4ac0d35d1e
commit e4004a7b7f

View File

@@ -222,9 +222,9 @@ private:
{ {
auto matcher_id = m_parsed_regex.matchers.size(); auto matcher_id = m_parsed_regex.matchers.size();
m_parsed_regex.matchers.push_back( m_parsed_regex.matchers.push_back(
[ctype = wctype(character_class.ctype), [ctype = character_class.ctype ? wctype(character_class.ctype) : (wctype_t)0,
chars = character_class.additional_chars] (Codepoint cp) { chars = character_class.additional_chars] (Codepoint cp) {
return iswctype(cp, ctype) or contains(chars, cp); return (ctype != 0 and iswctype(cp, ctype)) or contains(chars, cp);
}); });
return new_node(ParsedRegex::Matcher, matcher_id); return new_node(ParsedRegex::Matcher, matcher_id);
} }
@@ -255,6 +255,7 @@ private:
struct CharRange { Codepoint min, max; }; struct CharRange { Codepoint min, max; };
Vector<CharRange> ranges; Vector<CharRange> ranges;
Vector<Codepoint> excluded;
Vector<std::pair<wctype_t, bool>> ctypes; Vector<std::pair<wctype_t, bool>> ctypes;
while (m_pos != m_regex.end() and *m_pos != ']') while (m_pos != m_regex.end() and *m_pos != ']')
{ {
@@ -274,9 +275,15 @@ private:
[cp = *m_pos](auto& t) { return t.cp == cp; }); [cp = *m_pos](auto& t) { return t.cp == cp; });
if (it != std::end(character_class_escapes)) if (it != std::end(character_class_escapes))
{ {
if (it->ctype)
ctypes.push_back({wctype(it->ctype), not it->neg}); ctypes.push_back({wctype(it->ctype), not it->neg});
for (auto& c : it->additional_chars) for (auto& c : it->additional_chars) // TODO: handle negative case
{
if (it->neg)
excluded.push_back((Codepoint)c);
else
ranges.push_back({(Codepoint)c, (Codepoint)c}); ranges.push_back({(Codepoint)c, (Codepoint)c});
}
++m_pos; ++m_pos;
continue; continue;
} }
@@ -306,12 +313,13 @@ private:
++m_pos; ++m_pos;
auto matcher = [ranges = std::move(ranges), auto matcher = [ranges = std::move(ranges),
ctypes = std::move(ctypes), negative] (Codepoint cp) { ctypes = std::move(ctypes),
excluded = std::move(excluded), negative] (Codepoint cp) {
auto found = contains_that(ranges, [cp](auto& r) { auto found = contains_that(ranges, [cp](auto& r) {
return r.min <= cp and cp <= r.max; return r.min <= cp and cp <= r.max;
}) or contains_that(ctypes, [cp](auto& c) { }) or contains_that(ctypes, [cp](auto& c) {
return (bool)iswctype(cp, c.first) == c.second; return (bool)iswctype(cp, c.first) == c.second;
}); }) or (not excluded.empty() and not contains(excluded, cp));
return negative ? not found : found; return negative ? not found : found;
}; };
@@ -390,17 +398,19 @@ private:
bool neg; bool neg;
}; };
static const CharacterClassEscape character_class_escapes[6]; static const CharacterClassEscape character_class_escapes[8];
}; };
// For some reason Gcc fails to link if this is constexpr // For some reason Gcc fails to link if this is constexpr
const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6] = { const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8] = {
{ 'd', "digit", "", false }, { 'd', "digit", "", false },
{ 'D', "digit", "", true }, { 'D', "digit", "", true },
{ 'w', "alnum", "_", false }, { 'w', "alnum", "_", false },
{ 'W', "alnum", "_", true }, { 'W', "alnum", "_", true },
{ 's', "space", "", false }, { 's', "space", "", false },
{ 's', "space", "", true }, { 'S', "space", "", true },
{ 'h', nullptr, " \t", false },
{ 'H', nullptr, " \t", true },
}; };
struct CompiledRegex struct CompiledRegex
@@ -982,6 +992,12 @@ auto test_regex = UnitTest{[]{
kak_assert(not vm.exec("123_456")); kak_assert(not vm.exec("123_456"));
} }
{
TestVM vm{R"([ \H]+)"};
kak_assert(vm.exec("abc "));
kak_assert(not vm.exec("a \t"));
}
{ {
TestVM vm{R"(\Q{}[]*+?\Ea+)"}; TestVM vm{R"(\Q{}[]*+?\Ea+)"};
kak_assert(vm.exec("{}[]*+?aa")); kak_assert(vm.exec("{}[]*+?aa"));