Regex: Add support for \h and \H "horizontal blank" character classes
This commit is contained in:
parent
4ac0d35d1e
commit
e4004a7b7f
|
@ -222,9 +222,9 @@ private:
|
||||||
{
|
{
|
||||||
auto matcher_id = m_parsed_regex.matchers.size();
|
auto matcher_id = m_parsed_regex.matchers.size();
|
||||||
m_parsed_regex.matchers.push_back(
|
m_parsed_regex.matchers.push_back(
|
||||||
[ctype = wctype(character_class.ctype),
|
[ctype = character_class.ctype ? wctype(character_class.ctype) : (wctype_t)0,
|
||||||
chars = character_class.additional_chars] (Codepoint cp) {
|
chars = character_class.additional_chars] (Codepoint cp) {
|
||||||
return iswctype(cp, ctype) or contains(chars, cp);
|
return (ctype != 0 and iswctype(cp, ctype)) or contains(chars, cp);
|
||||||
});
|
});
|
||||||
return new_node(ParsedRegex::Matcher, matcher_id);
|
return new_node(ParsedRegex::Matcher, matcher_id);
|
||||||
}
|
}
|
||||||
|
@ -255,6 +255,7 @@ private:
|
||||||
|
|
||||||
struct CharRange { Codepoint min, max; };
|
struct CharRange { Codepoint min, max; };
|
||||||
Vector<CharRange> ranges;
|
Vector<CharRange> ranges;
|
||||||
|
Vector<Codepoint> excluded;
|
||||||
Vector<std::pair<wctype_t, bool>> ctypes;
|
Vector<std::pair<wctype_t, bool>> ctypes;
|
||||||
while (m_pos != m_regex.end() and *m_pos != ']')
|
while (m_pos != m_regex.end() and *m_pos != ']')
|
||||||
{
|
{
|
||||||
|
@ -274,9 +275,15 @@ private:
|
||||||
[cp = *m_pos](auto& t) { return t.cp == cp; });
|
[cp = *m_pos](auto& t) { return t.cp == cp; });
|
||||||
if (it != std::end(character_class_escapes))
|
if (it != std::end(character_class_escapes))
|
||||||
{
|
{
|
||||||
|
if (it->ctype)
|
||||||
ctypes.push_back({wctype(it->ctype), not it->neg});
|
ctypes.push_back({wctype(it->ctype), not it->neg});
|
||||||
for (auto& c : it->additional_chars)
|
for (auto& c : it->additional_chars) // TODO: handle negative case
|
||||||
|
{
|
||||||
|
if (it->neg)
|
||||||
|
excluded.push_back((Codepoint)c);
|
||||||
|
else
|
||||||
ranges.push_back({(Codepoint)c, (Codepoint)c});
|
ranges.push_back({(Codepoint)c, (Codepoint)c});
|
||||||
|
}
|
||||||
++m_pos;
|
++m_pos;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -306,12 +313,13 @@ private:
|
||||||
++m_pos;
|
++m_pos;
|
||||||
|
|
||||||
auto matcher = [ranges = std::move(ranges),
|
auto matcher = [ranges = std::move(ranges),
|
||||||
ctypes = std::move(ctypes), negative] (Codepoint cp) {
|
ctypes = std::move(ctypes),
|
||||||
|
excluded = std::move(excluded), negative] (Codepoint cp) {
|
||||||
auto found = contains_that(ranges, [cp](auto& r) {
|
auto found = contains_that(ranges, [cp](auto& r) {
|
||||||
return r.min <= cp and cp <= r.max;
|
return r.min <= cp and cp <= r.max;
|
||||||
}) or contains_that(ctypes, [cp](auto& c) {
|
}) or contains_that(ctypes, [cp](auto& c) {
|
||||||
return (bool)iswctype(cp, c.first) == c.second;
|
return (bool)iswctype(cp, c.first) == c.second;
|
||||||
});
|
}) or (not excluded.empty() and not contains(excluded, cp));
|
||||||
return negative ? not found : found;
|
return negative ? not found : found;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -390,17 +398,19 @@ private:
|
||||||
bool neg;
|
bool neg;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const CharacterClassEscape character_class_escapes[6];
|
static const CharacterClassEscape character_class_escapes[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
// For some reason Gcc fails to link if this is constexpr
|
// For some reason Gcc fails to link if this is constexpr
|
||||||
const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6] = {
|
const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8] = {
|
||||||
{ 'd', "digit", "", false },
|
{ 'd', "digit", "", false },
|
||||||
{ 'D', "digit", "", true },
|
{ 'D', "digit", "", true },
|
||||||
{ 'w', "alnum", "_", false },
|
{ 'w', "alnum", "_", false },
|
||||||
{ 'W', "alnum", "_", true },
|
{ 'W', "alnum", "_", true },
|
||||||
{ 's', "space", "", false },
|
{ 's', "space", "", false },
|
||||||
{ 's', "space", "", true },
|
{ 'S', "space", "", true },
|
||||||
|
{ 'h', nullptr, " \t", false },
|
||||||
|
{ 'H', nullptr, " \t", true },
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CompiledRegex
|
struct CompiledRegex
|
||||||
|
@ -982,6 +992,12 @@ auto test_regex = UnitTest{[]{
|
||||||
kak_assert(not vm.exec("123_456"));
|
kak_assert(not vm.exec("123_456"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM vm{R"([ \H]+)"};
|
||||||
|
kak_assert(vm.exec("abc "));
|
||||||
|
kak_assert(not vm.exec("a \t"));
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM vm{R"(\Q{}[]*+?\Ea+)"};
|
TestVM vm{R"(\Q{}[]*+?\Ea+)"};
|
||||||
kak_assert(vm.exec("{}[]*+?aa"));
|
kak_assert(vm.exec("{}[]*+?aa"));
|
||||||
|
|
Loading…
Reference in New Issue
Block a user