Support \x and \u escapes in regex character classes

Change \u to use 6 hexadecimal digits so that it covers the full Unicode range.

Fixes #3172
This commit is contained in:
Maxime Coste 2019-11-06 20:48:48 +11:00
parent 3816f1fa43
commit 3e7301ede7
2 changed files with 66 additions and 35 deletions

View File

@ -23,7 +23,7 @@ Some literals are available as escape sequences:
* `\0` matches the null character. * `\0` matches the null character.
* `\cX` matches the control-X character (X can be in `[A-Za-z]`). * `\cX` matches the control-X character (X can be in `[A-Za-z]`).
* `\xXX` matches the character whose codepoint is XX (in hexadecimal). * `\xXX` matches the character whose codepoint is XX (in hexadecimal).
* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal). * `\uXXXXXX` matches the character whose codepoint is XXXXXX (in hexadecimal).
== Character classes == Character classes
@ -185,3 +185,5 @@ exists for ease of use or performance reasons:
escapes, identity escapes like `\X` with X a non-special character escapes, identity escapes like `\X` with X a non-special character
are not accepted, to avoid confusions between `\h` meaning literal are not accepted, to avoid confusions between `\h` meaning literal
`h` in ECMAScript, and horizontal blank in Kakoune. `h` in ECMAScript, and horizontal blank in Kakoune.
* `\uXXXXXX` uses 6 hexadecimal digits to cover all of Unicode, instead of
relying on ECMAScript's 4-digit `\uXXXX` escapes combined into UTF-16 surrogate pairs.

View File

@ -349,6 +349,29 @@ private:
} }
} }
// Reads exactly `count` hexadecimal digits starting at m_pos and returns the
// value they encode, most significant digit first. m_pos is advanced past
// every character consumed.
//
// Calls parse_error when the input ends before `count` digits were read, or
// when a character outside [0-9a-fA-F] is encountered.
Codepoint read_hex(size_t count)
{
    Codepoint res = 0;
    // The index is a size_t so that it is compared with `count` without a
    // signed/unsigned mismatch.
    for (size_t i = 0; i < count; ++i)
    {
        if (at_end())
            parse_error("unterminated hex sequence");

        const Codepoint digit = *m_pos++;
        // Initialized so the accumulation below never reads an indeterminate
        // value, whatever parse_error does.
        Codepoint digit_value = 0;
        if ('0' <= digit and digit <= '9')
            digit_value = digit - '0';
        else if ('a' <= digit and digit <= 'f')
            digit_value = 0xa + digit - 'a';
        else if ('A' <= digit and digit <= 'F')
            digit_value = 0xa + digit - 'A';
        else
            parse_error(format("invalid hex digit '{}'", digit));

        // Shift the accumulated value by one hex digit and append the new one.
        res = res * 16 + digit_value;
    }
    return res;
}
NodeIndex atom_escape() NodeIndex atom_escape()
{ {
const Codepoint cp = *m_pos++; const Codepoint cp = *m_pos++;
@ -381,29 +404,6 @@ private:
return new_node(ParsedRegex::Literal, control.value); return new_node(ParsedRegex::Literal, control.value);
} }
auto read_hex = [this](size_t count)
{
Codepoint res = 0;
for (int i = 0; i < count; ++i)
{
if (at_end())
parse_error("unterminated hex sequence");
Codepoint digit = *m_pos++;
Codepoint digit_value;
if ('0' <= digit and digit <= '9')
digit_value = digit - '0';
else if ('a' <= digit and digit <= 'f')
digit_value = 0xa + digit - 'a';
else if ('A' <= digit and digit <= 'F')
digit_value = 0xa + digit - 'A';
else
parse_error(format("invalid hex digit '{}'", digit));
res = res * 16 + digit_value;
}
return res;
};
if (cp == '0') if (cp == '0')
return new_node(ParsedRegex::Literal, '\0'); return new_node(ParsedRegex::Literal, '\0');
else if (cp == 'c') else if (cp == 'c')
@ -418,7 +418,7 @@ private:
else if (cp == 'x') else if (cp == 'x')
return new_node(ParsedRegex::Literal, read_hex(2)); return new_node(ParsedRegex::Literal, read_hex(2));
else if (cp == 'u') else if (cp == 'u')
return new_node(ParsedRegex::Literal, read_hex(4)); return new_node(ParsedRegex::Literal, read_hex(6));
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return new_node(ParsedRegex::Literal, cp); return new_node(ParsedRegex::Literal, cp);
@ -470,6 +470,20 @@ private:
if (at_end()) if (at_end())
break; break;
auto read_escaped_char = [this]() {
Codepoint cp = *m_pos++;
auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
if (it != std::end(control_escapes))
return it->value;
if (cp == 'x')
return read_hex(2);
if (cp == 'u')
return read_hex(6);
if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
parse_error(format("unknown character class escape '{}'", cp));
return cp;
};
if (cp == '\\') if (cp == '\\')
{ {
auto it = find_if(character_class_escapes, auto it = find_if(character_class_escapes,
@ -481,14 +495,7 @@ private:
continue; continue;
} }
else // its an escaped character else // its an escaped character
{ cp = read_escaped_char();
cp = *m_pos++;
auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
if (it != std::end(control_escapes))
cp = it->value;
else if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
parse_error(format("unknown character class escape '{}'", cp));
}
} }
CharacterClass::Range range = { cp, cp }; CharacterClass::Range range = { cp, cp };
@ -498,7 +505,10 @@ private:
break; break;
if (*m_pos != ']') if (*m_pos != ']')
{ {
range.max = *m_pos++; cp = *m_pos++;
if (cp == '\\')
cp = read_escaped_char();
range.max = cp;
if (range.min > range.max) if (range.min > range.max)
parse_error("invalid range specified"); parse_error("invalid range specified");
} }
@ -1522,13 +1532,32 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("bCa")); kak_assert(vm.exec("bCa"));
} }
{
TestVM<> vm{R"([\t-\r]+)"};
kak_assert(vm.exec("\t\n\v\f\r"));
}
{
TestVM<> vm{R"([^\x00-\x7F]+)"};
kak_assert(not vm.exec("ascii"));
kak_assert(vm.exec("←↑→↓"));
kak_assert(vm.exec("😄😊😉"));
}
{
TestVM<> vm{R"([^\u000000-\u00ffff]+)"};
kak_assert(not vm.exec("ascii"));
kak_assert(not vm.exec("←↑→↓"));
kak_assert(vm.exec("😄😊😉"));
}
{ {
TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"}; TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"};
kak_assert(vm.exec("д", RegexExecFlags::None)); kak_assert(vm.exec("д", RegexExecFlags::None));
} }
{ {
TestVM<> vm{R"(\0\x0A\u260e\u260F)"}; TestVM<> vm{R"(\0\x0A\u00260e\u00260F)"};
const char str[] = "\0\n☎☏"; // work around the null byte in the literal const char str[] = "\0\n☎☏"; // work around the null byte in the literal
kak_assert(vm.exec({str, str + sizeof(str)-1})); kak_assert(vm.exec({str, str + sizeof(str)-1}));
} }