Support \x and \u escapes in regex character classes
Change \u to use 6 digits to cover the full unicode range. Fixes #3172
This commit is contained in:
parent
3816f1fa43
commit
3e7301ede7
|
@ -23,7 +23,7 @@ Some literals are available as escape sequences:
|
|||
* `\0` matches the null character.
|
||||
* `\cX` matches the control-X character (X can be in `[A-Za-z]`).
|
||||
* `\xXX` matches the character whose codepoint is XX (in hexadecimal).
|
||||
* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal).
|
||||
* `\uXXXXXX` matches the character whose codepoint is XXXXXX (in hexadecimal).
|
||||
|
||||
== Character classes
|
||||
|
||||
|
@ -185,3 +185,5 @@ exists for ease of use or performance reasons:
|
|||
escapes, identity escapes like `\X` with X a non-special character
|
||||
are not accepted, to avoid confusion between `\h` meaning literal
|
||||
`h` in ECMAScript, and horizontal blank in Kakoune.
|
||||
* `\uXXXXXX` uses 6 digits to cover all of unicode, instead of relying
|
||||
on ECMAScript UTF-16 surrogate pairs with 4 digits.
|
||||
|
|
|
@ -349,6 +349,29 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
// Parses exactly `count` hexadecimal digits starting at the current parse
// position and returns the value they encode, most significant digit first.
// Both lowercase and uppercase digits a-f / A-F are accepted.
//
// Reports a parse error if the input ends before `count` digits were read,
// or if a character that is not a hexadecimal digit is encountered.
Codepoint read_hex(size_t count)
{
    Codepoint res = 0;
    // Use an unsigned index so the comparison with `count` does not mix
    // signed and unsigned operands.
    for (size_t i = 0; i < count; ++i)
    {
        if (at_end())
            parse_error("unterminated hex sequence");

        const Codepoint digit = *m_pos++;
        Codepoint digit_value = 0;
        if ('0' <= digit and digit <= '9')
            digit_value = digit - '0';
        else if ('a' <= digit and digit <= 'f')
            digit_value = 0xa + digit - 'a';
        else if ('A' <= digit and digit <= 'F')
            digit_value = 0xa + digit - 'A';
        else
            parse_error(format("invalid hex digit '{}'", digit));

        // Shift the accumulated value by one hex digit and append the new one.
        res = res * 16 + digit_value;
    }
    return res;
}
|
||||
|
||||
NodeIndex atom_escape()
|
||||
{
|
||||
const Codepoint cp = *m_pos++;
|
||||
|
@ -381,29 +404,6 @@ private:
|
|||
return new_node(ParsedRegex::Literal, control.value);
|
||||
}
|
||||
|
||||
auto read_hex = [this](size_t count)
|
||||
{
|
||||
Codepoint res = 0;
|
||||
for (int i = 0; i < count; ++i)
|
||||
{
|
||||
if (at_end())
|
||||
parse_error("unterminated hex sequence");
|
||||
Codepoint digit = *m_pos++;
|
||||
Codepoint digit_value;
|
||||
if ('0' <= digit and digit <= '9')
|
||||
digit_value = digit - '0';
|
||||
else if ('a' <= digit and digit <= 'f')
|
||||
digit_value = 0xa + digit - 'a';
|
||||
else if ('A' <= digit and digit <= 'F')
|
||||
digit_value = 0xa + digit - 'A';
|
||||
else
|
||||
parse_error(format("invalid hex digit '{}'", digit));
|
||||
|
||||
res = res * 16 + digit_value;
|
||||
}
|
||||
return res;
|
||||
};
|
||||
|
||||
if (cp == '0')
|
||||
return new_node(ParsedRegex::Literal, '\0');
|
||||
else if (cp == 'c')
|
||||
|
@ -418,7 +418,7 @@ private:
|
|||
else if (cp == 'x')
|
||||
return new_node(ParsedRegex::Literal, read_hex(2));
|
||||
else if (cp == 'u')
|
||||
return new_node(ParsedRegex::Literal, read_hex(4));
|
||||
return new_node(ParsedRegex::Literal, read_hex(6));
|
||||
|
||||
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
||||
return new_node(ParsedRegex::Literal, cp);
|
||||
|
@ -470,6 +470,20 @@ private:
|
|||
if (at_end())
|
||||
break;
|
||||
|
||||
auto read_escaped_char = [this]() {
|
||||
Codepoint cp = *m_pos++;
|
||||
auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
|
||||
if (it != std::end(control_escapes))
|
||||
return it->value;
|
||||
if (cp == 'x')
|
||||
return read_hex(2);
|
||||
if (cp == 'u')
|
||||
return read_hex(6);
|
||||
if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
|
||||
parse_error(format("unknown character class escape '{}'", cp));
|
||||
return cp;
|
||||
};
|
||||
|
||||
if (cp == '\\')
|
||||
{
|
||||
auto it = find_if(character_class_escapes,
|
||||
|
@ -481,14 +495,7 @@ private:
|
|||
continue;
|
||||
}
|
||||
else // it's an escaped character
|
||||
{
|
||||
cp = *m_pos++;
|
||||
auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
|
||||
if (it != std::end(control_escapes))
|
||||
cp = it->value;
|
||||
else if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
|
||||
parse_error(format("unknown character class escape '{}'", cp));
|
||||
}
|
||||
cp = read_escaped_char();
|
||||
}
|
||||
|
||||
CharacterClass::Range range = { cp, cp };
|
||||
|
@ -498,7 +505,10 @@ private:
|
|||
break;
|
||||
if (*m_pos != ']')
|
||||
{
|
||||
range.max = *m_pos++;
|
||||
cp = *m_pos++;
|
||||
if (cp == '\\')
|
||||
cp = read_escaped_char();
|
||||
range.max = cp;
|
||||
if (range.min > range.max)
|
||||
parse_error("invalid range specified");
|
||||
}
|
||||
|
@ -1522,13 +1532,32 @@ auto test_regex = UnitTest{[]{
|
|||
kak_assert(vm.exec("bCa"));
|
||||
}
|
||||
|
||||
{
|
||||
TestVM<> vm{R"([\t-\r]+)"};
|
||||
kak_assert(vm.exec("\t\n\v\f\r"));
|
||||
}
|
||||
|
||||
{
|
||||
TestVM<> vm{R"([^\x00-\x7F]+)"};
|
||||
kak_assert(not vm.exec("ascii"));
|
||||
kak_assert(vm.exec("←↑→↓"));
|
||||
kak_assert(vm.exec("😄😊😉"));
|
||||
}
|
||||
|
||||
{
|
||||
TestVM<> vm{R"([^\u000000-\u00ffff]+)"};
|
||||
kak_assert(not vm.exec("ascii"));
|
||||
kak_assert(not vm.exec("←↑→↓"));
|
||||
kak_assert(vm.exec("😄😊😉"));
|
||||
}
|
||||
|
||||
{
|
||||
TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"};
|
||||
kak_assert(vm.exec("д", RegexExecFlags::None));
|
||||
}
|
||||
|
||||
{
|
||||
TestVM<> vm{R"(\0\x0A\u260e\u260F)"};
|
||||
TestVM<> vm{R"(\0\x0A\u00260e\u00260F)"};
|
||||
const char str[] = "\0\n☎☏"; // work around the null byte in the literal
|
||||
kak_assert(vm.exec({str, str + sizeof(str)-1}));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user