Support \x and \u escapes in regex character classes
Change \u to use 6 digits to cover the full unicode range. Fixes #3172
This commit is contained in:
parent
3816f1fa43
commit
3e7301ede7
|
@ -23,7 +23,7 @@ Some literals are available as escape sequences:
|
||||||
* `\0` matches the null character.
|
* `\0` matches the null character.
|
||||||
* `\cX` matches the control-X character (X can be in `[A-Za-z]`).
|
* `\cX` matches the control-X character (X can be in `[A-Za-z]`).
|
||||||
* `\xXX` matches the character whose codepoint is XX (in hexadecimal).
|
* `\xXX` matches the character whose codepoint is XX (in hexadecimal).
|
||||||
* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal).
|
* `\uXXXXXX` matches the character whose codepoint is XXXXXX (in hexadecimal).
|
||||||
|
|
||||||
== Character classes
|
== Character classes
|
||||||
|
|
||||||
|
@ -185,3 +185,5 @@ exists for ease of use or performance reasons:
|
||||||
escapes, identity escapes like `\X` with X a non-special character
|
escapes, identity escapes like `\X` with X a non-special character
|
||||||
are not accepted, to avoid confusions between `\h` meaning literal
|
are not accepted, to avoid confusions between `\h` meaning literal
|
||||||
`h` in ECMAScript, and horizontal blank in Kakoune.
|
`h` in ECMAScript, and horizontal blank in Kakoune.
|
||||||
|
* `\uXXXXXX` uses 6 digits to cover all of unicode, instead of relying
|
||||||
|
on ECMAScript UTF-16 surrogate pairs with 4 digits.
|
||||||
|
|
|
@ -349,6 +349,29 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse exactly `count` hexadecimal digits starting at m_pos and return the
// value they encode, most significant digit first. Both lowercase (a-f) and
// uppercase (A-F) digits are accepted. m_pos is advanced past every digit
// that gets consumed.
//
// Reports a parse error if the pattern ends before `count` digits were read
// ("unterminated hex sequence") or if a non-hexadecimal character is found
// ("invalid hex digit").
//
// NOTE(review): the result is not checked against the Unicode code space, so
// a 6 digit sequence can yield a value above 0x10FFFF - confirm whether
// callers want that rejected.
Codepoint read_hex(size_t count)
{
    Codepoint res = 0;
    // The index has the same type as `count`, avoiding a signed/unsigned
    // comparison in the loop condition.
    for (size_t i = 0; i < count; ++i)
    {
        if (at_end())
            parse_error("unterminated hex sequence");

        Codepoint digit = *m_pos++;
        // Initialized so that no indeterminate value is ever read below;
        // parse_error is presumably non-returning - TODO confirm.
        Codepoint digit_value = 0;
        if ('0' <= digit and digit <= '9')
            digit_value = digit - '0';
        else if ('a' <= digit and digit <= 'f')
            digit_value = 0xa + digit - 'a';
        else if ('A' <= digit and digit <= 'F')
            digit_value = 0xa + digit - 'A';
        else
            parse_error(format("invalid hex digit '{}'", digit));

        // Shift the accumulated value by one hex digit and append the new one.
        res = res * 16 + digit_value;
    }
    return res;
}
|
||||||
|
|
||||||
NodeIndex atom_escape()
|
NodeIndex atom_escape()
|
||||||
{
|
{
|
||||||
const Codepoint cp = *m_pos++;
|
const Codepoint cp = *m_pos++;
|
||||||
|
@ -381,29 +404,6 @@ private:
|
||||||
return new_node(ParsedRegex::Literal, control.value);
|
return new_node(ParsedRegex::Literal, control.value);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto read_hex = [this](size_t count)
|
|
||||||
{
|
|
||||||
Codepoint res = 0;
|
|
||||||
for (int i = 0; i < count; ++i)
|
|
||||||
{
|
|
||||||
if (at_end())
|
|
||||||
parse_error("unterminated hex sequence");
|
|
||||||
Codepoint digit = *m_pos++;
|
|
||||||
Codepoint digit_value;
|
|
||||||
if ('0' <= digit and digit <= '9')
|
|
||||||
digit_value = digit - '0';
|
|
||||||
else if ('a' <= digit and digit <= 'f')
|
|
||||||
digit_value = 0xa + digit - 'a';
|
|
||||||
else if ('A' <= digit and digit <= 'F')
|
|
||||||
digit_value = 0xa + digit - 'A';
|
|
||||||
else
|
|
||||||
parse_error(format("invalid hex digit '{}'", digit));
|
|
||||||
|
|
||||||
res = res * 16 + digit_value;
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
};
|
|
||||||
|
|
||||||
if (cp == '0')
|
if (cp == '0')
|
||||||
return new_node(ParsedRegex::Literal, '\0');
|
return new_node(ParsedRegex::Literal, '\0');
|
||||||
else if (cp == 'c')
|
else if (cp == 'c')
|
||||||
|
@ -418,7 +418,7 @@ private:
|
||||||
else if (cp == 'x')
|
else if (cp == 'x')
|
||||||
return new_node(ParsedRegex::Literal, read_hex(2));
|
return new_node(ParsedRegex::Literal, read_hex(2));
|
||||||
else if (cp == 'u')
|
else if (cp == 'u')
|
||||||
return new_node(ParsedRegex::Literal, read_hex(4));
|
return new_node(ParsedRegex::Literal, read_hex(6));
|
||||||
|
|
||||||
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
||||||
return new_node(ParsedRegex::Literal, cp);
|
return new_node(ParsedRegex::Literal, cp);
|
||||||
|
@ -470,6 +470,20 @@ private:
|
||||||
if (at_end())
|
if (at_end())
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
auto read_escaped_char = [this]() {
|
||||||
|
Codepoint cp = *m_pos++;
|
||||||
|
auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
|
||||||
|
if (it != std::end(control_escapes))
|
||||||
|
return it->value;
|
||||||
|
if (cp == 'x')
|
||||||
|
return read_hex(2);
|
||||||
|
if (cp == 'u')
|
||||||
|
return read_hex(6);
|
||||||
|
if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
|
||||||
|
parse_error(format("unknown character class escape '{}'", cp));
|
||||||
|
return cp;
|
||||||
|
};
|
||||||
|
|
||||||
if (cp == '\\')
|
if (cp == '\\')
|
||||||
{
|
{
|
||||||
auto it = find_if(character_class_escapes,
|
auto it = find_if(character_class_escapes,
|
||||||
|
@ -481,14 +495,7 @@ private:
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else // its an escaped character
|
else // its an escaped character
|
||||||
{
|
cp = read_escaped_char();
|
||||||
cp = *m_pos++;
|
|
||||||
auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
|
|
||||||
if (it != std::end(control_escapes))
|
|
||||||
cp = it->value;
|
|
||||||
else if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
|
|
||||||
parse_error(format("unknown character class escape '{}'", cp));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CharacterClass::Range range = { cp, cp };
|
CharacterClass::Range range = { cp, cp };
|
||||||
|
@ -498,7 +505,10 @@ private:
|
||||||
break;
|
break;
|
||||||
if (*m_pos != ']')
|
if (*m_pos != ']')
|
||||||
{
|
{
|
||||||
range.max = *m_pos++;
|
cp = *m_pos++;
|
||||||
|
if (cp == '\\')
|
||||||
|
cp = read_escaped_char();
|
||||||
|
range.max = cp;
|
||||||
if (range.min > range.max)
|
if (range.min > range.max)
|
||||||
parse_error("invalid range specified");
|
parse_error("invalid range specified");
|
||||||
}
|
}
|
||||||
|
@ -1522,13 +1532,32 @@ auto test_regex = UnitTest{[]{
|
||||||
kak_assert(vm.exec("bCa"));
|
kak_assert(vm.exec("bCa"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<> vm{R"([\t-\r]+)"};
|
||||||
|
kak_assert(vm.exec("\t\n\v\f\r"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<> vm{R"([^\x00-\x7F]+)"};
|
||||||
|
kak_assert(not vm.exec("ascii"));
|
||||||
|
kak_assert(vm.exec("←↑→↓"));
|
||||||
|
kak_assert(vm.exec("😄😊😉"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<> vm{R"([^\u000000-\u00ffff]+)"};
|
||||||
|
kak_assert(not vm.exec("ascii"));
|
||||||
|
kak_assert(not vm.exec("←↑→↓"));
|
||||||
|
kak_assert(vm.exec("😄😊😉"));
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"};
|
TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"};
|
||||||
kak_assert(vm.exec("д", RegexExecFlags::None));
|
kak_assert(vm.exec("д", RegexExecFlags::None));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
TestVM<> vm{R"(\0\x0A\u260e\u260F)"};
|
TestVM<> vm{R"(\0\x0A\u00260e\u00260F)"};
|
||||||
const char str[] = "\0\n☎☏"; // work around the null byte in the literal
|
const char str[] = "\0\n☎☏"; // work around the null byte in the literal
|
||||||
kak_assert(vm.exec({str, str + sizeof(str)-1}));
|
kak_assert(vm.exec({str, str + sizeof(str)-1}));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user