Regex: add support for \0, \cX, \xXX and \uXXXX escapes
This commit is contained in:
parent
c423b47109
commit
3f627058b0
|
@ -17,16 +17,20 @@ Literals
|
||||||
--------
|
--------
|
||||||
|
|
||||||
Every character except the syntax characters `\^$.*+?[]{}|().` match
|
Every character except the syntax characters `\^$.*+?[]{}|().` match
|
||||||
themselves, syntax characters can be escaped with a backslash so `\$` will
|
themselves. Syntax characters can be escaped with a backslash so `\$`
|
||||||
match a literal `$` and `\\` will match a literal `\`.
|
will match a literal `$` and `\\` will match a literal `\`.
|
||||||
|
|
||||||
Some additional literals are available as escape sequences:
|
Some literals are available as escape sequences:
|
||||||
|
|
||||||
* `\f` matches the form feed character.
|
* `\f` matches the form feed character.
|
||||||
* `\n` matches the line feed character.
|
* `\n` matches the line feed character.
|
||||||
* `\r` matches the carriage return character.
|
* `\r` matches the carriage return character.
|
||||||
* `\t` matches the tabulation character.
|
* `\t` matches the tabulation character.
|
||||||
* `\v` matches the vertical tabulation character.
|
* `\v` matches the vertical tabulation character.
|
||||||
|
* `\0` matches the null character.
|
||||||
|
* `\cX` matches the control-X character (X can be in `[A-Za-z]`).
|
||||||
|
* `\xXX` matches the character whose codepoint is XX (in hexadecimal).
|
||||||
|
* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal).
|
||||||
|
|
||||||
Character classes
|
Character classes
|
||||||
-----------------
|
-----------------
|
||||||
|
|
|
@ -308,7 +308,44 @@ private:
|
||||||
return new_node(ParsedRegex::Literal, control.value);
|
return new_node(ParsedRegex::Literal, control.value);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: \c..., \0..., '\0x...', \u...
|
auto read_hex = [this](size_t count)
|
||||||
|
{
|
||||||
|
Codepoint res = 0;
|
||||||
|
for (int i = 0; i < count; ++i)
|
||||||
|
{
|
||||||
|
if (at_end())
|
||||||
|
parse_error("unterminated hex sequence");
|
||||||
|
Codepoint digit = *m_pos++;
|
||||||
|
Codepoint digit_value;
|
||||||
|
if ('0' <= digit and digit <= '9')
|
||||||
|
digit_value = digit - '0';
|
||||||
|
else if ('a' <= digit and digit <= 'f')
|
||||||
|
digit_value = 0xa + digit - 'a';
|
||||||
|
else if ('A' <= digit and digit <= 'F')
|
||||||
|
digit_value = 0xa + digit - 'A';
|
||||||
|
else
|
||||||
|
parse_error(format("invalid hex digit '{}'", digit));
|
||||||
|
|
||||||
|
res = res * 16 + digit_value;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (cp == '0')
|
||||||
|
return new_node(ParsedRegex::Literal, '\0');
|
||||||
|
else if (cp == 'c')
|
||||||
|
{
|
||||||
|
if (at_end())
|
||||||
|
parse_error("unterminated control escape");
|
||||||
|
Codepoint ctrl = *m_pos++;
|
||||||
|
if (('a' <= ctrl and ctrl <= 'z') or ('A' <= ctrl and ctrl <= 'Z'))
|
||||||
|
return new_node(ParsedRegex::Literal, ctrl % 32);
|
||||||
|
parse_error(format("Invalid control escape character '{}'", ctrl));
|
||||||
|
}
|
||||||
|
else if (cp == 'x')
|
||||||
|
return new_node(ParsedRegex::Literal, read_hex(2));
|
||||||
|
else if (cp == 'u')
|
||||||
|
return new_node(ParsedRegex::Literal, read_hex(4));
|
||||||
|
|
||||||
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
||||||
return new_node(ParsedRegex::Literal, cp);
|
return new_node(ParsedRegex::Literal, cp);
|
||||||
|
@ -467,7 +504,7 @@ private:
|
||||||
return {ParsedRegex::Quantifier::One};
|
return {ParsedRegex::Quantifier::One};
|
||||||
|
|
||||||
constexpr int max_repeat = 1000;
|
constexpr int max_repeat = 1000;
|
||||||
auto read_int = [max_repeat, this](auto& pos, auto begin, auto end) {
|
auto read_bound = [max_repeat, this](auto& pos, auto begin, auto end) {
|
||||||
int res = 0;
|
int res = 0;
|
||||||
for (; pos != end; ++pos)
|
for (; pos != end; ++pos)
|
||||||
{
|
{
|
||||||
|
@ -496,12 +533,12 @@ private:
|
||||||
case '{':
|
case '{':
|
||||||
{
|
{
|
||||||
auto it = m_pos+1;
|
auto it = m_pos+1;
|
||||||
const int min = read_int(it, it, m_regex.end());
|
const int min = read_bound(it, it, m_regex.end());
|
||||||
int max = min;
|
int max = min;
|
||||||
if (*it == ',')
|
if (*it == ',')
|
||||||
{
|
{
|
||||||
++it;
|
++it;
|
||||||
max = read_int(it, it, m_regex.end());
|
max = read_bound(it, it, m_regex.end());
|
||||||
}
|
}
|
||||||
if (*it++ != '}')
|
if (*it++ != '}')
|
||||||
parse_error("expected closing bracket");
|
parse_error("expected closing bracket");
|
||||||
|
@ -1280,6 +1317,12 @@ auto test_regex = UnitTest{[]{
|
||||||
TestVM<> vm{R"([d-ea-dcf-k]+)"};
|
TestVM<> vm{R"([d-ea-dcf-k]+)"};
|
||||||
kak_assert(vm.exec("abcde"));
|
kak_assert(vm.exec("abcde"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<> vm{R"(\0\x0A\u260e\u260F)"};
|
||||||
|
const char str[] = "\0\n☎☏"; // work around the null byte in the literal
|
||||||
|
kak_assert(vm.exec({str, str + sizeof(str)-1}));
|
||||||
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user