Support \x and \u escapes in regex character classes

Change \u to take 6 hex digits so that it covers the full Unicode range. Fixes #3172
2019-11-06 20:48:48 +11:00 · 2019-11-06 20:48:48 +11:00 · 3e7301ede7
commit 3e7301ede7
parent 3816f1fa43
2 changed files with 66 additions and 35 deletions
--- a/doc/pages/regex.asciidoc
+++ b/doc/pages/regex.asciidoc
@ -23,7 +23,7 @@ Some literals are available as escape sequences:
 * `\0` matches the null character.
 * `\cX` matches the control-X character (X can be in `[A-Za-z]`).
 * `\xXX` matches the character whose codepoint is XX (in hexadecimal).
-* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal).
+* `\uXXXXXX` matches the character whose codepoint is XXXXXX (in hexadecimal).
 == Character classes
@ -185,3 +185,5 @@ exists for ease of use or performance reasons:
  escapes, identity escapes like `\X` with X a non-special character
  are not accepted, to avoid confusion between `\h` meaning literal
  `h` in ECMAScript, and horizontal blank in Kakoune.
 * `\uXXXXXX` uses 6 digits to cover all of Unicode, instead of relying
  on ECMAScript's 4-digit form combined with UTF-16 surrogate pairs.
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@ -349,6 +349,29 @@ private:
        }
    }
    // Reads exactly `count` hexadecimal digits starting at m_pos and returns
    // the value they encode, most significant digit first. m_pos is advanced
    // past every character consumed.
    //
    // Calls parse_error() when the input ends before `count` digits have been
    // read ("unterminated hex sequence") or when a character is not one of
    // [0-9a-fA-F] ("invalid hex digit").
    Codepoint read_hex(size_t count)
    {
        Codepoint res = 0;
        // The index has the same type as `count` so the loop condition does
        // not compare a signed value against an unsigned one.
        for (size_t i = 0; i < count; ++i)
        {
            if (at_end())
                parse_error("unterminated hex sequence");

            const Codepoint digit = *m_pos++;
            // Initialized so the accumulation below never reads an
            // indeterminate value, whatever parse_error() does on the
            // invalid-digit path.
            Codepoint digit_value = 0;
            if ('0' <= digit and digit <= '9')
                digit_value = digit - '0';
            else if ('a' <= digit and digit <= 'f')
                digit_value = 0xa + digit - 'a';
            else if ('A' <= digit and digit <= 'F')
                digit_value = 0xa + digit - 'A';
            else
                parse_error(format("invalid hex digit '{}'", digit));

            // Shift the accumulated value one hex digit to the left and
            // append the new digit.
            res = res * 16 + digit_value;
        }
        return res;
    }
    NodeIndex atom_escape()
    {
        const Codepoint cp = *m_pos++;
@ -381,29 +404,6 @@ private:
                return new_node(ParsedRegex::Literal, control.value);
        }
        auto read_hex = [this](size_t count)
        {
            Codepoint res = 0;
            for (int i = 0; i < count; ++i)
            {
                if (at_end())
                    parse_error("unterminated hex sequence");
                Codepoint digit = *m_pos++;
                Codepoint digit_value;
                if ('0' <= digit and digit <= '9')
                    digit_value = digit - '0';
                else if ('a' <= digit and digit <= 'f')
                    digit_value = 0xa + digit - 'a';
                else if ('A' <= digit and digit <= 'F')
                    digit_value = 0xa + digit - 'A';
                else
                    parse_error(format("invalid hex digit '{}'", digit));
                res = res * 16 + digit_value;
            }
            return res;
        };
        if (cp == '0')
            return new_node(ParsedRegex::Literal, '\0');
        else if (cp == 'c')
@ -418,7 +418,7 @@ private:
        else if (cp == 'x')
            return new_node(ParsedRegex::Literal, read_hex(2));
        else if (cp == 'u')
-            return new_node(ParsedRegex::Literal, read_hex(4));
+            return new_node(ParsedRegex::Literal, read_hex(6));
        if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
            return new_node(ParsedRegex::Literal, cp);
@ -470,6 +470,20 @@ private:
            if (at_end())
                break;
            auto read_escaped_char = [this]() {
                Codepoint cp = *m_pos++;
                auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
                if (it != std::end(control_escapes))
                    return it->value;
                if (cp == 'x')
                    return read_hex(2);
                if (cp == 'u')
                    return read_hex(6);
                if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
                    parse_error(format("unknown character class escape '{}'", cp));
                return cp;
            };
            if (cp == '\\')
            {
                auto it = find_if(character_class_escapes,
@ -481,14 +495,7 @@ private:
                    continue;
                }
                else // its an escaped character
-                {
+                    cp = read_escaped_char();
                    cp = *m_pos++;
                    auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
                    if (it != std::end(control_escapes))
                        cp = it->value;
                    else if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
                        parse_error(format("unknown character class escape '{}'", cp));
                }
            }
            CharacterClass::Range range = { cp, cp };
@ -498,7 +505,10 @@ private:
                    break;
                if (*m_pos != ']')
                {
-                    range.max = *m_pos++;
+                    cp = *m_pos++;
                    if (cp == '\\')
                        cp = read_escaped_char();
                    range.max = cp;
                    if (range.min > range.max)
                        parse_error("invalid range specified");
                }
@ -1522,13 +1532,32 @@ auto test_regex = UnitTest{[]{
        kak_assert(vm.exec("bCa"));
    }
    {
        TestVM<> vm{R"([\t-\r]+)"};
        kak_assert(vm.exec("\t\n\v\f\r"));
    }
    {
        TestVM<> vm{R"([^\x00-\x7F]+)"};
        kak_assert(not vm.exec("ascii"));
        kak_assert(vm.exec("←↑→↓"));
        kak_assert(vm.exec("😄😊😉"));
    }
    {
        TestVM<> vm{R"([^\u000000-\u00ffff]+)"};
        kak_assert(not vm.exec("ascii"));
        kak_assert(not vm.exec("←↑→↓"));
        kak_assert(vm.exec("😄😊😉"));
    }
    {
        TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"};
        kak_assert(vm.exec("д", RegexExecFlags::None));
    }
    {
-        TestVM<> vm{R"(\0\x0A\u260e\u260F)"};
+        TestVM<> vm{R"(\0\x0A\u00260e\u00260F)"};
        const char str[] = "\0\n☎☏"; // work around the null byte in the literal
        kak_assert(vm.exec({str, str + sizeof(str)-1}));
    }