From 3f627058b0df498630797d7cfbfa3825cb5ad5d8 Mon Sep 17 00:00:00 2001
From: Maxime Coste <mawww@kakoune.org>
Date: Fri, 20 Oct 2017 12:08:24 +0800
Subject: [PATCH] Regex: add support for \0, \cX, \xXX and \uXXXX escapes

---
 doc/manpages/regex.asciidoc | 10 +++++---
 src/regex_impl.cc           | 51 ++++++++++++++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/doc/manpages/regex.asciidoc b/doc/manpages/regex.asciidoc
index 9ff56486..b45a8e1f 100644
--- a/doc/manpages/regex.asciidoc
+++ b/doc/manpages/regex.asciidoc
@@ -17,16 +17,20 @@ Literals
 --------
 
 Every character except the syntax characters `\^$.*+?[]{}|().` match
-themselves, syntax characters can be escaped with a backspace so `\$` will
-match a literal `$` and `\\` will match a literal `\`.
+themselves. Syntax characters can be escaped with a backspace so `\$`
+will match a literal `$` and `\\` will match a literal `\`.
 
-Some additional literals are available as escape sequences:
+Some literals are available as escape sequences:
 
 * `\f` matches the form feed character.
 * `\n` matches the line feed character.
 * `\r` matches the carriage return character.
 * `\t` matches the tabulation character.
 * `\v` matches the vertical tabulation character.
+* `\0` matches the null character.
+* `\cX` matches the control-X character (X can be in `[A-Za-z]`).
+* `\xXX` matches the character whose codepoint is XX (in hexadecimal).
+* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal).
 
 Character classes
 -----------------
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index e5828a8c..c32bbbf1 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -308,7 +308,44 @@ private:
                 return new_node(ParsedRegex::Literal, control.value);
         }
 
-        // TOOD: \c..., \0..., '\0x...', \u...
+        auto read_hex = [this](size_t count)
+        {
+            Codepoint res = 0;
+            for (int i = 0; i < count; ++i)
+            {
+                if (at_end())
+                    parse_error("unterminated hex sequence");
+                Codepoint digit = *m_pos++;
+                Codepoint digit_value;
+                if ('0' <= digit and digit <= '9')
+                    digit_value = digit - '0';
+                else if ('a' <= digit and digit <= 'f')
+                    digit_value = 0xa + digit - 'a';
+                else if ('A' <= digit and digit <= 'F')
+                    digit_value = 0xa + digit - 'A';
+                else
+                    parse_error(format("invalid hex digit '{}'", digit));
+
+                res = res * 16 + digit_value;
+            }
+            return res;
+        };
+
+        if (cp == '0')
+            return new_node(ParsedRegex::Literal, '\0');
+        else if (cp == 'c')
+        {
+            if (at_end())
+                parse_error("unterminated control escape");
+            Codepoint ctrl = *m_pos++;
+            if (('a' <= ctrl and ctrl <= 'z') or ('A' <= ctrl and ctrl <= 'Z'))
+                return new_node(ParsedRegex::Literal, ctrl % 32);
+            parse_error(format("Invalid control escape character '{}'", ctrl));
+        }
+        else if (cp == 'x')
+            return new_node(ParsedRegex::Literal, read_hex(2));
+        else if (cp == 'u')
+            return new_node(ParsedRegex::Literal, read_hex(4));
 
         if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
             return new_node(ParsedRegex::Literal, cp);
@@ -467,7 +504,7 @@ private:
             return {ParsedRegex::Quantifier::One};
 
         constexpr int max_repeat = 1000;
-        auto read_int = [max_repeat, this](auto& pos, auto begin, auto end) {
+        auto read_bound = [max_repeat, this](auto& pos, auto begin, auto end) {
             int res = 0;
             for (; pos != end; ++pos)
             {
@@ -496,12 +533,12 @@ private:
             case '{':
             {
                 auto it = m_pos+1;
-                const int min = read_int(it, it, m_regex.end());
+                const int min = read_bound(it, it, m_regex.end());
                 int max = min;
                 if (*it == ',')
                 {
                     ++it;
-                    max = read_int(it, it, m_regex.end());
+                    max = read_bound(it, it, m_regex.end());
                 }
                 if (*it++ != '}')
                    parse_error("expected closing bracket");
@@ -1280,6 +1317,12 @@ auto test_regex = UnitTest{[]{
         TestVM<> vm{R"([d-ea-dcf-k]+)"};
         kak_assert(vm.exec("abcde"));
     }
+
+    {
+        TestVM<> vm{R"(\0\x0A\u260e\u260F)"};
+        const char str[] = "\0\n☎☏"; // work around the null byte in the literal
+        kak_assert(vm.exec({str, str + sizeof(str)-1}));
+    }
 }};
 
 }