diff --git a/doc/pages/highlighters.asciidoc b/doc/pages/highlighters.asciidoc index ea6500cd..18504b3c 100644 --- a/doc/pages/highlighters.asciidoc +++ b/doc/pages/highlighters.asciidoc @@ -99,6 +99,10 @@ from the remaining parameters. add-highlighter window/ regex //\h*(TODO:)[^\n]* 0:cyan 1:yellow,red -------------------------------------------------------------------- + capture_id can be either the capture number, or its name if a + named capture is used in the regex (See + <<regex#groups,`:doc regex groups`>>) + *dynregex* <expression> <capture_id>:<face> ...:: similar to regex, but expand (like a command parameter would) the given expression before building a regex from the result. diff --git a/doc/pages/regex.asciidoc b/doc/pages/regex.asciidoc index e228c25f..a43fc589 100644 --- a/doc/pages/regex.asciidoc +++ b/doc/pages/regex.asciidoc @@ -78,17 +78,21 @@ Regex atoms can be grouped using `(` and `)` or `(?:` and `)`. If `(` is used, the group will be a capturing group, which means the positions from the subject strings that matched between `(` and `)` will be recorded. -Capture groups are numbered starting at 1. They are numbered in the order of -appearance of their `(` in the regex. A special capture group 0 is -for the whole sequence that matched. +Capture groups are numbered starting at 1. They are numbered in the +order of appearance of their `(` in the regex. A special capture group +0 is for the whole sequence that matched. -`(?:` introduces a non capturing group, which will not record the +* `(?:` introduces a non capturing group, which will not record the matching positions. +* `(?<name>` introduces a named capturing group, which, in addition to +being referred by number, can be, in certain contexts, referred by the +given name. 
+ == Alternations -`|` introduces an alternation, which will either match its left-hand side, -or its right-hand side (preferring the left-hand side) +The `|` character introduces an alternation, which will either match +its left-hand side, or its right-hand side (preferring the left-hand side) For example, `foo|bar` matches either `foo` or `bar`, `foo(bar|baz|qux)` matches `foo` followed by either `bar`, `baz` or `qux`. diff --git a/src/highlighters.cc b/src/highlighters.cc index 21418466..a45b5ee7 100644 --- a/src/highlighters.cc +++ b/src/highlighters.cc @@ -307,19 +307,25 @@ public: if (params.size() < 2) throw runtime_error("wrong parameter count"); + Regex re{params[0], RegexCompileFlags::Optimize}; + FacesSpec faces; for (auto& spec : params.subrange(1)) { auto colon = find(spec, ':'); if (colon == spec.end()) throw runtime_error(format("wrong face spec: '{}' expected :", spec)); - int capture = str_to_int({spec.begin(), colon}); + const StringView capture_name{spec.begin(), colon}; + const int capture = str_to_int_ifp(capture_name).value_or_compute([&] { + return re.named_capture_index(capture_name); + }); + if (capture < 0) + throw runtime_error(format("capture name {} is neither a capture index, nor an existing capture name", + capture_name)); faces.emplace_back(capture, String{colon+1, spec.end()}); } - Regex ex{params[0], RegexCompileFlags::Optimize}; - - return std::make_unique(std::move(ex), std::move(faces)); + return std::make_unique(std::move(re), std::move(faces)); } private: @@ -492,7 +498,6 @@ std::unique_ptr create_dynamic_regex_highlighter(HighlighterParamet faces.emplace_back(capture, String{colon+1, spec.end()}); } - auto make_hl = [](auto& regex_getter, auto& face_getter) { return std::make_unique, std::decay_t>>( diff --git a/src/regex.cc b/src/regex.cc index 970721c7..389b2376 100644 --- a/src/regex.cc +++ b/src/regex.cc @@ -1,4 +1,5 @@ #include "regex.hh" +#include "ranges.hh" namespace Kakoune { @@ -8,6 +9,12 @@ Regex::Regex(StringView 
re, RegexCompileFlags flags) m_str{re.str()} {} +int Regex::named_capture_index(StringView name) const +{ + auto it = find_if(m_impl->named_captures, [&](auto& c) { return c.name == name; }); + return it != m_impl->named_captures.end() ? it->index : -1; +} + String option_to_string(const Regex& re) { return re.str(); diff --git a/src/regex.hh b/src/regex.hh index aac66921..4d7cc5f0 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -21,6 +21,7 @@ public: const String& str() const { return m_str; } size_t mark_count() const { return m_impl->save_count / 2 - 1; } + int named_capture_index(StringView name) const; static constexpr const char* option_type_name = "regex"; diff --git a/src/regex_impl.cc b/src/regex_impl.cc index a86f2824..7596ba40 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -85,7 +85,8 @@ struct ParsedRegex Vector nodes; Vector character_classes; - size_t capture_count; + Vector named_captures; + uint32_t capture_count; }; namespace @@ -166,7 +167,7 @@ private: using Iterator = utf8::iterator; using NodeIndex = ParsedRegex::NodeIndex; - NodeIndex disjunction(unsigned capture = -1) + NodeIndex disjunction(uint32_t capture = -1) { NodeIndex index = new_node(ParsedRegex::Alternation); get_node(index).value = capture; @@ -301,15 +302,25 @@ private: return new_node(ParsedRegex::AnyCharExceptNewLine); case '(': { - auto captures = [this, it = (++m_pos).base()]() mutable { - if (m_regex.end() - it >= 2 and *it++ == '?' and *it++ == ':') - { - m_pos = Iterator{it, m_regex}; - return false; - } - return true; - }; - NodeIndex content = disjunction(captures() ? 
m_parsed_regex.capture_count++ : -1); + uint32_t capture_group = -1; + const char* it = (++m_pos).base(); + if (m_regex.end() - it < 2 or *it++ != '?') + capture_group = m_parsed_regex.capture_count++; + else if (*it == ':') + m_pos = Iterator{++it, m_regex}; + else if (*it == '<') + { + const auto name_start = ++it; + while (it != m_regex.end() and is_word(*it)) + ++it; + if (it == m_regex.end() or *it != '>') + parse_error("named captures should be only ascii word characters"); + capture_group = m_parsed_regex.capture_count++; + m_parsed_regex.named_captures.push_back({{name_start, it}, capture_group}); + m_pos = Iterator{++it, m_regex}; + } + + NodeIndex content = disjunction(capture_group); if (at_end() or *m_pos++ != ')') parse_error("unclosed parenthesis"); return content; @@ -682,6 +693,7 @@ struct RegexCompiler m_program.first_backward_inst = -1; m_program.character_classes = std::move(m_parsed_regex.character_classes); + m_program.named_captures = std::move(m_parsed_regex.named_captures); m_program.save_count = m_parsed_regex.capture_count * 2; } @@ -1526,6 +1538,24 @@ auto test_regex = UnitTest{[]{ const char str[] = "\0\nā˜Žā˜"; // work around the null byte in the literal kak_assert(vm.exec({str, str + sizeof(str)-1})); } + + { + auto eq = [](const CompiledRegex::NamedCapture& lhs, + const CompiledRegex::NamedCapture& rhs) { + return lhs.name == rhs.name and + lhs.index == rhs.index; + }; + + TestVM<> vm{R"((?\d+)-(?\d+)-(?\d+))"}; + kak_assert(vm.exec("2019-01-03", RegexExecFlags::None)); + kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "2019"); + kak_assert(StringView{vm.captures()[4], vm.captures()[5]} == "01"); + kak_assert(StringView{vm.captures()[6], vm.captures()[7]} == "03"); + kak_assert(vm.named_captures.size() == 3); + kak_assert(eq(vm.named_captures[0], {"year", 1})); + kak_assert(eq(vm.named_captures[1], {"month", 2})); + kak_assert(eq(vm.named_captures[2], {"day", 3})); + } }}; } diff --git a/src/regex_impl.hh 
b/src/regex_impl.hh index 86d65ddd..86ae81a8 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -107,9 +107,16 @@ struct CompiledRegex : RefCountable, UseMemoryDomain explicit operator bool() const { return not instructions.empty(); } + struct NamedCapture + { + String name; + uint32_t index; + }; + Vector instructions; Vector character_classes; Vector lookarounds; + Vector named_captures; uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward uint32_t save_count; diff --git a/test/highlight/named-captures/cmd b/test/highlight/named-captures/cmd new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/test/highlight/named-captures/cmd @@ -0,0 +1 @@ + diff --git a/test/highlight/named-captures/in b/test/highlight/named-captures/in new file mode 100644 index 00000000..57ed1021 --- /dev/null +++ b/test/highlight/named-captures/in @@ -0,0 +1 @@ +2018-01-03 diff --git a/test/highlight/named-captures/rc b/test/highlight/named-captures/rc new file mode 100644 index 00000000..14905790 --- /dev/null +++ b/test/highlight/named-captures/rc @@ -0,0 +1 @@ +add-highlighter window/ regex (?\d+)-(?\d+)-(?\d+) year:red month:green day:yellow diff --git a/test/highlight/named-captures/ui-out b/test/highlight/named-captures/ui-out new file mode 100644 index 00000000..7350d79a --- /dev/null +++ b/test/highlight/named-captures/ui-out @@ -0,0 +1,7 @@ +{ "jsonrpc": "2.0", "method": "set_ui_options", "params": [{}] } +{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": "2" }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "018" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "green", "bg": "default", "attributes": [] }, "contents": "01" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "yellow", "bg": 
"default", "attributes": [] }, "contents": "03" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] } +{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] } +{ "jsonrpc": "2.0", "method": "info_hide", "params": [] } +{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:1 " }, { "face": { "fg": "black", "bg": "yellow", "attributes": [] }, "contents": "" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - client0@[kak-tests]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] } +{ "jsonrpc": "2.0", "method": "set_cursor", "params": ["buffer", { "line": 0, "column": 0 }] } +{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }