Add support for named captures to the regex impl and regex highlighter

ECMAScript is adding support for it, and it is a pretty isolated
change to do.

Fixes #2293
This commit is contained in:
Maxime Coste 2019-01-03 22:52:15 +11:00
parent 56ee329d79
commit 328c497be2
11 changed files with 90 additions and 22 deletions

View File

@ -99,6 +99,10 @@ from the remaining parameters.
add-highlighter window/ regex //\h*(TODO:)[^\n]* 0:cyan 1:yellow,red add-highlighter window/ regex //\h*(TODO:)[^\n]* 0:cyan 1:yellow,red
-------------------------------------------------------------------- --------------------------------------------------------------------
capture_id can be either the capture number, or its name if a
named capture is used in the regex (see
<<regex#Groups, `:doc regex Groups`>>).
*dynregex* <expression> <capture_id>:<face> ...:: *dynregex* <expression> <capture_id>:<face> ...::
similar to regex, but expand (like a command parameter would) the similar to regex, but expand (like a command parameter would) the
given expression before building a regex from the result. given expression before building a regex from the result.

View File

@ -78,17 +78,21 @@ Regex atoms can be grouped using `(` and `)` or `(?:` and `)`. If `(` is
used, the group will be a capturing group, which means the positions from used, the group will be a capturing group, which means the positions from
the subject strings that matched between `(` and `)` will be recorded. the subject strings that matched between `(` and `)` will be recorded.
Capture groups are numbered starting at 1. They are numbered in the order of Capture groups are numbered starting at 1. They are numbered in the
appearance of their `(` in the regex. A special capture group 0 is order of appearance of their `(` in the regex. A special capture group
for the whole sequence that matched. 0 is for the whole sequence that matched.
`(?:` introduces a non capturing group, which will not record the * `(?:` introduces a non capturing group, which will not record the
matching positions. matching positions.
* `(?<name>` introduces a named capturing group, which, in addition to
being referred to by number, can be, in certain contexts, referred to by
the given name.
== Alternations == Alternations
`|` introduces an alternation, which will either match its left-hand side, The `|` character introduces an alternation, which will either match
or its right-hand side (preferring the left-hand side) its left-hand side, or its right-hand side (preferring the left-hand side)
For example, `foo|bar` matches either `foo` or `bar`, `foo(bar|baz|qux)` For example, `foo|bar` matches either `foo` or `bar`, `foo(bar|baz|qux)`
matches `foo` followed by either `bar`, `baz` or `qux`. matches `foo` followed by either `bar`, `baz` or `qux`.

View File

@ -307,19 +307,25 @@ public:
if (params.size() < 2) if (params.size() < 2)
throw runtime_error("wrong parameter count"); throw runtime_error("wrong parameter count");
Regex re{params[0], RegexCompileFlags::Optimize};
FacesSpec faces; FacesSpec faces;
for (auto& spec : params.subrange(1)) for (auto& spec : params.subrange(1))
{ {
auto colon = find(spec, ':'); auto colon = find(spec, ':');
if (colon == spec.end()) if (colon == spec.end())
throw runtime_error(format("wrong face spec: '{}' expected <capture>:<facespec>", spec)); throw runtime_error(format("wrong face spec: '{}' expected <capture>:<facespec>", spec));
int capture = str_to_int({spec.begin(), colon}); const StringView capture_name{spec.begin(), colon};
const int capture = str_to_int_ifp(capture_name).value_or_compute([&] {
return re.named_capture_index(capture_name);
});
if (capture < 0)
throw runtime_error(format("capture name {} is neither a capture index, nor an existing capture name",
capture_name));
faces.emplace_back(capture, String{colon+1, spec.end()}); faces.emplace_back(capture, String{colon+1, spec.end()});
} }
Regex ex{params[0], RegexCompileFlags::Optimize}; return std::make_unique<RegexHighlighter>(std::move(re), std::move(faces));
return std::make_unique<RegexHighlighter>(std::move(ex), std::move(faces));
} }
private: private:
@ -492,7 +498,6 @@ std::unique_ptr<Highlighter> create_dynamic_regex_highlighter(HighlighterParamet
faces.emplace_back(capture, String{colon+1, spec.end()}); faces.emplace_back(capture, String{colon+1, spec.end()});
} }
auto make_hl = [](auto& regex_getter, auto& face_getter) { auto make_hl = [](auto& regex_getter, auto& face_getter) {
return std::make_unique<DynamicRegexHighlighter<std::decay_t<decltype(regex_getter)>, return std::make_unique<DynamicRegexHighlighter<std::decay_t<decltype(regex_getter)>,
std::decay_t<decltype(face_getter)>>>( std::decay_t<decltype(face_getter)>>>(

View File

@ -1,4 +1,5 @@
#include "regex.hh" #include "regex.hh"
#include "ranges.hh"
namespace Kakoune namespace Kakoune
{ {
@ -8,6 +9,12 @@ Regex::Regex(StringView re, RegexCompileFlags flags)
m_str{re.str()} m_str{re.str()}
{} {}
// Returns the capture index registered under `name` in m_impl's
// named_captures list, or -1 when no capture carries that name.
// The first matching entry wins.
int Regex::named_capture_index(StringView name) const
{
    for (auto& capture : m_impl->named_captures)
    {
        // The stored index is unsigned; convert it explicitly so the signed
        // -1 "not found" sentinel never goes through an unsigned common type.
        if (capture.name == name)
            return static_cast<int>(capture.index);
    }
    return -1;
}
String option_to_string(const Regex& re) String option_to_string(const Regex& re)
{ {
return re.str(); return re.str();

View File

@ -21,6 +21,7 @@ public:
const String& str() const { return m_str; } const String& str() const { return m_str; }
size_t mark_count() const { return m_impl->save_count / 2 - 1; } size_t mark_count() const { return m_impl->save_count / 2 - 1; }
int named_capture_index(StringView name) const;
static constexpr const char* option_type_name = "regex"; static constexpr const char* option_type_name = "regex";

View File

@ -85,7 +85,8 @@ struct ParsedRegex
Vector<Node, MemoryDomain::Regex> nodes; Vector<Node, MemoryDomain::Regex> nodes;
Vector<CharacterClass, MemoryDomain::Regex> character_classes; Vector<CharacterClass, MemoryDomain::Regex> character_classes;
size_t capture_count; Vector<CompiledRegex::NamedCapture, MemoryDomain::Regex> named_captures;
uint32_t capture_count;
}; };
namespace namespace
@ -166,7 +167,7 @@ private:
using Iterator = utf8::iterator<const char*, const char*, Codepoint, int, InvalidPolicy>; using Iterator = utf8::iterator<const char*, const char*, Codepoint, int, InvalidPolicy>;
using NodeIndex = ParsedRegex::NodeIndex; using NodeIndex = ParsedRegex::NodeIndex;
NodeIndex disjunction(unsigned capture = -1) NodeIndex disjunction(uint32_t capture = -1)
{ {
NodeIndex index = new_node(ParsedRegex::Alternation); NodeIndex index = new_node(ParsedRegex::Alternation);
get_node(index).value = capture; get_node(index).value = capture;
@ -301,15 +302,25 @@ private:
return new_node(ParsedRegex::AnyCharExceptNewLine); return new_node(ParsedRegex::AnyCharExceptNewLine);
case '(': case '(':
{ {
auto captures = [this, it = (++m_pos).base()]() mutable { uint32_t capture_group = -1;
if (m_regex.end() - it >= 2 and *it++ == '?' and *it++ == ':') const char* it = (++m_pos).base();
if (m_regex.end() - it < 2 or *it++ != '?')
capture_group = m_parsed_regex.capture_count++;
else if (*it == ':')
m_pos = Iterator{++it, m_regex};
else if (*it == '<')
{ {
m_pos = Iterator{it, m_regex}; const auto name_start = ++it;
return false; while (it != m_regex.end() and is_word(*it))
++it;
if (it == m_regex.end() or *it != '>')
parse_error("named captures should be only ascii word characters");
capture_group = m_parsed_regex.capture_count++;
m_parsed_regex.named_captures.push_back({{name_start, it}, capture_group});
m_pos = Iterator{++it, m_regex};
} }
return true;
}; NodeIndex content = disjunction(capture_group);
NodeIndex content = disjunction(captures() ? m_parsed_regex.capture_count++ : -1);
if (at_end() or *m_pos++ != ')') if (at_end() or *m_pos++ != ')')
parse_error("unclosed parenthesis"); parse_error("unclosed parenthesis");
return content; return content;
@ -682,6 +693,7 @@ struct RegexCompiler
m_program.first_backward_inst = -1; m_program.first_backward_inst = -1;
m_program.character_classes = std::move(m_parsed_regex.character_classes); m_program.character_classes = std::move(m_parsed_regex.character_classes);
m_program.named_captures = std::move(m_parsed_regex.named_captures);
m_program.save_count = m_parsed_regex.capture_count * 2; m_program.save_count = m_parsed_regex.capture_count * 2;
} }
@ -1526,6 +1538,24 @@ auto test_regex = UnitTest{[]{
const char str[] = "\0\n☎☏"; // work around the null byte in the literal const char str[] = "\0\n☎☏"; // work around the null byte in the literal
kak_assert(vm.exec({str, str + sizeof(str)-1})); kak_assert(vm.exec({str, str + sizeof(str)-1}));
} }
{
// Named capture groups: checks that named groups still capture like
// regular numbered groups, and that each name is recorded with the
// index of its group, in order of appearance.
auto eq = [](const CompiledRegex::NamedCapture& lhs,
const CompiledRegex::NamedCapture& rhs) {
return lhs.name == rhs.name and
lhs.index == rhs.index;
};
TestVM<> vm{R"((?<year>\d+)-(?<month>\d+)-(?<day>\d+))"};
kak_assert(vm.exec("2019-01-03", RegexExecFlags::None));
// captures() is read pairwise as [begin, end): slots 2/3, 4/5 and 6/7
// hold the text matched by the first, second and third group.
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "2019");
kak_assert(StringView{vm.captures()[4], vm.captures()[5]} == "01");
kak_assert(StringView{vm.captures()[6], vm.captures()[7]} == "03");
// One entry per named group, with indices starting at 1.
kak_assert(vm.named_captures.size() == 3);
kak_assert(eq(vm.named_captures[0], {"year", 1}));
kak_assert(eq(vm.named_captures[1], {"month", 2}));
kak_assert(eq(vm.named_captures[2], {"day", 3}));
}
}}; }};
} }

View File

@ -107,9 +107,16 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
explicit operator bool() const { return not instructions.empty(); } explicit operator bool() const { return not instructions.empty(); }
// Associates the name given in a `(?<name>...)` group with the index of
// that capture group. Kept as a plain aggregate: it is brace-initialised
// as {name, index}.
struct NamedCapture
{
// Name as written between `<` and `>` in the regex.
String name;
// Index of the capture group carrying this name (the first group in a
// regex gets index 1).
uint32_t index;
};
Vector<Instruction, MemoryDomain::Regex> instructions; Vector<Instruction, MemoryDomain::Regex> instructions;
Vector<CharacterClass, MemoryDomain::Regex> character_classes; Vector<CharacterClass, MemoryDomain::Regex> character_classes;
Vector<Lookaround, MemoryDomain::Regex> lookarounds; Vector<Lookaround, MemoryDomain::Regex> lookarounds;
Vector<NamedCapture, MemoryDomain::Regex> named_captures;
uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
uint32_t save_count; uint32_t save_count;

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@
2018-01-03

View File

@ -0,0 +1 @@
add-highlighter window/ regex (?<year>\d+)-(?<month>\d+)-(?<day>\d+) year:red month:green day:yellow

View File

@ -0,0 +1,7 @@
{ "jsonrpc": "2.0", "method": "set_ui_options", "params": [{}] }
{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": "2" }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "018" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "green", "bg": "default", "attributes": [] }, "contents": "01" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "yellow", "bg": "default", "attributes": [] }, "contents": "03" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:1 " }, { "face": { "fg": "black", "bg": "yellow", "attributes": [] }, "contents": "" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - client0@[kak-tests]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "set_cursor", "params": ["buffer", { "line": 0, "column": 0 }] }
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }