Add support for named captures to the regex impl and regex highlighter

ECMAScript is adding support for it, and it is a pretty isolated
change to do.

Fixes #2293
This commit is contained in:
Maxime Coste 2019-01-03 22:52:15 +11:00
parent 56ee329d79
commit 328c497be2
11 changed files with 90 additions and 22 deletions

View File

@ -99,6 +99,10 @@ from the remaining parameters.
add-highlighter window/ regex //\h*(TODO:)[^\n]* 0:cyan 1:yellow,red
--------------------------------------------------------------------
capture_id can be either the capture number, or its name if a
named capture is used in the regex (See
<<regex#Groups, `:doc regex Groups`>>)
*dynregex* <expression> <capture_id>:<face> ...::
similar to regex, but expand (like a command parameter would) the
given expression before building a regex from the result.

View File

@ -78,17 +78,21 @@ Regex atoms can be grouped using `(` and `)` or `(?:` and `)`. If `(` is
used, the group will be a capturing group, which means the positions from
the subject strings that matched between `(` and `)` will be recorded.
Capture groups are numbered starting at 1. They are numbered in the order of
appearance of their `(` in the regex. A special capture group 0 is
for the whole sequence that matched.
Capture groups are numbered starting at 1. They are numbered in the
order of appearance of their `(` in the regex. A special capture group
0 is for the whole sequence that matched.
`(?:` introduces a non capturing group, which will not record the
* `(?:` introduces a non capturing group, which will not record the
matching positions.
* `(?<name>` introduces a named capturing group, which, in addition to
being referred by number, can be, in certain contexts, referred by the
given name.
== Alternations
`|` introduces an alternation, which will either match its left-hand side,
or its right-hand side (preferring the left-hand side)
The `|` character introduces an alternation, which will either match
its left-hand side, or its right-hand side (preferring the left-hand side)
For example, `foo|bar` matches either `foo` or `bar`, `foo(bar|baz|qux)`
matches `foo` followed by either `bar`, `baz` or `qux`.

View File

@ -307,19 +307,25 @@ public:
if (params.size() < 2)
throw runtime_error("wrong parameter count");
Regex re{params[0], RegexCompileFlags::Optimize};
FacesSpec faces;
for (auto& spec : params.subrange(1))
{
auto colon = find(spec, ':');
if (colon == spec.end())
throw runtime_error(format("wrong face spec: '{}' expected <capture>:<facespec>", spec));
int capture = str_to_int({spec.begin(), colon});
const StringView capture_name{spec.begin(), colon};
const int capture = str_to_int_ifp(capture_name).value_or_compute([&] {
return re.named_capture_index(capture_name);
});
if (capture < 0)
throw runtime_error(format("capture name {} is neither a capture index, nor an existing capture name",
capture_name));
faces.emplace_back(capture, String{colon+1, spec.end()});
}
Regex ex{params[0], RegexCompileFlags::Optimize};
return std::make_unique<RegexHighlighter>(std::move(ex), std::move(faces));
return std::make_unique<RegexHighlighter>(std::move(re), std::move(faces));
}
private:
@ -492,7 +498,6 @@ std::unique_ptr<Highlighter> create_dynamic_regex_highlighter(HighlighterParamet
faces.emplace_back(capture, String{colon+1, spec.end()});
}
auto make_hl = [](auto& regex_getter, auto& face_getter) {
return std::make_unique<DynamicRegexHighlighter<std::decay_t<decltype(regex_getter)>,
std::decay_t<decltype(face_getter)>>>(

View File

@ -1,4 +1,5 @@
#include "regex.hh"
#include "ranges.hh"
namespace Kakoune
{
@ -8,6 +9,12 @@ Regex::Regex(StringView re, RegexCompileFlags flags)
m_str{re.str()}
{}
int Regex::named_capture_index(StringView name) const
{
auto it = find_if(m_impl->named_captures, [&](auto& c) { return c.name == name; });
return it != m_impl->named_captures.end() ? it->index : -1;
}
String option_to_string(const Regex& re)
{
return re.str();

View File

@ -21,6 +21,7 @@ public:
const String& str() const { return m_str; }
size_t mark_count() const { return m_impl->save_count / 2 - 1; }
int named_capture_index(StringView name) const;
static constexpr const char* option_type_name = "regex";

View File

@ -85,7 +85,8 @@ struct ParsedRegex
Vector<Node, MemoryDomain::Regex> nodes;
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
size_t capture_count;
Vector<CompiledRegex::NamedCapture, MemoryDomain::Regex> named_captures;
uint32_t capture_count;
};
namespace
@ -166,7 +167,7 @@ private:
using Iterator = utf8::iterator<const char*, const char*, Codepoint, int, InvalidPolicy>;
using NodeIndex = ParsedRegex::NodeIndex;
NodeIndex disjunction(unsigned capture = -1)
NodeIndex disjunction(uint32_t capture = -1)
{
NodeIndex index = new_node(ParsedRegex::Alternation);
get_node(index).value = capture;
@ -301,15 +302,25 @@ private:
return new_node(ParsedRegex::AnyCharExceptNewLine);
case '(':
{
auto captures = [this, it = (++m_pos).base()]() mutable {
if (m_regex.end() - it >= 2 and *it++ == '?' and *it++ == ':')
uint32_t capture_group = -1;
const char* it = (++m_pos).base();
if (m_regex.end() - it < 2 or *it++ != '?')
capture_group = m_parsed_regex.capture_count++;
else if (*it == ':')
m_pos = Iterator{++it, m_regex};
else if (*it == '<')
{
m_pos = Iterator{it, m_regex};
return false;
const auto name_start = ++it;
while (it != m_regex.end() and is_word(*it))
++it;
if (it == m_regex.end() or *it != '>')
parse_error("named captures should be only ascii word characters");
capture_group = m_parsed_regex.capture_count++;
m_parsed_regex.named_captures.push_back({{name_start, it}, capture_group});
m_pos = Iterator{++it, m_regex};
}
return true;
};
NodeIndex content = disjunction(captures() ? m_parsed_regex.capture_count++ : -1);
NodeIndex content = disjunction(capture_group);
if (at_end() or *m_pos++ != ')')
parse_error("unclosed parenthesis");
return content;
@ -682,6 +693,7 @@ struct RegexCompiler
m_program.first_backward_inst = -1;
m_program.character_classes = std::move(m_parsed_regex.character_classes);
m_program.named_captures = std::move(m_parsed_regex.named_captures);
m_program.save_count = m_parsed_regex.capture_count * 2;
}
@ -1526,6 +1538,24 @@ auto test_regex = UnitTest{[]{
const char str[] = "\0\n☎☏"; // work around the null byte in the literal
kak_assert(vm.exec({str, str + sizeof(str)-1}));
}
{
auto eq = [](const CompiledRegex::NamedCapture& lhs,
const CompiledRegex::NamedCapture& rhs) {
return lhs.name == rhs.name and
lhs.index == rhs.index;
};
TestVM<> vm{R"((?<year>\d+)-(?<month>\d+)-(?<day>\d+))"};
kak_assert(vm.exec("2019-01-03", RegexExecFlags::None));
kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "2019");
kak_assert(StringView{vm.captures()[4], vm.captures()[5]} == "01");
kak_assert(StringView{vm.captures()[6], vm.captures()[7]} == "03");
kak_assert(vm.named_captures.size() == 3);
kak_assert(eq(vm.named_captures[0], {"year", 1}));
kak_assert(eq(vm.named_captures[1], {"month", 2}));
kak_assert(eq(vm.named_captures[2], {"day", 3}));
}
}};
}

View File

@ -107,9 +107,16 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
explicit operator bool() const { return not instructions.empty(); }
struct NamedCapture
{
String name;
uint32_t index;
};
Vector<Instruction, MemoryDomain::Regex> instructions;
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
Vector<Lookaround, MemoryDomain::Regex> lookarounds;
Vector<NamedCapture, MemoryDomain::Regex> named_captures;
uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
uint32_t save_count;

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@
2018-01-03

View File

@ -0,0 +1 @@
add-highlighter window/ regex (?<year>\d+)-(?<month>\d+)-(?<day>\d+) year:red month:green day:yellow

View File

@ -0,0 +1,7 @@
{ "jsonrpc": "2.0", "method": "set_ui_options", "params": [{}] }
{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": "2" }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "018" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "green", "bg": "default", "attributes": [] }, "contents": "01" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "yellow", "bg": "default", "attributes": [] }, "contents": "03" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:1 " }, { "face": { "fg": "black", "bg": "yellow", "attributes": [] }, "contents": "" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - client0@[kak-tests]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "set_cursor", "params": ["buffer", { "line": 0, "column": 0 }] }
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }