Regex: Refactor regex compilation to a regular RegexCompiler class

This commit is contained in:
Maxime Coste 2017-09-26 23:04:47 +09:00
parent d5717edc9d
commit f7468b576e

View File

@ -13,18 +13,15 @@
namespace Kakoune namespace Kakoune
{ {
struct CompiledRegex struct ParsedRegex
{ {
enum Op : char enum Op
{ {
Match,
Literal, Literal,
AnyChar, AnyChar,
Matcher, Matcher,
Jump, Sequence,
Split_PrioritizeParent, Alternation,
Split_PrioritizeChild,
Save,
LineStart, LineStart,
LineEnd, LineEnd,
WordBoundary, WordBoundary,
@ -33,84 +30,49 @@ struct CompiledRegex
SubjectEnd, SubjectEnd,
}; };
using Offset = unsigned; struct Quantifier
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count;
};
namespace RegexCompiler
{
struct Quantifier
{
enum Type
{ {
One, enum Type
Optional, {
RepeatZeroOrMore, One,
RepeatOneOrMore, Optional,
RepeatMinMax, RepeatZeroOrMore,
RepeatOneOrMore,
RepeatMinMax,
};
Type type = One;
int min = -1, max = -1;
bool allows_none() const
{
return type == Quantifier::Optional or
type == Quantifier::RepeatZeroOrMore or
(type == Quantifier::RepeatMinMax and min <= 0);
}
bool allows_infinite_repeat() const
{
return type == Quantifier::RepeatZeroOrMore or
type == Quantifier::RepeatOneOrMore or
(type == Quantifier::RepeatMinMax and max == -1);
};
}; };
Type type = One;
int min = -1, max = -1;
bool allows_none() const struct AstNode
{ {
return type == Quantifier::Optional or Op op;
type == Quantifier::RepeatZeroOrMore or Codepoint value;
(type == Quantifier::RepeatMinMax and min <= 0); Quantifier quantifier;
} Vector<std::unique_ptr<AstNode>> children;
bool allows_infinite_repeat() const
{
return type == Quantifier::RepeatZeroOrMore or
type == Quantifier::RepeatOneOrMore or
(type == Quantifier::RepeatMinMax and max == -1);
}; };
};
enum class Op using AstNodePtr = std::unique_ptr<AstNode>;
{
Literal,
AnyChar,
Matcher,
Sequence,
Alternation,
LineStart,
LineEnd,
WordBoundary,
NotWordBoundary,
SubjectBegin,
SubjectEnd,
};
struct AstNode
{
Op op;
Codepoint value;
Quantifier quantifier;
Vector<std::unique_ptr<AstNode>> children;
};
using AstNodePtr = std::unique_ptr<AstNode>;
struct CharRange { Codepoint min, max; };
struct ParsedRegex
{
AstNodePtr ast; AstNodePtr ast;
size_t capture_count; size_t capture_count;
Vector<std::function<bool (Codepoint)>> matchers; Vector<std::function<bool (Codepoint)>> matchers;
}; };
AstNodePtr make_ast_node(Op op, Codepoint value = -1,
Quantifier quantifier = {Quantifier::One})
{
return AstNodePtr{new AstNode{op, value, quantifier, {}}};
}
// Recursive descent parser based on naming used in the ECMAScript // Recursive descent parser based on naming used in the ECMAScript
// standard, although the syntax is not fully compatible. // standard, although the syntax is not fully compatible.
struct RegexParser struct RegexParser
@ -124,6 +86,8 @@ struct RegexParser
ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); } ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
static ParsedRegex parse(StringView re) { return RegexParser{re}.get_parsed_regex(); }
private: private:
struct InvalidPolicy struct InvalidPolicy
{ {
@ -131,6 +95,7 @@ private:
}; };
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>; using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
using AstNodePtr = ParsedRegex::AstNodePtr;
AstNodePtr disjunction(unsigned capture = -1) AstNodePtr disjunction(unsigned capture = -1)
{ {
@ -142,7 +107,7 @@ private:
} }
++m_pos; ++m_pos;
AstNodePtr res = make_ast_node(Op::Alternation); AstNodePtr res = new_node(ParsedRegex::Alternation);
res->children.push_back(std::move(node)); res->children.push_back(std::move(node));
res->children.push_back(disjunction()); res->children.push_back(disjunction());
res->value = capture; res->value = capture;
@ -151,7 +116,7 @@ private:
AstNodePtr alternative() AstNodePtr alternative()
{ {
AstNodePtr res = make_ast_node(Op::Sequence); AstNodePtr res = new_node(ParsedRegex::Sequence);
while (auto node = term()) while (auto node = term())
res->children.push_back(std::move(node)); res->children.push_back(std::move(node));
if (res->children.empty()) if (res->children.empty())
@ -178,17 +143,17 @@ private:
switch (*m_pos) switch (*m_pos)
{ {
case '^': ++m_pos; return make_ast_node(Op::LineStart); case '^': ++m_pos; return new_node(ParsedRegex::LineStart);
case '$': ++m_pos; return make_ast_node(Op::LineEnd); case '$': ++m_pos; return new_node(ParsedRegex::LineEnd);
case '\\': case '\\':
if (m_pos+1 == m_regex.end()) if (m_pos+1 == m_regex.end())
return nullptr; return nullptr;
switch (*(m_pos+1)) switch (*(m_pos+1))
{ {
case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary); case 'b': m_pos += 2; return new_node(ParsedRegex::WordBoundary);
case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary); case 'B': m_pos += 2; return new_node(ParsedRegex::NotWordBoundary);
case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin); case '`': m_pos += 2; return new_node(ParsedRegex::SubjectBegin);
case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd); case '\'': m_pos += 2; return new_node(ParsedRegex::SubjectEnd);
} }
break; break;
/* TODO: look ahead, look behind */ /* TODO: look ahead, look behind */
@ -204,7 +169,7 @@ private:
const Codepoint cp = *m_pos; const Codepoint cp = *m_pos;
switch (cp) switch (cp)
{ {
case '.': ++m_pos; return make_ast_node(Op::AnyChar); case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
case '(': case '(':
{ {
++m_pos; ++m_pos;
@ -225,7 +190,7 @@ private:
if (contains("^$.*+?()[]{}|", cp)) if (contains("^$.*+?()[]{}|", cp))
return nullptr; return nullptr;
++m_pos; ++m_pos;
return make_ast_node(Op::Literal, cp); return new_node(ParsedRegex::Literal, cp);
} }
} }
@ -244,7 +209,7 @@ private:
chars = character_class.additional_chars] (Codepoint cp) { chars = character_class.additional_chars] (Codepoint cp) {
return iswctype(cp, ctype) or contains(chars, cp); return iswctype(cp, ctype) or contains(chars, cp);
}); });
return make_ast_node(Op::Matcher, matcher_id); return new_node(ParsedRegex::Matcher, matcher_id);
} }
} }
@ -255,13 +220,13 @@ private:
for (auto& control : control_escapes) for (auto& control : control_escapes)
{ {
if (control.name == cp) if (control.name == cp)
return make_ast_node(Op::Literal, control.value); return new_node(ParsedRegex::Literal, control.value);
} }
// TODO: \c..., \0..., '\0x...', \u...
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return make_ast_node(Op::Literal, cp); return new_node(ParsedRegex::Literal, cp);
parse_error("unknown atom escape"); parse_error("unknown atom escape");
} }
@ -271,6 +236,7 @@ private:
if (negative) if (negative)
++m_pos; ++m_pos;
struct CharRange { Codepoint min, max; };
Vector<CharRange> ranges; Vector<CharRange> ranges;
Vector<std::pair<wctype_t, bool>> ctypes; Vector<std::pair<wctype_t, bool>> ctypes;
while (m_pos != m_regex.end() and *m_pos != ']') while (m_pos != m_regex.end() and *m_pos != ']')
@ -327,13 +293,13 @@ private:
auto matcher_id = m_parsed_regex.matchers.size(); auto matcher_id = m_parsed_regex.matchers.size();
m_parsed_regex.matchers.push_back(std::move(matcher)); m_parsed_regex.matchers.push_back(std::move(matcher));
return make_ast_node(Op::Matcher, matcher_id); return new_node(ParsedRegex::Matcher, matcher_id);
} }
Quantifier quantifier() ParsedRegex::Quantifier quantifier()
{ {
if (at_end()) if (at_end())
return {Quantifier::One}; return {ParsedRegex::Quantifier::One};
auto read_int = [](auto& pos, auto begin, auto end) { auto read_int = [](auto& pos, auto begin, auto end) {
int res = 0; int res = 0;
@ -349,9 +315,9 @@ private:
switch (*m_pos) switch (*m_pos)
{ {
case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore}; case '*': ++m_pos; return {ParsedRegex::Quantifier::RepeatZeroOrMore};
case '+': ++m_pos; return {Quantifier::RepeatOneOrMore}; case '+': ++m_pos; return {ParsedRegex::Quantifier::RepeatOneOrMore};
case '?': ++m_pos; return {Quantifier::Optional}; case '?': ++m_pos; return {ParsedRegex::Quantifier::Optional};
case '{': case '{':
{ {
auto it = m_pos+1; auto it = m_pos+1;
@ -365,12 +331,19 @@ private:
if (*it++ != '}') if (*it++ != '}')
parse_error("expected closing bracket"); parse_error("expected closing bracket");
m_pos = it; m_pos = it;
return {Quantifier::RepeatMinMax, min, max}; return {ParsedRegex::Quantifier::RepeatMinMax, min, max};
} }
default: return {Quantifier::One}; default: return {ParsedRegex::Quantifier::One};
} }
} }
static AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1,
ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One})
{
return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, {}}};
}
bool at_end() const { return m_pos == m_regex.end(); } bool at_end() const { return m_pos == m_regex.end(); }
[[gnu::noreturn]] [[gnu::noreturn]]
@ -405,168 +378,210 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6]
{ 's', "space", "", true }, { 's', "space", "", true },
}; };
CompiledRegex::Offset alloc_offset(CompiledRegex& program) struct CompiledRegex
{ {
auto pos = program.bytecode.size(); enum Op : char
program.bytecode.resize(pos + sizeof(CompiledRegex::Offset));
return pos;
}
CompiledRegex::Offset& get_offset(CompiledRegex& program, CompiledRegex::Offset pos)
{
return *reinterpret_cast<CompiledRegex::Offset*>(&program.bytecode[pos]);
}
void push_codepoint(CompiledRegex& program, Codepoint cp)
{
utf8::dump(std::back_inserter(program.bytecode), cp);
}
CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node);
CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node)
{
const auto start_pos = program.bytecode.size();
const Codepoint capture = (node->op == Op::Alternation or node->op == Op::Sequence) ? node->value : -1;
if (capture != -1)
{ {
program.bytecode.push_back(CompiledRegex::Save); Match,
program.bytecode.push_back(capture * 2); Literal,
AnyChar,
Matcher,
Jump,
Split_PrioritizeParent,
Split_PrioritizeChild,
Save,
LineStart,
LineEnd,
WordBoundary,
NotWordBoundary,
SubjectBegin,
SubjectEnd,
};
using Offset = unsigned;
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count;
};
struct RegexCompiler
{
RegexCompiler(const ParsedRegex& parsed_regex)
: m_parsed_regex{parsed_regex}
{
write_search_prefix();
compile_node(m_parsed_regex.ast);
push_op(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
m_program.save_count = m_parsed_regex.capture_count * 2;
} }
Vector<CompiledRegex::Offset> goto_inner_end_offsets; CompiledRegex get_compiled_regex() { return std::move(m_program); }
switch (node->op)
using Offset = CompiledRegex::Offset;
static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
static CompiledRegex compile(StringView re)
{ {
case Op::Literal: return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex();
program.bytecode.push_back(CompiledRegex::Literal); }
push_codepoint(program, node->value);
break; private:
case Op::AnyChar: Offset compile_node_inner(const ParsedRegex::AstNodePtr& node)
program.bytecode.push_back(CompiledRegex::AnyChar); {
break; const auto start_pos = m_program.bytecode.size();
case Op::Matcher:
program.bytecode.push_back(CompiledRegex::Matcher); const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
program.bytecode.push_back(node->value); if (capture != -1)
case Op::Sequence:
for (auto& child : node->children)
compile_node(program, parsed_regex, child);
break;
case Op::Alternation:
{ {
auto& children = node->children; push_op(CompiledRegex::Save);
kak_assert(children.size() == 2); push_byte(capture * 2);
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
auto offset = alloc_offset(program);
compile_node(program, parsed_regex, children[0]);
program.bytecode.push_back(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(alloc_offset(program));
auto right_pos = compile_node(program, parsed_regex, children[1]);
get_offset(program, offset) = right_pos;
break;
} }
case Op::LineStart:
program.bytecode.push_back(CompiledRegex::LineStart); Vector<Offset> goto_inner_end_offsets;
break; switch (node->op)
case Op::LineEnd: {
program.bytecode.push_back(CompiledRegex::LineEnd); case ParsedRegex::Literal:
break; push_op(CompiledRegex::Literal);
case Op::WordBoundary: push_codepoint(node->value);
program.bytecode.push_back(CompiledRegex::WordBoundary); break;
break; case ParsedRegex::AnyChar:
case Op::NotWordBoundary: push_op(CompiledRegex::AnyChar);
program.bytecode.push_back(CompiledRegex::NotWordBoundary); break;
break; case ParsedRegex::Matcher:
case Op::SubjectBegin: push_op(CompiledRegex::Matcher);
program.bytecode.push_back(CompiledRegex::SubjectBegin); push_byte(node->value);
break; case ParsedRegex::Sequence:
case Op::SubjectEnd: for (auto& child : node->children)
program.bytecode.push_back(CompiledRegex::SubjectEnd); compile_node(child);
break; break;
case ParsedRegex::Alternation:
{
auto& children = node->children;
kak_assert(children.size() == 2);
push_op(CompiledRegex::Split_PrioritizeParent);
auto offset = alloc_offset();
compile_node(children[0]);
push_op(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(alloc_offset());
auto right_pos = compile_node(children[1]);
get_offset(offset) = right_pos;
break;
}
case ParsedRegex::LineStart:
push_op(CompiledRegex::LineStart);
break;
case ParsedRegex::LineEnd:
push_op(CompiledRegex::LineEnd);
break;
case ParsedRegex::WordBoundary:
push_op(CompiledRegex::WordBoundary);
break;
case ParsedRegex::NotWordBoundary:
push_op(CompiledRegex::NotWordBoundary);
break;
case ParsedRegex::SubjectBegin:
push_op(CompiledRegex::SubjectBegin);
break;
case ParsedRegex::SubjectEnd:
push_op(CompiledRegex::SubjectEnd);
break;
}
for (auto& offset : goto_inner_end_offsets)
get_offset(offset) = m_program.bytecode.size();
if (capture != -1)
{
push_op(CompiledRegex::Save);
push_byte(capture * 2 + 1);
}
return start_pos;
} }
for (auto& offset : goto_inner_end_offsets) Offset compile_node(const ParsedRegex::AstNodePtr& node)
get_offset(program, offset) = program.bytecode.size();
if (capture != -1)
{ {
program.bytecode.push_back(CompiledRegex::Save); Offset pos = m_program.bytecode.size();
program.bytecode.push_back(capture * 2 + 1); Vector<Offset> goto_end_offsets;
if (node->quantifier.allows_none())
{
push_op(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset());
}
auto inner_pos = compile_node_inner(node);
// Write the node multiple times when we have a min count quantifier
for (int i = 1; i < node->quantifier.min; ++i)
inner_pos = compile_node_inner(node);
if (node->quantifier.allows_infinite_repeat())
{
push_op(CompiledRegex::Split_PrioritizeChild);
get_offset(alloc_offset()) = inner_pos;
}
// Write the node as an optional match for the min -> max counts
else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
i < node->quantifier.max; ++i)
{
push_op(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset());
compile_node_inner(node);
}
for (auto offset : goto_end_offsets)
get_offset(offset) = m_program.bytecode.size();
return pos;
} }
return start_pos; // Add a '.*' as the first instructions for the search use case
} void write_search_prefix()
CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node)
{
CompiledRegex::Offset pos = program.bytecode.size();
Vector<CompiledRegex::Offset> goto_end_offsets;
if (node->quantifier.allows_none())
{ {
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); kak_assert(m_program.bytecode.empty());
goto_end_offsets.push_back(alloc_offset(program)); push_op(CompiledRegex::Split_PrioritizeChild);
get_offset(alloc_offset()) = search_prefix_size;
push_op(CompiledRegex::AnyChar);
push_op(CompiledRegex::Split_PrioritizeParent);
get_offset(alloc_offset()) = 1 + sizeof(Offset);
} }
auto inner_pos = compile_node_inner(program, parsed_regex, node); Offset alloc_offset()
// Write the node multiple times when we have a min count quantifier
for (int i = 1; i < node->quantifier.min; ++i)
inner_pos = compile_node_inner(program, parsed_regex, node);
if (node->quantifier.allows_infinite_repeat())
{ {
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild); auto pos = m_program.bytecode.size();
get_offset(program, alloc_offset(program)) = inner_pos; m_program.bytecode.resize(pos + sizeof(Offset));
return pos;
} }
// Write the node as an optional match for the min -> max counts
else for (int i = std::max(1, node->quantifier.min); // STILL UGLY ! Offset& get_offset(Offset pos)
i < node->quantifier.max; ++i)
{ {
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); return *reinterpret_cast<Offset*>(&m_program.bytecode[pos]);
goto_end_offsets.push_back(alloc_offset(program));
compile_node_inner(program, parsed_regex, node);
} }
for (auto offset : goto_end_offsets) void push_op(CompiledRegex::Op op)
get_offset(program, offset) = program.bytecode.size(); {
m_program.bytecode.push_back(op);
}
return pos; void push_byte(char byte)
} {
m_program.bytecode.push_back(byte);
}
constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offset); void push_codepoint(Codepoint cp)
{
utf8::dump(std::back_inserter(m_program.bytecode), cp);
}
// Add a '.*' as the first instructions for the search use case CompiledRegex m_program;
void write_search_prefix(CompiledRegex& program) const ParsedRegex& m_parsed_regex;
{ };
kak_assert(program.bytecode.empty());
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild);
get_offset(program, alloc_offset(program)) = prefix_size;
program.bytecode.push_back(CompiledRegex::AnyChar);
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset);
}
CompiledRegex compile(const ParsedRegex& parsed_regex)
{
CompiledRegex res;
write_search_prefix(res);
compile_node(res, parsed_regex, parsed_regex.ast);
res.bytecode.push_back(CompiledRegex::Match);
res.matchers = parsed_regex.matchers;
res.save_count = parsed_regex.capture_count * 2;
return res;
}
CompiledRegex compile(StringView re)
{
return compile(RegexParser{re}.get_parsed_regex());
}
}
void dump(const CompiledRegex& program) void dump(const CompiledRegex& program)
{ {
@ -728,7 +743,7 @@ struct ThreadedRegexVM
{ {
bool found_match = false; bool found_match = false;
m_threads.clear(); m_threads.clear();
add_thread(0, match ? RegexCompiler::prefix_size : 0, add_thread(0, match ? RegexCompiler::search_prefix_size : 0,
Vector<const char*>(m_program.save_count, nullptr)); Vector<const char*>(m_program.save_count, nullptr));
m_begin = data.begin(); m_begin = data.begin();
@ -814,7 +829,7 @@ void validate_regex(StringView re)
{ {
try try
{ {
RegexCompiler::RegexParser{re}; RegexParser{re};
} }
catch (runtime_error& err) catch (runtime_error& err)
{ {