Regex: Refactor regex compilation to a regular RegexCompiler class

This commit is contained in:
Maxime Coste 2017-09-26 23:04:47 +09:00
parent d5717edc9d
commit f7468b576e

View File

@ -13,18 +13,15 @@
namespace Kakoune namespace Kakoune
{ {
struct CompiledRegex struct ParsedRegex
{ {
enum Op : char enum Op
{ {
Match,
Literal, Literal,
AnyChar, AnyChar,
Matcher, Matcher,
Jump, Sequence,
Split_PrioritizeParent, Alternation,
Split_PrioritizeChild,
Save,
LineStart, LineStart,
LineEnd, LineEnd,
WordBoundary, WordBoundary,
@ -33,16 +30,6 @@ struct CompiledRegex
SubjectEnd, SubjectEnd,
}; };
using Offset = unsigned;
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count;
};
namespace RegexCompiler
{
struct Quantifier struct Quantifier
{ {
enum Type enum Type
@ -71,21 +58,6 @@ struct Quantifier
}; };
}; };
enum class Op
{
Literal,
AnyChar,
Matcher,
Sequence,
Alternation,
LineStart,
LineEnd,
WordBoundary,
NotWordBoundary,
SubjectBegin,
SubjectEnd,
};
struct AstNode struct AstNode
{ {
Op op; Op op;
@ -96,21 +68,11 @@ struct AstNode
using AstNodePtr = std::unique_ptr<AstNode>; using AstNodePtr = std::unique_ptr<AstNode>;
struct CharRange { Codepoint min, max; };
struct ParsedRegex
{
AstNodePtr ast; AstNodePtr ast;
size_t capture_count; size_t capture_count;
Vector<std::function<bool (Codepoint)>> matchers; Vector<std::function<bool (Codepoint)>> matchers;
}; };
AstNodePtr make_ast_node(Op op, Codepoint value = -1,
Quantifier quantifier = {Quantifier::One})
{
return AstNodePtr{new AstNode{op, value, quantifier, {}}};
}
// Recursive descent parser based on naming used in the ECMAScript // Recursive descent parser based on naming used in the ECMAScript
// standard, although the syntax is not fully compatible. // standard, although the syntax is not fully compatible.
struct RegexParser struct RegexParser
@ -124,6 +86,8 @@ struct RegexParser
ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); } ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
static ParsedRegex parse(StringView re) { return RegexParser{re}.get_parsed_regex(); }
private: private:
struct InvalidPolicy struct InvalidPolicy
{ {
@ -131,6 +95,7 @@ private:
}; };
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>; using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
using AstNodePtr = ParsedRegex::AstNodePtr;
AstNodePtr disjunction(unsigned capture = -1) AstNodePtr disjunction(unsigned capture = -1)
{ {
@ -142,7 +107,7 @@ private:
} }
++m_pos; ++m_pos;
AstNodePtr res = make_ast_node(Op::Alternation); AstNodePtr res = new_node(ParsedRegex::Alternation);
res->children.push_back(std::move(node)); res->children.push_back(std::move(node));
res->children.push_back(disjunction()); res->children.push_back(disjunction());
res->value = capture; res->value = capture;
@ -151,7 +116,7 @@ private:
AstNodePtr alternative() AstNodePtr alternative()
{ {
AstNodePtr res = make_ast_node(Op::Sequence); AstNodePtr res = new_node(ParsedRegex::Sequence);
while (auto node = term()) while (auto node = term())
res->children.push_back(std::move(node)); res->children.push_back(std::move(node));
if (res->children.empty()) if (res->children.empty())
@ -178,17 +143,17 @@ private:
switch (*m_pos) switch (*m_pos)
{ {
case '^': ++m_pos; return make_ast_node(Op::LineStart); case '^': ++m_pos; return new_node(ParsedRegex::LineStart);
case '$': ++m_pos; return make_ast_node(Op::LineEnd); case '$': ++m_pos; return new_node(ParsedRegex::LineEnd);
case '\\': case '\\':
if (m_pos+1 == m_regex.end()) if (m_pos+1 == m_regex.end())
return nullptr; return nullptr;
switch (*(m_pos+1)) switch (*(m_pos+1))
{ {
case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary); case 'b': m_pos += 2; return new_node(ParsedRegex::WordBoundary);
case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary); case 'B': m_pos += 2; return new_node(ParsedRegex::NotWordBoundary);
case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin); case '`': m_pos += 2; return new_node(ParsedRegex::SubjectBegin);
case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd); case '\'': m_pos += 2; return new_node(ParsedRegex::SubjectEnd);
} }
break; break;
/* TODO: look ahead, look behind */ /* TODO: look ahead, look behind */
@ -204,7 +169,7 @@ private:
const Codepoint cp = *m_pos; const Codepoint cp = *m_pos;
switch (cp) switch (cp)
{ {
case '.': ++m_pos; return make_ast_node(Op::AnyChar); case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
case '(': case '(':
{ {
++m_pos; ++m_pos;
@ -225,7 +190,7 @@ private:
if (contains("^$.*+?()[]{}|", cp)) if (contains("^$.*+?()[]{}|", cp))
return nullptr; return nullptr;
++m_pos; ++m_pos;
return make_ast_node(Op::Literal, cp); return new_node(ParsedRegex::Literal, cp);
} }
} }
@ -244,7 +209,7 @@ private:
chars = character_class.additional_chars] (Codepoint cp) { chars = character_class.additional_chars] (Codepoint cp) {
return iswctype(cp, ctype) or contains(chars, cp); return iswctype(cp, ctype) or contains(chars, cp);
}); });
return make_ast_node(Op::Matcher, matcher_id); return new_node(ParsedRegex::Matcher, matcher_id);
} }
} }
@ -255,13 +220,13 @@ private:
for (auto& control : control_escapes) for (auto& control : control_escapes)
{ {
if (control.name == cp) if (control.name == cp)
return make_ast_node(Op::Literal, control.value); return new_node(ParsedRegex::Literal, control.value);
} }
// TOOD: \c..., \0..., '\0x...', \u... // TOOD: \c..., \0..., '\0x...', \u...
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return make_ast_node(Op::Literal, cp); return new_node(ParsedRegex::Literal, cp);
parse_error("unknown atom escape"); parse_error("unknown atom escape");
} }
@ -271,6 +236,7 @@ private:
if (negative) if (negative)
++m_pos; ++m_pos;
struct CharRange { Codepoint min, max; };
Vector<CharRange> ranges; Vector<CharRange> ranges;
Vector<std::pair<wctype_t, bool>> ctypes; Vector<std::pair<wctype_t, bool>> ctypes;
while (m_pos != m_regex.end() and *m_pos != ']') while (m_pos != m_regex.end() and *m_pos != ']')
@ -327,13 +293,13 @@ private:
auto matcher_id = m_parsed_regex.matchers.size(); auto matcher_id = m_parsed_regex.matchers.size();
m_parsed_regex.matchers.push_back(std::move(matcher)); m_parsed_regex.matchers.push_back(std::move(matcher));
return make_ast_node(Op::Matcher, matcher_id); return new_node(ParsedRegex::Matcher, matcher_id);
} }
Quantifier quantifier() ParsedRegex::Quantifier quantifier()
{ {
if (at_end()) if (at_end())
return {Quantifier::One}; return {ParsedRegex::Quantifier::One};
auto read_int = [](auto& pos, auto begin, auto end) { auto read_int = [](auto& pos, auto begin, auto end) {
int res = 0; int res = 0;
@ -349,9 +315,9 @@ private:
switch (*m_pos) switch (*m_pos)
{ {
case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore}; case '*': ++m_pos; return {ParsedRegex::Quantifier::RepeatZeroOrMore};
case '+': ++m_pos; return {Quantifier::RepeatOneOrMore}; case '+': ++m_pos; return {ParsedRegex::Quantifier::RepeatOneOrMore};
case '?': ++m_pos; return {Quantifier::Optional}; case '?': ++m_pos; return {ParsedRegex::Quantifier::Optional};
case '{': case '{':
{ {
auto it = m_pos+1; auto it = m_pos+1;
@ -365,12 +331,19 @@ private:
if (*it++ != '}') if (*it++ != '}')
parse_error("expected closing bracket"); parse_error("expected closing bracket");
m_pos = it; m_pos = it;
return {Quantifier::RepeatMinMax, min, max}; return {ParsedRegex::Quantifier::RepeatMinMax, min, max};
} }
default: return {Quantifier::One}; default: return {ParsedRegex::Quantifier::One};
} }
} }
static AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1,
ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One})
{
return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, {}}};
}
bool at_end() const { return m_pos == m_regex.end(); } bool at_end() const { return m_pos == m_regex.end(); }
[[gnu::noreturn]] [[gnu::noreturn]]
@ -405,169 +378,211 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6]
{ 's', "space", "", true }, { 's', "space", "", true },
}; };
CompiledRegex::Offset alloc_offset(CompiledRegex& program) struct CompiledRegex
{ {
auto pos = program.bytecode.size(); enum Op : char
program.bytecode.resize(pos + sizeof(CompiledRegex::Offset)); {
return pos; Match,
Literal,
AnyChar,
Matcher,
Jump,
Split_PrioritizeParent,
Split_PrioritizeChild,
Save,
LineStart,
LineEnd,
WordBoundary,
NotWordBoundary,
SubjectBegin,
SubjectEnd,
};
using Offset = unsigned;
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count;
};
struct RegexCompiler
{
RegexCompiler(const ParsedRegex& parsed_regex)
: m_parsed_regex{parsed_regex}
{
write_search_prefix();
compile_node(m_parsed_regex.ast);
push_op(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
m_program.save_count = m_parsed_regex.capture_count * 2;
} }
CompiledRegex::Offset& get_offset(CompiledRegex& program, CompiledRegex::Offset pos) CompiledRegex get_compiled_regex() { return std::move(m_program); }
using Offset = CompiledRegex::Offset;
static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
static CompiledRegex compile(StringView re)
{ {
return *reinterpret_cast<CompiledRegex::Offset*>(&program.bytecode[pos]); return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex();
} }
void push_codepoint(CompiledRegex& program, Codepoint cp) private:
Offset compile_node_inner(const ParsedRegex::AstNodePtr& node)
{ {
utf8::dump(std::back_inserter(program.bytecode), cp); const auto start_pos = m_program.bytecode.size();
}
CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node); const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node)
{
const auto start_pos = program.bytecode.size();
const Codepoint capture = (node->op == Op::Alternation or node->op == Op::Sequence) ? node->value : -1;
if (capture != -1) if (capture != -1)
{ {
program.bytecode.push_back(CompiledRegex::Save); push_op(CompiledRegex::Save);
program.bytecode.push_back(capture * 2); push_byte(capture * 2);
} }
Vector<CompiledRegex::Offset> goto_inner_end_offsets; Vector<Offset> goto_inner_end_offsets;
switch (node->op) switch (node->op)
{ {
case Op::Literal: case ParsedRegex::Literal:
program.bytecode.push_back(CompiledRegex::Literal); push_op(CompiledRegex::Literal);
push_codepoint(program, node->value); push_codepoint(node->value);
break; break;
case Op::AnyChar: case ParsedRegex::AnyChar:
program.bytecode.push_back(CompiledRegex::AnyChar); push_op(CompiledRegex::AnyChar);
break; break;
case Op::Matcher: case ParsedRegex::Matcher:
program.bytecode.push_back(CompiledRegex::Matcher); push_op(CompiledRegex::Matcher);
program.bytecode.push_back(node->value); push_byte(node->value);
case Op::Sequence: case ParsedRegex::Sequence:
for (auto& child : node->children) for (auto& child : node->children)
compile_node(program, parsed_regex, child); compile_node(child);
break; break;
case Op::Alternation: case ParsedRegex::Alternation:
{ {
auto& children = node->children; auto& children = node->children;
kak_assert(children.size() == 2); kak_assert(children.size() == 2);
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); push_op(CompiledRegex::Split_PrioritizeParent);
auto offset = alloc_offset(program); auto offset = alloc_offset();
compile_node(program, parsed_regex, children[0]); compile_node(children[0]);
program.bytecode.push_back(CompiledRegex::Jump); push_op(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(alloc_offset(program)); goto_inner_end_offsets.push_back(alloc_offset());
auto right_pos = compile_node(program, parsed_regex, children[1]); auto right_pos = compile_node(children[1]);
get_offset(program, offset) = right_pos; get_offset(offset) = right_pos;
break; break;
} }
case Op::LineStart: case ParsedRegex::LineStart:
program.bytecode.push_back(CompiledRegex::LineStart); push_op(CompiledRegex::LineStart);
break; break;
case Op::LineEnd: case ParsedRegex::LineEnd:
program.bytecode.push_back(CompiledRegex::LineEnd); push_op(CompiledRegex::LineEnd);
break; break;
case Op::WordBoundary: case ParsedRegex::WordBoundary:
program.bytecode.push_back(CompiledRegex::WordBoundary); push_op(CompiledRegex::WordBoundary);
break; break;
case Op::NotWordBoundary: case ParsedRegex::NotWordBoundary:
program.bytecode.push_back(CompiledRegex::NotWordBoundary); push_op(CompiledRegex::NotWordBoundary);
break; break;
case Op::SubjectBegin: case ParsedRegex::SubjectBegin:
program.bytecode.push_back(CompiledRegex::SubjectBegin); push_op(CompiledRegex::SubjectBegin);
break; break;
case Op::SubjectEnd: case ParsedRegex::SubjectEnd:
program.bytecode.push_back(CompiledRegex::SubjectEnd); push_op(CompiledRegex::SubjectEnd);
break; break;
} }
for (auto& offset : goto_inner_end_offsets) for (auto& offset : goto_inner_end_offsets)
get_offset(program, offset) = program.bytecode.size(); get_offset(offset) = m_program.bytecode.size();
if (capture != -1) if (capture != -1)
{ {
program.bytecode.push_back(CompiledRegex::Save); push_op(CompiledRegex::Save);
program.bytecode.push_back(capture * 2 + 1); push_byte(capture * 2 + 1);
} }
return start_pos; return start_pos;
} }
CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node) Offset compile_node(const ParsedRegex::AstNodePtr& node)
{ {
CompiledRegex::Offset pos = program.bytecode.size(); Offset pos = m_program.bytecode.size();
Vector<CompiledRegex::Offset> goto_end_offsets; Vector<Offset> goto_end_offsets;
if (node->quantifier.allows_none()) if (node->quantifier.allows_none())
{ {
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); push_op(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset(program)); goto_end_offsets.push_back(alloc_offset());
} }
auto inner_pos = compile_node_inner(program, parsed_regex, node); auto inner_pos = compile_node_inner(node);
// Write the node multiple times when we have a min count quantifier // Write the node multiple times when we have a min count quantifier
for (int i = 1; i < node->quantifier.min; ++i) for (int i = 1; i < node->quantifier.min; ++i)
inner_pos = compile_node_inner(program, parsed_regex, node); inner_pos = compile_node_inner(node);
if (node->quantifier.allows_infinite_repeat()) if (node->quantifier.allows_infinite_repeat())
{ {
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild); push_op(CompiledRegex::Split_PrioritizeChild);
get_offset(program, alloc_offset(program)) = inner_pos; get_offset(alloc_offset()) = inner_pos;
} }
// Write the node as an optional match for the min -> max counts // Write the node as an optional match for the min -> max counts
else for (int i = std::max(1, node->quantifier.min); // STILL UGLY ! else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
i < node->quantifier.max; ++i) i < node->quantifier.max; ++i)
{ {
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); push_op(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset(program)); goto_end_offsets.push_back(alloc_offset());
compile_node_inner(program, parsed_regex, node); compile_node_inner(node);
} }
for (auto offset : goto_end_offsets) for (auto offset : goto_end_offsets)
get_offset(program, offset) = program.bytecode.size(); get_offset(offset) = m_program.bytecode.size();
return pos; return pos;
} }
constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offset);
// Add a '.*' as the first instructions for the search use case // Add a '.*' as the first instructions for the search use case
void write_search_prefix(CompiledRegex& program) void write_search_prefix()
{ {
kak_assert(program.bytecode.empty()); kak_assert(m_program.bytecode.empty());
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild); push_op(CompiledRegex::Split_PrioritizeChild);
get_offset(program, alloc_offset(program)) = prefix_size; get_offset(alloc_offset()) = search_prefix_size;
program.bytecode.push_back(CompiledRegex::AnyChar); push_op(CompiledRegex::AnyChar);
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent); push_op(CompiledRegex::Split_PrioritizeParent);
get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset); get_offset(alloc_offset()) = 1 + sizeof(Offset);
} }
CompiledRegex compile(const ParsedRegex& parsed_regex) Offset alloc_offset()
{ {
CompiledRegex res; auto pos = m_program.bytecode.size();
write_search_prefix(res); m_program.bytecode.resize(pos + sizeof(Offset));
compile_node(res, parsed_regex, parsed_regex.ast); return pos;
res.bytecode.push_back(CompiledRegex::Match);
res.matchers = parsed_regex.matchers;
res.save_count = parsed_regex.capture_count * 2;
return res;
} }
CompiledRegex compile(StringView re) Offset& get_offset(Offset pos)
{ {
return compile(RegexParser{re}.get_parsed_regex()); return *reinterpret_cast<Offset*>(&m_program.bytecode[pos]);
} }
void push_op(CompiledRegex::Op op)
{
m_program.bytecode.push_back(op);
} }
void push_byte(char byte)
{
m_program.bytecode.push_back(byte);
}
void push_codepoint(Codepoint cp)
{
utf8::dump(std::back_inserter(m_program.bytecode), cp);
}
CompiledRegex m_program;
const ParsedRegex& m_parsed_regex;
};
void dump(const CompiledRegex& program) void dump(const CompiledRegex& program)
{ {
for (auto pos = program.bytecode.data(), end = program.bytecode.data() + program.bytecode.size(); for (auto pos = program.bytecode.data(), end = program.bytecode.data() + program.bytecode.size();
@ -728,7 +743,7 @@ struct ThreadedRegexVM
{ {
bool found_match = false; bool found_match = false;
m_threads.clear(); m_threads.clear();
add_thread(0, match ? RegexCompiler::prefix_size : 0, add_thread(0, match ? RegexCompiler::search_prefix_size : 0,
Vector<const char*>(m_program.save_count, nullptr)); Vector<const char*>(m_program.save_count, nullptr));
m_begin = data.begin(); m_begin = data.begin();
@ -814,7 +829,7 @@ void validate_regex(StringView re)
{ {
try try
{ {
RegexCompiler::RegexParser{re}; RegexParser{re};
} }
catch (runtime_error& err) catch (runtime_error& err)
{ {