Regex: Refactor regex compilation to a regular RegexCompiler class

This commit is contained in:
Maxime Coste 2017-09-26 23:04:47 +09:00
parent d5717edc9d
commit f7468b576e

View File

@ -13,18 +13,15 @@
namespace Kakoune
{
struct CompiledRegex
struct ParsedRegex
{
enum Op : char
enum Op
{
Match,
Literal,
AnyChar,
Matcher,
Jump,
Split_PrioritizeParent,
Split_PrioritizeChild,
Save,
Sequence,
Alternation,
LineStart,
LineEnd,
WordBoundary,
@ -33,84 +30,49 @@ struct CompiledRegex
SubjectEnd,
};
using Offset = unsigned;
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count;
};
namespace RegexCompiler
{
struct Quantifier
{
enum Type
struct Quantifier
{
One,
Optional,
RepeatZeroOrMore,
RepeatOneOrMore,
RepeatMinMax,
enum Type
{
One,
Optional,
RepeatZeroOrMore,
RepeatOneOrMore,
RepeatMinMax,
};
Type type = One;
int min = -1, max = -1;
bool allows_none() const
{
return type == Quantifier::Optional or
type == Quantifier::RepeatZeroOrMore or
(type == Quantifier::RepeatMinMax and min <= 0);
}
bool allows_infinite_repeat() const
{
return type == Quantifier::RepeatZeroOrMore or
type == Quantifier::RepeatOneOrMore or
(type == Quantifier::RepeatMinMax and max == -1);
};
};
Type type = One;
int min = -1, max = -1;
bool allows_none() const
struct AstNode
{
return type == Quantifier::Optional or
type == Quantifier::RepeatZeroOrMore or
(type == Quantifier::RepeatMinMax and min <= 0);
}
bool allows_infinite_repeat() const
{
return type == Quantifier::RepeatZeroOrMore or
type == Quantifier::RepeatOneOrMore or
(type == Quantifier::RepeatMinMax and max == -1);
Op op;
Codepoint value;
Quantifier quantifier;
Vector<std::unique_ptr<AstNode>> children;
};
};
enum class Op
{
Literal,
AnyChar,
Matcher,
Sequence,
Alternation,
LineStart,
LineEnd,
WordBoundary,
NotWordBoundary,
SubjectBegin,
SubjectEnd,
};
using AstNodePtr = std::unique_ptr<AstNode>;
struct AstNode
{
Op op;
Codepoint value;
Quantifier quantifier;
Vector<std::unique_ptr<AstNode>> children;
};
using AstNodePtr = std::unique_ptr<AstNode>;
struct CharRange { Codepoint min, max; };
struct ParsedRegex
{
AstNodePtr ast;
size_t capture_count;
Vector<std::function<bool (Codepoint)>> matchers;
};
AstNodePtr make_ast_node(Op op, Codepoint value = -1,
Quantifier quantifier = {Quantifier::One})
{
return AstNodePtr{new AstNode{op, value, quantifier, {}}};
}
// Recursive descent parser based on naming used in the ECMAScript
// standard, although the syntax is not fully compatible.
struct RegexParser
@ -124,6 +86,8 @@ struct RegexParser
ParsedRegex get_parsed_regex() { return std::move(m_parsed_regex); }
static ParsedRegex parse(StringView re) { return RegexParser{re}.get_parsed_regex(); }
private:
struct InvalidPolicy
{
@ -131,6 +95,7 @@ private:
};
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
using AstNodePtr = ParsedRegex::AstNodePtr;
AstNodePtr disjunction(unsigned capture = -1)
{
@ -142,7 +107,7 @@ private:
}
++m_pos;
AstNodePtr res = make_ast_node(Op::Alternation);
AstNodePtr res = new_node(ParsedRegex::Alternation);
res->children.push_back(std::move(node));
res->children.push_back(disjunction());
res->value = capture;
@ -151,7 +116,7 @@ private:
AstNodePtr alternative()
{
AstNodePtr res = make_ast_node(Op::Sequence);
AstNodePtr res = new_node(ParsedRegex::Sequence);
while (auto node = term())
res->children.push_back(std::move(node));
if (res->children.empty())
@ -178,17 +143,17 @@ private:
switch (*m_pos)
{
case '^': ++m_pos; return make_ast_node(Op::LineStart);
case '$': ++m_pos; return make_ast_node(Op::LineEnd);
case '^': ++m_pos; return new_node(ParsedRegex::LineStart);
case '$': ++m_pos; return new_node(ParsedRegex::LineEnd);
case '\\':
if (m_pos+1 == m_regex.end())
return nullptr;
switch (*(m_pos+1))
{
case 'b': m_pos += 2; return make_ast_node(Op::WordBoundary);
case 'B': m_pos += 2; return make_ast_node(Op::NotWordBoundary);
case '`': m_pos += 2; return make_ast_node(Op::SubjectBegin);
case '\'': m_pos += 2; return make_ast_node(Op::SubjectEnd);
case 'b': m_pos += 2; return new_node(ParsedRegex::WordBoundary);
case 'B': m_pos += 2; return new_node(ParsedRegex::NotWordBoundary);
case '`': m_pos += 2; return new_node(ParsedRegex::SubjectBegin);
case '\'': m_pos += 2; return new_node(ParsedRegex::SubjectEnd);
}
break;
/* TODO: look ahead, look behind */
@ -204,7 +169,7 @@ private:
const Codepoint cp = *m_pos;
switch (cp)
{
case '.': ++m_pos; return make_ast_node(Op::AnyChar);
case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
case '(':
{
++m_pos;
@ -225,7 +190,7 @@ private:
if (contains("^$.*+?()[]{}|", cp))
return nullptr;
++m_pos;
return make_ast_node(Op::Literal, cp);
return new_node(ParsedRegex::Literal, cp);
}
}
@ -244,7 +209,7 @@ private:
chars = character_class.additional_chars] (Codepoint cp) {
return iswctype(cp, ctype) or contains(chars, cp);
});
return make_ast_node(Op::Matcher, matcher_id);
return new_node(ParsedRegex::Matcher, matcher_id);
}
}
@ -255,13 +220,13 @@ private:
for (auto& control : control_escapes)
{
if (control.name == cp)
return make_ast_node(Op::Literal, control.value);
return new_node(ParsedRegex::Literal, control.value);
}
// TOOD: \c..., \0..., '\0x...', \u...
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return make_ast_node(Op::Literal, cp);
return new_node(ParsedRegex::Literal, cp);
parse_error("unknown atom escape");
}
@ -271,6 +236,7 @@ private:
if (negative)
++m_pos;
struct CharRange { Codepoint min, max; };
Vector<CharRange> ranges;
Vector<std::pair<wctype_t, bool>> ctypes;
while (m_pos != m_regex.end() and *m_pos != ']')
@ -327,13 +293,13 @@ private:
auto matcher_id = m_parsed_regex.matchers.size();
m_parsed_regex.matchers.push_back(std::move(matcher));
return make_ast_node(Op::Matcher, matcher_id);
return new_node(ParsedRegex::Matcher, matcher_id);
}
Quantifier quantifier()
ParsedRegex::Quantifier quantifier()
{
if (at_end())
return {Quantifier::One};
return {ParsedRegex::Quantifier::One};
auto read_int = [](auto& pos, auto begin, auto end) {
int res = 0;
@ -349,9 +315,9 @@ private:
switch (*m_pos)
{
case '*': ++m_pos; return {Quantifier::RepeatZeroOrMore};
case '+': ++m_pos; return {Quantifier::RepeatOneOrMore};
case '?': ++m_pos; return {Quantifier::Optional};
case '*': ++m_pos; return {ParsedRegex::Quantifier::RepeatZeroOrMore};
case '+': ++m_pos; return {ParsedRegex::Quantifier::RepeatOneOrMore};
case '?': ++m_pos; return {ParsedRegex::Quantifier::Optional};
case '{':
{
auto it = m_pos+1;
@ -365,12 +331,19 @@ private:
if (*it++ != '}')
parse_error("expected closing bracket");
m_pos = it;
return {Quantifier::RepeatMinMax, min, max};
return {ParsedRegex::Quantifier::RepeatMinMax, min, max};
}
default: return {Quantifier::One};
default: return {ParsedRegex::Quantifier::One};
}
}
static AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1,
ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One})
{
return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, {}}};
}
bool at_end() const { return m_pos == m_regex.end(); }
[[gnu::noreturn]]
@ -405,168 +378,210 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6]
{ 's', "space", "", true },
};
CompiledRegex::Offset alloc_offset(CompiledRegex& program)
struct CompiledRegex
{
auto pos = program.bytecode.size();
program.bytecode.resize(pos + sizeof(CompiledRegex::Offset));
return pos;
}
CompiledRegex::Offset& get_offset(CompiledRegex& program, CompiledRegex::Offset pos)
{
return *reinterpret_cast<CompiledRegex::Offset*>(&program.bytecode[pos]);
}
void push_codepoint(CompiledRegex& program, Codepoint cp)
{
utf8::dump(std::back_inserter(program.bytecode), cp);
}
CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node);
CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node)
{
const auto start_pos = program.bytecode.size();
const Codepoint capture = (node->op == Op::Alternation or node->op == Op::Sequence) ? node->value : -1;
if (capture != -1)
enum Op : char
{
program.bytecode.push_back(CompiledRegex::Save);
program.bytecode.push_back(capture * 2);
Match,
Literal,
AnyChar,
Matcher,
Jump,
Split_PrioritizeParent,
Split_PrioritizeChild,
Save,
LineStart,
LineEnd,
WordBoundary,
NotWordBoundary,
SubjectBegin,
SubjectEnd,
};
using Offset = unsigned;
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count;
};
struct RegexCompiler
{
RegexCompiler(const ParsedRegex& parsed_regex)
: m_parsed_regex{parsed_regex}
{
write_search_prefix();
compile_node(m_parsed_regex.ast);
push_op(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
m_program.save_count = m_parsed_regex.capture_count * 2;
}
Vector<CompiledRegex::Offset> goto_inner_end_offsets;
switch (node->op)
CompiledRegex get_compiled_regex() { return std::move(m_program); }
using Offset = CompiledRegex::Offset;
static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
static CompiledRegex compile(StringView re)
{
case Op::Literal:
program.bytecode.push_back(CompiledRegex::Literal);
push_codepoint(program, node->value);
break;
case Op::AnyChar:
program.bytecode.push_back(CompiledRegex::AnyChar);
break;
case Op::Matcher:
program.bytecode.push_back(CompiledRegex::Matcher);
program.bytecode.push_back(node->value);
case Op::Sequence:
for (auto& child : node->children)
compile_node(program, parsed_regex, child);
break;
case Op::Alternation:
return RegexCompiler{RegexParser::parse(re)}.get_compiled_regex();
}
private:
Offset compile_node_inner(const ParsedRegex::AstNodePtr& node)
{
const auto start_pos = m_program.bytecode.size();
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
if (capture != -1)
{
auto& children = node->children;
kak_assert(children.size() == 2);
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
auto offset = alloc_offset(program);
compile_node(program, parsed_regex, children[0]);
program.bytecode.push_back(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(alloc_offset(program));
auto right_pos = compile_node(program, parsed_regex, children[1]);
get_offset(program, offset) = right_pos;
break;
push_op(CompiledRegex::Save);
push_byte(capture * 2);
}
case Op::LineStart:
program.bytecode.push_back(CompiledRegex::LineStart);
break;
case Op::LineEnd:
program.bytecode.push_back(CompiledRegex::LineEnd);
break;
case Op::WordBoundary:
program.bytecode.push_back(CompiledRegex::WordBoundary);
break;
case Op::NotWordBoundary:
program.bytecode.push_back(CompiledRegex::NotWordBoundary);
break;
case Op::SubjectBegin:
program.bytecode.push_back(CompiledRegex::SubjectBegin);
break;
case Op::SubjectEnd:
program.bytecode.push_back(CompiledRegex::SubjectEnd);
break;
Vector<Offset> goto_inner_end_offsets;
switch (node->op)
{
case ParsedRegex::Literal:
push_op(CompiledRegex::Literal);
push_codepoint(node->value);
break;
case ParsedRegex::AnyChar:
push_op(CompiledRegex::AnyChar);
break;
case ParsedRegex::Matcher:
push_op(CompiledRegex::Matcher);
push_byte(node->value);
case ParsedRegex::Sequence:
for (auto& child : node->children)
compile_node(child);
break;
case ParsedRegex::Alternation:
{
auto& children = node->children;
kak_assert(children.size() == 2);
push_op(CompiledRegex::Split_PrioritizeParent);
auto offset = alloc_offset();
compile_node(children[0]);
push_op(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(alloc_offset());
auto right_pos = compile_node(children[1]);
get_offset(offset) = right_pos;
break;
}
case ParsedRegex::LineStart:
push_op(CompiledRegex::LineStart);
break;
case ParsedRegex::LineEnd:
push_op(CompiledRegex::LineEnd);
break;
case ParsedRegex::WordBoundary:
push_op(CompiledRegex::WordBoundary);
break;
case ParsedRegex::NotWordBoundary:
push_op(CompiledRegex::NotWordBoundary);
break;
case ParsedRegex::SubjectBegin:
push_op(CompiledRegex::SubjectBegin);
break;
case ParsedRegex::SubjectEnd:
push_op(CompiledRegex::SubjectEnd);
break;
}
for (auto& offset : goto_inner_end_offsets)
get_offset(offset) = m_program.bytecode.size();
if (capture != -1)
{
push_op(CompiledRegex::Save);
push_byte(capture * 2 + 1);
}
return start_pos;
}
for (auto& offset : goto_inner_end_offsets)
get_offset(program, offset) = program.bytecode.size();
if (capture != -1)
Offset compile_node(const ParsedRegex::AstNodePtr& node)
{
program.bytecode.push_back(CompiledRegex::Save);
program.bytecode.push_back(capture * 2 + 1);
Offset pos = m_program.bytecode.size();
Vector<Offset> goto_end_offsets;
if (node->quantifier.allows_none())
{
push_op(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset());
}
auto inner_pos = compile_node_inner(node);
// Write the node multiple times when we have a min count quantifier
for (int i = 1; i < node->quantifier.min; ++i)
inner_pos = compile_node_inner(node);
if (node->quantifier.allows_infinite_repeat())
{
push_op(CompiledRegex::Split_PrioritizeChild);
get_offset(alloc_offset()) = inner_pos;
}
// Write the node as an optional match for the min -> max counts
else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
i < node->quantifier.max; ++i)
{
push_op(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset());
compile_node_inner(node);
}
for (auto offset : goto_end_offsets)
get_offset(offset) = m_program.bytecode.size();
return pos;
}
return start_pos;
}
CompiledRegex::Offset compile_node(CompiledRegex& program, const ParsedRegex& parsed_regex, const AstNodePtr& node)
{
CompiledRegex::Offset pos = program.bytecode.size();
Vector<CompiledRegex::Offset> goto_end_offsets;
if (node->quantifier.allows_none())
// Add a '.*' as the first instructions for the search use case
void write_search_prefix()
{
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset(program));
kak_assert(m_program.bytecode.empty());
push_op(CompiledRegex::Split_PrioritizeChild);
get_offset(alloc_offset()) = search_prefix_size;
push_op(CompiledRegex::AnyChar);
push_op(CompiledRegex::Split_PrioritizeParent);
get_offset(alloc_offset()) = 1 + sizeof(Offset);
}
auto inner_pos = compile_node_inner(program, parsed_regex, node);
// Write the node multiple times when we have a min count quantifier
for (int i = 1; i < node->quantifier.min; ++i)
inner_pos = compile_node_inner(program, parsed_regex, node);
if (node->quantifier.allows_infinite_repeat())
Offset alloc_offset()
{
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild);
get_offset(program, alloc_offset(program)) = inner_pos;
auto pos = m_program.bytecode.size();
m_program.bytecode.resize(pos + sizeof(Offset));
return pos;
}
// Write the node as an optional match for the min -> max counts
else for (int i = std::max(1, node->quantifier.min); // STILL UGLY !
i < node->quantifier.max; ++i)
Offset& get_offset(Offset pos)
{
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
goto_end_offsets.push_back(alloc_offset(program));
compile_node_inner(program, parsed_regex, node);
return *reinterpret_cast<Offset*>(&m_program.bytecode[pos]);
}
for (auto offset : goto_end_offsets)
get_offset(program, offset) = program.bytecode.size();
void push_op(CompiledRegex::Op op)
{
m_program.bytecode.push_back(op);
}
return pos;
}
void push_byte(char byte)
{
m_program.bytecode.push_back(byte);
}
constexpr CompiledRegex::Offset prefix_size = 3 + 2 * sizeof(CompiledRegex::Offset);
void push_codepoint(Codepoint cp)
{
utf8::dump(std::back_inserter(m_program.bytecode), cp);
}
// Add a '.*' as the first instructions for the search use case
void write_search_prefix(CompiledRegex& program)
{
kak_assert(program.bytecode.empty());
program.bytecode.push_back(CompiledRegex::Split_PrioritizeChild);
get_offset(program, alloc_offset(program)) = prefix_size;
program.bytecode.push_back(CompiledRegex::AnyChar);
program.bytecode.push_back(CompiledRegex::Split_PrioritizeParent);
get_offset(program, alloc_offset(program)) = 1 + sizeof(CompiledRegex::Offset);
}
CompiledRegex compile(const ParsedRegex& parsed_regex)
{
CompiledRegex res;
write_search_prefix(res);
compile_node(res, parsed_regex, parsed_regex.ast);
res.bytecode.push_back(CompiledRegex::Match);
res.matchers = parsed_regex.matchers;
res.save_count = parsed_regex.capture_count * 2;
return res;
}
CompiledRegex compile(StringView re)
{
return compile(RegexParser{re}.get_parsed_regex());
}
}
CompiledRegex m_program;
const ParsedRegex& m_parsed_regex;
};
void dump(const CompiledRegex& program)
{
@ -728,7 +743,7 @@ struct ThreadedRegexVM
{
bool found_match = false;
m_threads.clear();
add_thread(0, match ? RegexCompiler::prefix_size : 0,
add_thread(0, match ? RegexCompiler::search_prefix_size : 0,
Vector<const char*>(m_program.save_count, nullptr));
m_begin = data.begin();
@ -814,7 +829,7 @@ void validate_regex(StringView re)
{
try
{
RegexCompiler::RegexParser{re};
RegexParser{re};
}
catch (runtime_error& err)
{