From a448e1e22248da39947d38ef520d0bdaf7eecbc9 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Sun, 17 Sep 2017 19:15:43 +0900 Subject: [PATCH] Regex: Code cleanup in the regex impl --- src/regex_impl.cc | 585 +++++++++++++++++++++++----------------------- 1 file changed, 296 insertions(+), 289 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 93b91c4b..d40402f2 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -3,283 +3,286 @@ #include "unit_tests.hh" #include "string.hh" #include "exception.hh" +#include "array_view.hh" namespace Kakoune { -struct RegexProgram +namespace RegexProgram { - enum Op : char - { - Match, - Literal, - AnyChar, - Jump, - Split, - LineStart, - LineEnd, - }; - - using Offset = size_t; - using Instructions = Vector; - - Instructions instructions; +enum Op : char +{ + Match, + Literal, + AnyChar, + Jump, + Split, + LineStart, + LineEnd, }; +using Offset = size_t; +} namespace RegexCompiler { - enum class Quantifier - { - One, - Optional, - RepeatZeroOrMore, - RepeatOneOrMore - }; +enum class Quantifier +{ + One, + Optional, + RepeatZeroOrMore, + RepeatOneOrMore +}; - enum class Op - { - Literal, - AnyChar, - Sequence, - Alternation, - LineStart, - LineEnd, - }; +enum class Op +{ + Literal, + AnyChar, + Sequence, + Alternation, + LineStart, + LineEnd, +}; - struct AstNode - { - Op op; - char value; - Quantifier quantifier; - Vector> children; - }; +struct AstNode +{ + Op op; + char value; + Quantifier quantifier; + Vector> children; +}; - using AstNodePtr = std::unique_ptr; +using AstNodePtr = std::unique_ptr; - AstNodePtr make_ast_node(Op op, char value = 0, - Quantifier quantifier = Quantifier::One) - { - return AstNodePtr{new AstNode{op, value, quantifier, {}}}; - } - - template - struct Parser - { - static AstNodePtr parse(Iterator pos, Iterator end) - { - return disjunction(pos, end); - } - - private: - static AstNodePtr disjunction(Iterator& pos, Iterator end) - { - AstNodePtr node = alternative(pos, end); - if (pos == end or *pos != '|') - return node; - - AstNodePtr res = make_ast_node(Op::Alternation); - res->children.push_back(std::move(node)); - while (pos != end and *pos == '|') - res->children.push_back(disjunction(++pos, end)); - return res; - } - - static AstNodePtr alternative(Iterator& pos, Iterator end) - { - AstNodePtr res = make_ast_node(Op::Sequence); - while (auto node = term(pos, end)) - res->children.push_back(std::move(node)); - return res; - } - - static AstNodePtr term(Iterator& pos, Iterator end) - { - if (auto node = assertion(pos, end)) - return node; - if (auto node = atom(pos, end)) - { - node->quantifier = quantifier(pos, end); - return node; - } - return nullptr; - } - - static AstNodePtr assertion(Iterator& pos, Iterator end) - { - switch (*pos) - { - case '^': ++pos; return make_ast_node(Op::LineStart); - case '$': ++pos; return make_ast_node(Op::LineEnd); - /* TODO: \`, \', \b, \B, look ahead, look behind */ - } - return nullptr; - } - - static AstNodePtr atom(Iterator& pos, Iterator end) - { - const auto c = *pos; - switch (c) - { - case '.': ++pos; return make_ast_node(Op::AnyChar); - case '(': - { - ++pos; - auto content = disjunction(pos, end); - if (pos == end or *pos != ')') - throw runtime_error{"Unclosed parenthesis"}; - ++pos; - return content; - } - default: - if (contains("^$.*+?()[]{}|", c)) - return nullptr; - ++pos; - return make_ast_node(Op::Literal, c); - } - } - - static Quantifier quantifier(Iterator& pos, Iterator end) - { - switch (*pos) - { - case '*': ++pos; return Quantifier::RepeatZeroOrMore; - case '+': ++pos; return Quantifier::RepeatOneOrMore; - case '?': ++pos; return Quantifier::Optional; - default: return Quantifier::One; - } - } - }; - - RegexProgram::Offset compile_node(RegexProgram& program, const AstNodePtr& node) - { - auto& insts = program.instructions; - RegexProgram::Offset pos = insts.size(); - - auto allow_none = [](Quantifier quantifier) { - return quantifier == Quantifier::Optional or - quantifier == Quantifier::RepeatZeroOrMore; - }; - - auto is_repeat = [](Quantifier quantifier) { - return quantifier == Quantifier::RepeatZeroOrMore or - quantifier == Quantifier::RepeatOneOrMore; - }; - - auto alloc_offsets = [](RegexProgram::Instructions& instructions, int count) { - auto pos = instructions.size(); - instructions.resize(instructions.size() + count * sizeof(RegexProgram::Offset)); - return pos; - }; - - auto get_offset = [](RegexProgram::Instructions& instructions, RegexProgram::Offset base, int index = 0) { - return reinterpret_cast(&instructions[base]) + index; - }; - - RegexProgram::Offset optional_offset = -1; - if (allow_none(node->quantifier)) - { - insts.push_back(RegexProgram::Split); - insts.push_back(2); - auto offsets = alloc_offsets(insts, 2); - *get_offset(insts, offsets) = insts.size(); - optional_offset = offsets; - } - - Vector goto_end_offsets; - auto content_pos = insts.size(); - switch (node->op) - { - case Op::Literal: - insts.push_back(RegexProgram::Literal); - insts.push_back(node->value); - break; - case Op::AnyChar: - insts.push_back(RegexProgram::AnyChar); - break; - case Op::Sequence: - for (auto& child : node->children) - compile_node(program, child); - break; - case Op::Alternation: - { - const auto count = node->children.size(); - if (count > 255) - throw runtime_error{"More than 255 elements in an alternation is not supported"}; - - insts.push_back(RegexProgram::Split); - insts.push_back(count); - auto offsets = alloc_offsets(insts, count); - auto& children = node->children; - for (int i = 0; i < children.size(); ++i) - { - auto child_pos = compile_node(program, children[i]); - *get_offset(insts, offsets, i) = child_pos; - // Jump to end after executing that children - insts.push_back(RegexProgram::Jump); - goto_end_offsets.push_back(alloc_offsets(insts, 1)); - } - break; - } - case Op::LineStart: - insts.push_back(RegexProgram::LineStart); - break; - case Op::LineEnd: - insts.push_back(RegexProgram::LineEnd); - break; - } - - for (auto& offset : goto_end_offsets) - *get_offset(insts, offset) = insts.size(); - - if (is_repeat(node->quantifier)) - { - insts.push_back(RegexProgram::Split); - insts.push_back(2); - auto offsets = alloc_offsets(insts, 2); - *get_offset(insts, offsets, 0) = content_pos; - *get_offset(insts, offsets, 1) = insts.size(); - } - - if (optional_offset != -1) - *get_offset(insts, optional_offset, 1) = insts.size(); - - return pos; - } - - RegexProgram compile(const AstNodePtr& node) - { - RegexProgram res; - compile_node(res, node); - res.instructions.push_back(RegexProgram::Match); - return res; - } +AstNodePtr make_ast_node(Op op, char value = 0, + Quantifier quantifier = Quantifier::One) +{ + return AstNodePtr{new AstNode{op, value, quantifier, {}}}; } -void dump_program(const RegexProgram& program) +template +struct Parser { - auto& insts = program.instructions; - for (size_t pos = 0; pos < insts.size(); ) + static AstNodePtr parse(Iterator pos, Iterator end) + { + return disjunction(pos, end); + } + +private: + static AstNodePtr disjunction(Iterator& pos, Iterator end) + { + AstNodePtr node = alternative(pos, end); + if (pos == end or *pos != '|') + return node; + + AstNodePtr res = make_ast_node(Op::Alternation); + res->children.push_back(std::move(node)); + while (pos != end and *pos == '|') + res->children.push_back(disjunction(++pos, end)); + return res; + } + + static AstNodePtr alternative(Iterator& pos, Iterator end) + { + AstNodePtr res = make_ast_node(Op::Sequence); + while (auto node = term(pos, end)) + res->children.push_back(std::move(node)); + return res; + } + + static AstNodePtr term(Iterator& pos, Iterator end) + { + if (auto node = assertion(pos, end)) + return node; + if (auto node = atom(pos, end)) + { + node->quantifier = quantifier(pos, end); + return node; + } + return nullptr; + } + + static AstNodePtr assertion(Iterator& pos, Iterator end) + { + switch (*pos) + { + case '^': ++pos; return make_ast_node(Op::LineStart); + case '$': ++pos; return make_ast_node(Op::LineEnd); + /* TODO: \`, \', \b, \B, look ahead, look behind */ + } + return nullptr; + } + + static AstNodePtr atom(Iterator& pos, Iterator end) + { + const auto c = *pos; + switch (c) + { + case '.': ++pos; return make_ast_node(Op::AnyChar); + case '(': + { + ++pos; + auto content = disjunction(pos, end); + if (pos == end or *pos != ')') + throw runtime_error{"Unclosed parenthesis"}; + ++pos; + return content; + } + default: + if (contains("^$.*+?()[]{}|", c)) + return nullptr; + ++pos; + return make_ast_node(Op::Literal, c); + } + } + + static Quantifier quantifier(Iterator& pos, Iterator end) + { + switch (*pos) + { + case '*': ++pos; return Quantifier::RepeatZeroOrMore; + case '+': ++pos; return Quantifier::RepeatOneOrMore; + case '?': ++pos; return Quantifier::Optional; + default: return Quantifier::One; + } + } +}; + +RegexProgram::Offset compile_node(Vector& program, const AstNodePtr& node) +{ + RegexProgram::Offset pos = program.size(); + + auto allow_none = [](Quantifier quantifier) { + return quantifier == Quantifier::Optional or + quantifier == Quantifier::RepeatZeroOrMore; + }; + + auto is_repeat = [](Quantifier quantifier) { + return quantifier == Quantifier::RepeatZeroOrMore or + quantifier == Quantifier::RepeatOneOrMore; + }; + + auto alloc_offsets = [](Vector& instructions, int count) { + auto pos = instructions.size(); + instructions.resize(instructions.size() + count * sizeof(RegexProgram::Offset)); + return pos; + }; + + auto get_offset = [](Vector& instructions, RegexProgram::Offset base, int index = 0) { + return reinterpret_cast(&instructions[base]) + index; + }; + + RegexProgram::Offset optional_offset = -1; + if (allow_none(node->quantifier)) + { + program.push_back(RegexProgram::Split); + program.push_back(2); + auto offsets = alloc_offsets(program, 2); + *get_offset(program, offsets) = program.size(); + optional_offset = offsets; + } + + Vector goto_end_offsets; + auto content_pos = program.size(); + switch (node->op) + { + case Op::Literal: + program.push_back(RegexProgram::Literal); + program.push_back(node->value); + break; + case Op::AnyChar: + program.push_back(RegexProgram::AnyChar); + break; + case Op::Sequence: + for (auto& child : node->children) + compile_node(program, child); + break; + case Op::Alternation: + { + const auto count = node->children.size(); + if (count > 255) + throw runtime_error{"More than 255 elements in an alternation is not supported"}; + + program.push_back(RegexProgram::Split); + program.push_back(count); + auto offsets = alloc_offsets(program, count); + auto& children = node->children; + for (int i = 0; i < children.size(); ++i) + { + auto child_pos = compile_node(program, children[i]); + *get_offset(program, offsets, i) = child_pos; + // Jump to end after executing that children + program.push_back(RegexProgram::Jump); + goto_end_offsets.push_back(alloc_offsets(program, 1)); + } + break; + } + case Op::LineStart: + program.push_back(RegexProgram::LineStart); + break; + case Op::LineEnd: + program.push_back(RegexProgram::LineEnd); + break; + } + + for (auto& offset : goto_end_offsets) + *get_offset(program, offset) = program.size(); + + if (is_repeat(node->quantifier)) + { + program.push_back(RegexProgram::Split); + program.push_back(2); + auto offsets = alloc_offsets(program, 2); + *get_offset(program, offsets, 0) = content_pos; + *get_offset(program, offsets, 1) = program.size(); + } + + if (optional_offset != -1) + *get_offset(program, optional_offset, 1) = program.size(); + + return pos; +} + +Vector compile(const AstNodePtr& node) +{ + Vector res; + compile_node(res, node); + res.push_back(RegexProgram::Match); + return res; +} + +template +Vector compile(Iterator begin, Iterator end) +{ + return compile(Parser::parse(begin, end)); +} +} + +namespace RegexProgram +{ +void dump(ConstArrayView program) +{ + for (size_t pos = 0; pos < program.size(); ) { printf("%4zd ", pos); - switch ((RegexProgram::Op)insts[pos++]) + switch ((RegexProgram::Op)program[pos++]) { case RegexProgram::Literal: - printf("literal %c\n", insts[pos++]); + printf("literal %c\n", program[pos++]); break; case RegexProgram::AnyChar: printf("any char\n"); break; case RegexProgram::Jump: - printf("jump %zd\n", *reinterpret_cast(&insts[pos])); + printf("jump %zd\n", *reinterpret_cast(&program[pos])); pos += sizeof(RegexProgram::Offset); break; case RegexProgram::Split: { - int count = insts[pos++]; + int count = program[pos++]; printf("split ["); for (int i = 0; i < count; ++i) - printf("%zd%s", reinterpret_cast(&insts[pos])[i], + printf("%zd%s", reinterpret_cast(&program[pos])[i], (i == count - 1) ? "]\n" : ", "); pos += count * sizeof(RegexProgram::Offset); break; @@ -296,17 +299,15 @@ void dump_program(const RegexProgram& program) } } -bool regex_match(const RegexProgram& program, StringView data) +struct StepResult { - const char* start = program.instructions.data(); - Vector threads = { start }; + enum Result { Consumed, Matched, Failed } result; + const char* next = nullptr; +}; - struct StepResult - { - enum Result { Consumed, Stepped, Matched, Failed } result; - const char* next = nullptr; - }; - auto step_thread = [&](const char* inst, char c) -> StepResult +StepResult step_thread(const char* inst, char c, const char* start, Vector& threads) +{ + while (true) { const RegexProgram::Op op = (RegexProgram::Op)*inst++; switch (op) @@ -318,42 +319,43 @@ bool regex_match(const RegexProgram& program, StringView data) case RegexProgram::AnyChar: return { StepResult::Consumed, inst }; case RegexProgram::Jump: - return { StepResult::Stepped, start + *reinterpret_cast(inst) }; + inst = start + *reinterpret_cast(inst); + break; case RegexProgram::Split: { const int count = *inst++; auto* offsets = reinterpret_cast(inst); for (int o = 1; o < count; ++o) threads.push_back(start + offsets[o]); - return { StepResult::Stepped, start + offsets[0] }; + inst = start + offsets[0]; + break; } case RegexProgram::LineStart: // TODO - return { StepResult::Stepped, inst }; + break; case RegexProgram::LineEnd: // TODO - return { StepResult::Stepped, inst }; + break; case RegexProgram::Match: return { StepResult::Matched }; } - return { StepResult::Failed }; - }; + } + return { StepResult::Failed }; +} + +bool match(ConstArrayView program, StringView data) +{ + const char* start = program.begin(); + Vector threads = { start }; for (auto c : data) { for (int i = 0; i < threads.size(); ++i) { - while (threads[i]) - { - auto res = step_thread(threads[i], c); - threads[i] = res.next; - - if (res.result == StepResult::Consumed or - res.result == StepResult::Failed) - break; - else if (res.result == StepResult::Matched) - return true; - } + auto res = step_thread(threads[i], c, start, threads); + threads[i] = res.next; + if (res.result == StepResult::Matched) + return true; } threads.erase(std::remove(threads.begin(), threads.end(), nullptr), threads.end()); if (threads.empty()) @@ -363,29 +365,34 @@ bool regex_match(const RegexProgram& program, StringView data) // Step remaining threads to see if they match without consuming anything else for (int i = 0; i < threads.size(); ++i) { - while (threads[i]) - { - auto res = step_thread(threads[i], 0); - threads[i] = res.next; - if (res.result == StepResult::Consumed) - break; - else if (res.result == StepResult::Matched) - return true; - } + if (step_thread(threads[i], 0, start, threads).result == StepResult::Matched) + return true; } return false; } +} auto test_regex = UnitTest{[]{ - StringView re = "^(foo|qux)+(bar)?baz$"; - auto node = RegexCompiler::Parser::parse(re.begin(), re.end()); - kak_assert(node); - auto program = RegexCompiler::compile(node); - dump_program(program); - kak_assert(regex_match(program, "fooquxbarbaz")); - kak_assert(not regex_match(program, "quxbar")); - kak_assert(not regex_match(program, "blahblah")); - kak_assert(regex_match(program, "foobaz")); + { + StringView re = "a*b"; + auto program = RegexCompiler::compile(re.begin(), re.end()); + RegexProgram::dump(program); + kak_assert(RegexProgram::match(program, "b")); + kak_assert(RegexProgram::match(program, "ab")); + kak_assert(RegexProgram::match(program, "aaab")); + kak_assert(not RegexProgram::match(program, "acb")); + kak_assert(not RegexProgram::match(program, "")); + } + { + StringView re = "^(foo|qux)+(bar)?baz$"; + auto program = RegexCompiler::compile(re.begin(), re.end()); + RegexProgram::dump(program); + kak_assert(RegexProgram::match(program, "fooquxbarbaz")); + kak_assert(not RegexProgram::match(program, "quxbar")); + kak_assert(not RegexProgram::match(program, "blahblah")); + kak_assert(RegexProgram::match(program, "foobaz")); + kak_assert(RegexProgram::match(program, "quxbaz")); + } }}; }