From f02b2645dacc8788d624e94994584e9d4f31204e Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Mon, 2 Oct 2017 14:59:04 +0800 Subject: [PATCH] Regex: validate that our custom impl gets the same results as boost regex In addition to running boost regex, run our custom regex and compare the results to ensure the two regex engines agree. --- src/regex.cc | 10 +- src/regex.hh | 54 ++++++++- src/regex_impl.cc | 260 ++-------------------------------------- src/regex_impl.hh | 294 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 360 insertions(+), 258 deletions(-) diff --git a/src/regex.cc b/src/regex.cc index 77e01d89..749dced8 100644 --- a/src/regex.cc +++ b/src/regex.cc @@ -1,7 +1,7 @@ #include "regex.hh" #include "exception.hh" -#include "regex_impl.hh" +#include "buffer_utils.hh" namespace Kakoune { @@ -11,7 +11,7 @@ using Utf8It = RegexUtf8It; Regex::Regex(StringView re, flag_type flags) try : RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()} { - validate_regex(re); + m_impl = compile_regex(re); } catch (std::runtime_error& err) { throw regex_error(err.what()); } String option_to_string(const Regex& re) @@ -24,4 +24,10 @@ void option_from_string(StringView str, Regex& re) re = Regex{str}; } + +void regex_mismatch(const Regex& re) +{ + write_to_debug_buffer(format("regex mismatch for '{}'", re.str())); +} + } diff --git a/src/regex.hh b/src/regex.hh index 5e8b13fe..cab09267 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -5,6 +5,7 @@ #include "string_utils.hh" #include "exception.hh" #include "utf8_iterator.hh" +#include "regex_impl.hh" #include @@ -35,8 +36,11 @@ public: static constexpr const char* option_type_name = "regex"; + const CompiledRegex& impl() const { return m_impl; } + private: String m_str; + CompiledRegex m_impl; }; template @@ -102,12 +106,39 @@ inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow, (eow ? 
RegexConstant::match_default : RegexConstant::match_not_eow); } +void regex_mismatch(const Regex& re); + +template +void check_captures(const Regex& re, const MatchResults& res, const Vector& captures) +{ + if (res.size() > captures.size() * 2) + return regex_mismatch(re); + + for (size_t i = 0; i < res.size(); ++i) + { + if (not res[i].matched) + { + if (captures[i*2] != It{} or captures[i*2+1] != It{}) + regex_mismatch(re); + continue; + } + + if (res[i].first != captures[i*2]) + regex_mismatch(re); + if (res[i].second != captures[i*2+1]) + regex_mismatch(re); + } +} + template bool regex_match(It begin, It end, const Regex& re) { try { - return boost::regex_match>({begin, begin, end}, {end, begin, end}, re); + bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, re); + if (re.impl() and matched != regex_match(begin, end, re.impl())) + regex_mismatch(re); + return matched; } catch (std::runtime_error& err) { @@ -120,7 +151,13 @@ bool regex_match(It begin, It end, MatchResults& res, const Regex& re) { try { - return boost::regex_match>({begin, begin, end}, {end, begin, end}, res, re); + bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, res, re); + Vector captures; + if (re.impl() and matched != regex_match(begin, end, captures, re.impl())) + regex_mismatch(re); + if (re.impl() and matched) + check_captures(re, res, captures); + return matched; } catch (std::runtime_error& err) { @@ -134,7 +171,10 @@ bool regex_search(It begin, It end, const Regex& re, { try { - return boost::regex_search>({begin, begin, end}, {end, begin, end}, re, flags); + bool matched = boost::regex_search>({begin, begin, end}, {end, begin, end}, re, flags); + if (re.impl() and matched != regex_search(begin, end, re.impl())) + regex_mismatch(re); + return matched; } catch (std::runtime_error& err) { @@ -148,7 +188,13 @@ bool regex_search(It begin, It end, MatchResults& res, const Regex& re, { try { - return boost::regex_search>({begin, begin, end}, 
{end, begin, end}, res, re, flags); + bool matched = boost::regex_search>({begin, begin, end}, {end, begin, end}, res, re, flags); + Vector captures; + if (re.impl() and matched != regex_search(begin, end, captures, re.impl())) + regex_mismatch(re); + if (re.impl() and matched) + check_captures(re, res, captures); + return matched; } catch (std::runtime_error& err) { diff --git a/src/regex_impl.cc b/src/regex_impl.cc index ea8b22e5..c8c5bc05 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -497,38 +497,6 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8] { 'H', nullptr, " \t", true }, }; -struct CompiledRegex -{ - enum Op : char - { - Match, - Literal, - LiteralIgnoreCase, - AnyChar, - Matcher, - Jump, - Split_PrioritizeParent, - Split_PrioritizeChild, - Save, - LineStart, - LineEnd, - WordBoundary, - NotWordBoundary, - SubjectBegin, - SubjectEnd, - LookAhead, - LookBehind, - NegativeLookAhead, - NegativeLookBehind, - }; - - using Offset = unsigned; - - Vector bytecode; - Vector> matchers; - size_t save_count; -}; - struct RegexCompiler { RegexCompiler(const ParsedRegex& parsed_regex) @@ -544,7 +512,6 @@ struct RegexCompiler CompiledRegex get_compiled_regex() { return std::move(m_program); } using Offset = CompiledRegex::Offset; - static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset); static CompiledRegex compile(StringView re) { @@ -697,7 +664,7 @@ private: { kak_assert(m_program.bytecode.empty()); push_op(CompiledRegex::Split_PrioritizeChild); - get_offset(alloc_offset()) = search_prefix_size; + get_offset(alloc_offset()) = CompiledRegex::search_prefix_size; push_op(CompiledRegex::AnyChar); push_op(CompiledRegex::Split_PrioritizeParent); get_offset(alloc_offset()) = 1 + sizeof(Offset); @@ -830,230 +797,18 @@ void dump_regex(const CompiledRegex& program) } } -template -struct ThreadedRegexVM -{ - ThreadedRegexVM(const CompiledRegex& program) - : m_program{program} {} - - struct Thread - { - const char* inst; - 
Vector saves = {}; - }; - - enum class StepResult { Consumed, Matched, Failed }; - StepResult step(size_t thread_index) - { - const auto prog_start = m_program.bytecode.data(); - const auto prog_end = prog_start + m_program.bytecode.size(); - while (true) - { - auto& thread = m_threads[thread_index]; - const Codepoint cp = m_pos == m_end ? 0 : *m_pos; - const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++; - switch (op) - { - case CompiledRegex::Literal: - if (utf8::read_codepoint(thread.inst, prog_end) == cp) - return StepResult::Consumed; - return StepResult::Failed; - case CompiledRegex::LiteralIgnoreCase: - if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp)) - return StepResult::Consumed; - return StepResult::Failed; - case CompiledRegex::AnyChar: - return StepResult::Consumed; - case CompiledRegex::Jump: - { - auto inst = prog_start + *reinterpret_cast(thread.inst); - // if instruction is already going to be executed by another thread, drop this thread - if (std::find_if(m_threads.begin(), m_threads.end(), - [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end()) - return StepResult::Failed; - thread.inst = inst; - break; - } - case CompiledRegex::Split_PrioritizeParent: - { - add_thread(thread_index+1, *reinterpret_cast(thread.inst), thread.saves); - // thread is invalidated now, as we mutated the m_thread vector - m_threads[thread_index].inst += sizeof(CompiledRegex::Offset); - break; - } - case CompiledRegex::Split_PrioritizeChild: - { - add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves); - // thread is invalidated now, as we mutated the m_thread vector - m_threads[thread_index].inst = prog_start + *reinterpret_cast(m_threads[thread_index].inst); - break; - } - case CompiledRegex::Save: - { - const char index = *thread.inst++; - thread.saves[index] = m_pos.base(); - break; - } - case CompiledRegex::Matcher: - { - const int matcher_id = *thread.inst++; - return 
m_program.matchers[matcher_id](*m_pos) ? - StepResult::Consumed : StepResult::Failed; - } - case CompiledRegex::LineStart: - if (not is_line_start()) - return StepResult::Failed; - break; - case CompiledRegex::LineEnd: - if (not is_line_end()) - return StepResult::Failed; - break; - case CompiledRegex::WordBoundary: - if (not is_word_boundary()) - return StepResult::Failed; - break; - case CompiledRegex::NotWordBoundary: - if (is_word_boundary()) - return StepResult::Failed; - break; - case CompiledRegex::SubjectBegin: - if (m_pos != m_begin) - return StepResult::Failed; - break; - case CompiledRegex::SubjectEnd: - if (m_pos != m_end) - return StepResult::Failed; - break; - case CompiledRegex::LookAhead: - case CompiledRegex::NegativeLookAhead: - { - int count = *thread.inst++; - for (auto it = m_pos; count and it != m_end; ++it, --count) - if (*it != utf8::read(thread.inst)) - break; - if ((op == CompiledRegex::LookAhead and count != 0) or - (op == CompiledRegex::NegativeLookAhead and count == 0)) - return StepResult::Failed; - thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); - break; - } - case CompiledRegex::LookBehind: - case CompiledRegex::NegativeLookBehind: - { - int count = *thread.inst++; - for (auto it = m_pos-1; count and it >= m_begin; --it, --count) - if (*it != utf8::read(thread.inst)) - break; - if ((op == CompiledRegex::LookBehind and count != 0) or - (op == CompiledRegex::NegativeLookBehind and count == 0)) - return StepResult::Failed; - thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); - break; - } - case CompiledRegex::Match: - thread.inst = nullptr; - return StepResult::Matched; - } - } - return StepResult::Failed; - } - - bool exec(StringView data, bool match = true, bool longest = false) - { - bool found_match = false; - m_threads.clear(); - add_thread(0, match ? 
RegexCompiler::search_prefix_size : 0, - Vector(m_program.save_count, nullptr)); - - m_begin = data.begin(); - m_end = data.end(); - - for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos) - { - for (int i = 0; i < m_threads.size(); ++i) - { - const auto res = step(i); - if (res == StepResult::Matched) - { - if (match) - continue; // We are not at end, this is not a full match - - m_captures = std::move(m_threads[i].saves); - found_match = true; - m_threads.resize(i); // remove this and lower priority threads - if (not longest) - return true; - } - else if (res == StepResult::Failed) - m_threads[i].inst = nullptr; - } - m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(), - [](const Thread& t) { return t.inst == nullptr; }), m_threads.end()); - if (m_threads.empty()) - return found_match; - } - - // Step remaining threads to see if they match without consuming anything else - for (int i = 0; i < m_threads.size(); ++i) - { - if (step(i) == StepResult::Matched) - { - m_captures = std::move(m_threads[i].saves); - found_match = true; - m_threads.resize(i); // remove this and lower priority threads - if (not longest) - return true; - } - } - return found_match; - } - - void add_thread(int index, CompiledRegex::Offset pos, Vector saves) - { - const char* inst = m_program.bytecode.data() + pos; - if (std::find_if(m_threads.begin(), m_threads.end(), - [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end()) - m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)}); - } - - bool is_line_start() const - { - return m_pos == m_begin or *(m_pos-1) == '\n'; - } - - bool is_line_end() const - { - return m_pos == m_end or *m_pos == '\n'; - } - - bool is_word_boundary() const - { - return m_pos == m_begin or m_pos == m_end or - is_word(*(m_pos-1)) != is_word(*m_pos); - } - - const CompiledRegex& m_program; - Vector m_threads; - - using Utf8It = utf8::iterator; - - Iterator m_begin; - Iterator m_end; - Utf8It m_pos; - - 
Vector m_captures; -}; - -void validate_regex(StringView re) +CompiledRegex compile_regex(StringView re) { + CompiledRegex res; try { - RegexParser{re}; + res = RegexCompiler::compile(re); } catch (runtime_error& err) { write_to_debug_buffer(err.what()); } + return std::move(res); } auto test_regex = UnitTest{[]{ @@ -1064,6 +819,11 @@ auto test_regex = UnitTest{[]{ m_program{RegexCompiler::compile(re)} { if (dump) dump_regex(m_program); } + bool exec(StringView re, bool match = true, bool longest = false) + { + return ThreadedRegexVM::exec(re.begin(), re.end(), match, longest); + } + CompiledRegex m_program; }; diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 8fafcacd..322b60a4 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -1,12 +1,302 @@ #ifndef regex_impl_hh_INCLUDED #define regex_impl_hh_INCLUDED +#include "unicode.hh" +#include "utf8.hh" +#include "utf8_iterator.hh" +#include "vector.hh" + namespace Kakoune { -class StringView; +struct CompiledRegex +{ + enum Op : char + { + Match, + Literal, + LiteralIgnoreCase, + AnyChar, + Matcher, + Jump, + Split_PrioritizeParent, + Split_PrioritizeChild, + Save, + LineStart, + LineEnd, + WordBoundary, + NotWordBoundary, + SubjectBegin, + SubjectEnd, + LookAhead, + LookBehind, + NegativeLookAhead, + NegativeLookBehind, + }; -void validate_regex(StringView re); + using Offset = unsigned; + static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset); + + explicit operator bool() const { return not bytecode.empty(); } + + Vector bytecode; + Vector> matchers; + size_t save_count; +}; + +CompiledRegex compile_regex(StringView re); + +template +struct ThreadedRegexVM +{ + ThreadedRegexVM(const CompiledRegex& program) + : m_program{program} { kak_assert(m_program); } + + struct Thread + { + const char* inst; + Vector saves = {}; + }; + + enum class StepResult { Consumed, Matched, Failed }; + StepResult step(size_t thread_index) + { + const auto prog_start = m_program.bytecode.data(); + const auto 
prog_end = prog_start + m_program.bytecode.size(); + while (true) + { + auto& thread = m_threads[thread_index]; + const Codepoint cp = m_pos == m_end ? 0 : *m_pos; + const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++; + switch (op) + { + case CompiledRegex::Literal: + if (utf8::read_codepoint(thread.inst, prog_end) == cp) + return StepResult::Consumed; + return StepResult::Failed; + case CompiledRegex::LiteralIgnoreCase: + if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp)) + return StepResult::Consumed; + return StepResult::Failed; + case CompiledRegex::AnyChar: + return StepResult::Consumed; + case CompiledRegex::Jump: + { + auto inst = prog_start + *reinterpret_cast(thread.inst); + // if instruction is already going to be executed by another thread, drop this thread + if (std::find_if(m_threads.begin(), m_threads.end(), + [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end()) + return StepResult::Failed; + thread.inst = inst; + break; + } + case CompiledRegex::Split_PrioritizeParent: + { + add_thread(thread_index+1, *reinterpret_cast(thread.inst), thread.saves); + // thread is invalidated now, as we mutated the m_thread vector + m_threads[thread_index].inst += sizeof(CompiledRegex::Offset); + break; + } + case CompiledRegex::Split_PrioritizeChild: + { + add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves); + // thread is invalidated now, as we mutated the m_thread vector + m_threads[thread_index].inst = prog_start + *reinterpret_cast(m_threads[thread_index].inst); + break; + } + case CompiledRegex::Save: + { + const char index = *thread.inst++; + thread.saves[index] = m_pos.base(); + break; + } + case CompiledRegex::Matcher: + { + const int matcher_id = *thread.inst++; + return m_program.matchers[matcher_id](*m_pos) ? 
+ StepResult::Consumed : StepResult::Failed; + } + case CompiledRegex::LineStart: + if (not is_line_start()) + return StepResult::Failed; + break; + case CompiledRegex::LineEnd: + if (not is_line_end()) + return StepResult::Failed; + break; + case CompiledRegex::WordBoundary: + if (not is_word_boundary()) + return StepResult::Failed; + break; + case CompiledRegex::NotWordBoundary: + if (is_word_boundary()) + return StepResult::Failed; + break; + case CompiledRegex::SubjectBegin: + if (m_pos != m_begin) + return StepResult::Failed; + break; + case CompiledRegex::SubjectEnd: + if (m_pos != m_end) + return StepResult::Failed; + break; + case CompiledRegex::LookAhead: + case CompiledRegex::NegativeLookAhead: + { + int count = *thread.inst++; + for (auto it = m_pos; count and it != m_end; ++it, --count) + if (*it != utf8::read(thread.inst)) + break; + if ((op == CompiledRegex::LookAhead and count != 0) or + (op == CompiledRegex::NegativeLookAhead and count == 0)) + return StepResult::Failed; + thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); + break; + } + case CompiledRegex::LookBehind: + case CompiledRegex::NegativeLookBehind: + { + int count = *thread.inst++; + for (auto it = m_pos-1; count and it >= m_begin; --it, --count) + if (*it != utf8::read(thread.inst)) + break; + if ((op == CompiledRegex::LookBehind and count != 0) or + (op == CompiledRegex::NegativeLookBehind and count == 0)) + return StepResult::Failed; + thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); + break; + } + case CompiledRegex::Match: + thread.inst = nullptr; + return StepResult::Matched; + } + } + return StepResult::Failed; + } + + bool exec(Iterator begin, Iterator end, bool match = true, bool longest = false) + { + bool found_match = false; + m_threads.clear(); + add_thread(0, match ? 
CompiledRegex::search_prefix_size : 0, + Vector(m_program.save_count, Iterator{})); + + m_begin = begin; + m_end = end; + + for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos) + { + for (int i = 0; i < m_threads.size(); ++i) + { + const auto res = step(i); + if (res == StepResult::Matched) + { + if (match) + continue; // We are not at end, this is not a full match + + m_captures = std::move(m_threads[i].saves); + found_match = true; + m_threads.resize(i); // remove this and lower priority threads + if (not longest) + return true; + } + else if (res == StepResult::Failed) + m_threads[i].inst = nullptr; + } + m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(), + [](const Thread& t) { return t.inst == nullptr; }), m_threads.end()); + if (m_threads.empty()) + return found_match; + } + + // Step remaining threads to see if they match without consuming anything else + for (int i = 0; i < m_threads.size(); ++i) + { + if (step(i) == StepResult::Matched) + { + m_captures = std::move(m_threads[i].saves); + found_match = true; + m_threads.resize(i); // remove this and lower priority threads + if (not longest) + return true; + } + } + return found_match; + } + + void add_thread(int index, CompiledRegex::Offset pos, Vector saves) + { + const char* inst = m_program.bytecode.data() + pos; + if (std::find_if(m_threads.begin(), m_threads.end(), + [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end()) + m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)}); + } + + bool is_line_start() const + { + return m_pos == m_begin or *(m_pos-1) == '\n'; + } + + bool is_line_end() const + { + return m_pos == m_end or *m_pos == '\n'; + } + + bool is_word_boundary() const + { + return m_pos == m_begin or m_pos == m_end or + is_word(*(m_pos-1)) != is_word(*m_pos); + } + + const CompiledRegex& m_program; + Vector m_threads; + + using Utf8It = utf8::iterator; + + Iterator m_begin; + Iterator m_end; + Utf8It m_pos; + + Vector 
m_captures; +}; + +template +bool regex_match(It begin, It end, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + return vm.exec(begin, end, true, false); +} + +template +bool regex_match(It begin, It end, Vector& captures, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + if (vm.exec(begin, end, true, true)) + { + captures = std::move(vm.m_captures); + return true; + } + return false; +} + +template +bool regex_search(It begin, It end, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + return vm.exec(begin, end, false, false); +} + +template +bool regex_search(It begin, It end, Vector& captures, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + if (vm.exec(begin, end, false, true)) + { + captures = std::move(vm.m_captures); + return true; + } + return false; +} }