diff --git a/src/regex.cc b/src/regex.cc index 77e01d89..749dced8 100644 --- a/src/regex.cc +++ b/src/regex.cc @@ -1,7 +1,7 @@ #include "regex.hh" #include "exception.hh" -#include "regex_impl.hh" +#include "buffer_utils.hh" namespace Kakoune { @@ -11,7 +11,7 @@ using Utf8It = RegexUtf8It; Regex::Regex(StringView re, flag_type flags) try : RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()} { - validate_regex(re); + m_impl = compile_regex(re); } catch (std::runtime_error& err) { throw regex_error(err.what()); } String option_to_string(const Regex& re) @@ -24,4 +24,10 @@ void option_from_string(StringView str, Regex& re) re = Regex{str}; } + +void regex_mismatch(const Regex& re) +{ + write_to_debug_buffer(format("regex mismatch for '{}'", re.str())); +} + } diff --git a/src/regex.hh b/src/regex.hh index 5e8b13fe..cab09267 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -5,6 +5,7 @@ #include "string_utils.hh" #include "exception.hh" #include "utf8_iterator.hh" +#include "regex_impl.hh" #include @@ -35,8 +36,11 @@ public: static constexpr const char* option_type_name = "regex"; + const CompiledRegex& impl() const { return m_impl; } + private: String m_str; + CompiledRegex m_impl; }; template @@ -102,12 +106,39 @@ inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow, (eow ? RegexConstant::match_default : RegexConstant::match_not_eow); } +void regex_mismatch(const Regex& re); + +template +void check_captures(const Regex& re, const MatchResults& res, const Vector& captures) +{ + if (res.size() > captures.size() * 2) + return regex_mismatch(re); + + for (size_t i = 0; i < res.size(); ++i) + { + if (not res[i].matched) + { + if (captures[i*2] != It{} or captures[i*2+1] != It{}) + regex_mismatch(re); + continue; + } + + if (res[i].first != captures[i*2]) + regex_mismatch(re); + if (res[i].second != captures[i*2+1]) + regex_mismatch(re); + } +} + template bool regex_match(It begin, It end, const Regex& re) { try { - return boost::regex_match>({begin, begin, end}, {end, begin, end}, re); + bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, re); + if (re.impl() and matched != regex_match(begin, end, re.impl())) + regex_mismatch(re); + return matched; } catch (std::runtime_error& err) { @@ -120,7 +151,13 @@ bool regex_match(It begin, It end, MatchResults& res, const Regex& re) { try { - return boost::regex_match>({begin, begin, end}, {end, begin, end}, res, re); + bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, res, re); + Vector captures; + if (re.impl() and matched != regex_match(begin, end, captures, re.impl())) + regex_mismatch(re); + if (re.impl() and matched) + check_captures(re, res, captures); + return matched; } catch (std::runtime_error& err) { @@ -134,7 +171,10 @@ bool regex_search(It begin, It end, const Regex& re, { try { - return boost::regex_search>({begin, begin, end}, {end, begin, end}, re, flags); + bool matched = boost::regex_search>({begin, begin, end}, {end, begin, end}, re, flags); + if (re.impl() and matched != regex_search(begin, end, re.impl())) + regex_mismatch(re); + return matched; } catch (std::runtime_error& err) { @@ -148,7 +188,13 @@ bool regex_search(It begin, It end, MatchResults& res, const Regex& re, { try { - return boost::regex_search>({begin, begin, end}, {end, begin, end}, res, re, flags); + bool matched = boost::regex_search>({begin, begin, end}, {end, begin, end}, res, re, flags); + Vector captures; + if (re.impl() and matched != regex_search(begin, end, captures, re.impl())) + regex_mismatch(re); + if (re.impl() and matched) + check_captures(re, res, captures); + return matched; } catch (std::runtime_error& err) { diff --git a/src/regex_impl.cc b/src/regex_impl.cc index ea8b22e5..c8c5bc05 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -497,38 +497,6 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8] { 'H', nullptr, " \t", true }, }; -struct CompiledRegex -{ - enum Op : char - { - Match, - Literal, - LiteralIgnoreCase, - AnyChar, - Matcher, - Jump, - Split_PrioritizeParent, - Split_PrioritizeChild, - Save, - LineStart, - LineEnd, - WordBoundary, - NotWordBoundary, - SubjectBegin, - SubjectEnd, - LookAhead, - LookBehind, - NegativeLookAhead, - NegativeLookBehind, - }; - - using Offset = unsigned; - - Vector bytecode; - Vector> matchers; - size_t save_count; -}; - struct RegexCompiler { RegexCompiler(const ParsedRegex& parsed_regex) @@ -544,7 +512,6 @@ struct RegexCompiler CompiledRegex get_compiled_regex() { return std::move(m_program); } using Offset = CompiledRegex::Offset; - static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset); static CompiledRegex compile(StringView re) { @@ -697,7 +664,7 @@ private: { kak_assert(m_program.bytecode.empty()); push_op(CompiledRegex::Split_PrioritizeChild); - get_offset(alloc_offset()) = search_prefix_size; + get_offset(alloc_offset()) = CompiledRegex::search_prefix_size; push_op(CompiledRegex::AnyChar); push_op(CompiledRegex::Split_PrioritizeParent); get_offset(alloc_offset()) = 1 + sizeof(Offset); @@ -830,230 +797,18 @@ void dump_regex(const CompiledRegex& program) } } -template -struct ThreadedRegexVM -{ - ThreadedRegexVM(const CompiledRegex& program) - : m_program{program} {} - - struct Thread - { - const char* inst; - Vector saves = {}; - }; - - enum class StepResult { Consumed, Matched, Failed }; - StepResult step(size_t thread_index) - { - const auto prog_start = m_program.bytecode.data(); - const auto prog_end = prog_start + m_program.bytecode.size(); - while (true) - { - auto& thread = m_threads[thread_index]; - const Codepoint cp = m_pos == m_end ? 0 : *m_pos; - const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++; - switch (op) - { - case CompiledRegex::Literal: - if (utf8::read_codepoint(thread.inst, prog_end) == cp) - return StepResult::Consumed; - return StepResult::Failed; - case CompiledRegex::LiteralIgnoreCase: - if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp)) - return StepResult::Consumed; - return StepResult::Failed; - case CompiledRegex::AnyChar: - return StepResult::Consumed; - case CompiledRegex::Jump: - { - auto inst = prog_start + *reinterpret_cast(thread.inst); - // if instruction is already going to be executed by another thread, drop this thread - if (std::find_if(m_threads.begin(), m_threads.end(), - [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end()) - return StepResult::Failed; - thread.inst = inst; - break; - } - case CompiledRegex::Split_PrioritizeParent: - { - add_thread(thread_index+1, *reinterpret_cast(thread.inst), thread.saves); - // thread is invalidated now, as we mutated the m_thread vector - m_threads[thread_index].inst += sizeof(CompiledRegex::Offset); - break; - } - case CompiledRegex::Split_PrioritizeChild: - { - add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves); - // thread is invalidated now, as we mutated the m_thread vector - m_threads[thread_index].inst = prog_start + *reinterpret_cast(m_threads[thread_index].inst); - break; - } - case CompiledRegex::Save: - { - const char index = *thread.inst++; - thread.saves[index] = m_pos.base(); - break; - } - case CompiledRegex::Matcher: - { - const int matcher_id = *thread.inst++; - return m_program.matchers[matcher_id](*m_pos) ? - StepResult::Consumed : StepResult::Failed; - } - case CompiledRegex::LineStart: - if (not is_line_start()) - return StepResult::Failed; - break; - case CompiledRegex::LineEnd: - if (not is_line_end()) - return StepResult::Failed; - break; - case CompiledRegex::WordBoundary: - if (not is_word_boundary()) - return StepResult::Failed; - break; - case CompiledRegex::NotWordBoundary: - if (is_word_boundary()) - return StepResult::Failed; - break; - case CompiledRegex::SubjectBegin: - if (m_pos != m_begin) - return StepResult::Failed; - break; - case CompiledRegex::SubjectEnd: - if (m_pos != m_end) - return StepResult::Failed; - break; - case CompiledRegex::LookAhead: - case CompiledRegex::NegativeLookAhead: - { - int count = *thread.inst++; - for (auto it = m_pos; count and it != m_end; ++it, --count) - if (*it != utf8::read(thread.inst)) - break; - if ((op == CompiledRegex::LookAhead and count != 0) or - (op == CompiledRegex::NegativeLookAhead and count == 0)) - return StepResult::Failed; - thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); - break; - } - case CompiledRegex::LookBehind: - case CompiledRegex::NegativeLookBehind: - { - int count = *thread.inst++; - for (auto it = m_pos-1; count and it >= m_begin; --it, --count) - if (*it != utf8::read(thread.inst)) - break; - if ((op == CompiledRegex::LookBehind and count != 0) or - (op == CompiledRegex::NegativeLookBehind and count == 0)) - return StepResult::Failed; - thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); - break; - } - case CompiledRegex::Match: - thread.inst = nullptr; - return StepResult::Matched; - } - } - return StepResult::Failed; - } - - bool exec(StringView data, bool match = true, bool longest = false) - { - bool found_match = false; - m_threads.clear(); - add_thread(0, match ? RegexCompiler::search_prefix_size : 0, - Vector(m_program.save_count, nullptr)); - - m_begin = data.begin(); - m_end = data.end(); - - for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos) - { - for (int i = 0; i < m_threads.size(); ++i) - { - const auto res = step(i); - if (res == StepResult::Matched) - { - if (match) - continue; // We are not at end, this is not a full match - - m_captures = std::move(m_threads[i].saves); - found_match = true; - m_threads.resize(i); // remove this and lower priority threads - if (not longest) - return true; - } - else if (res == StepResult::Failed) - m_threads[i].inst = nullptr; - } - m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(), - [](const Thread& t) { return t.inst == nullptr; }), m_threads.end()); - if (m_threads.empty()) - return found_match; - } - - // Step remaining threads to see if they match without consuming anything else - for (int i = 0; i < m_threads.size(); ++i) - { - if (step(i) == StepResult::Matched) - { - m_captures = std::move(m_threads[i].saves); - found_match = true; - m_threads.resize(i); // remove this and lower priority threads - if (not longest) - return true; - } - } - return found_match; - } - - void add_thread(int index, CompiledRegex::Offset pos, Vector saves) - { - const char* inst = m_program.bytecode.data() + pos; - if (std::find_if(m_threads.begin(), m_threads.end(), - [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end()) - m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)}); - } - - bool is_line_start() const - { - return m_pos == m_begin or *(m_pos-1) == '\n'; - } - - bool is_line_end() const - { - return m_pos == m_end or *m_pos == '\n'; - } - - bool is_word_boundary() const - { - return m_pos == m_begin or m_pos == m_end or - is_word(*(m_pos-1)) != is_word(*m_pos); - } - - const CompiledRegex& m_program; - Vector m_threads; - - using Utf8It = utf8::iterator; - - Iterator m_begin; - Iterator m_end; - Utf8It m_pos; - - Vector m_captures; -}; - -void validate_regex(StringView re) +CompiledRegex compile_regex(StringView re) { + CompiledRegex res; try { - RegexParser{re}; + res = RegexCompiler::compile(re); } catch (runtime_error& err) { write_to_debug_buffer(err.what()); } + return std::move(res); } auto test_regex = UnitTest{[]{ @@ -1064,6 +819,11 @@ auto test_regex = UnitTest{[]{ m_program{RegexCompiler::compile(re)} { if (dump) dump_regex(m_program); } + bool exec(StringView re, bool match = true, bool longest = false) + { + return ThreadedRegexVM::exec(re.begin(), re.end(), match, longest); + } + CompiledRegex m_program; }; diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 8fafcacd..322b60a4 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -1,12 +1,302 @@ #ifndef regex_impl_hh_INCLUDED #define regex_impl_hh_INCLUDED +#include "unicode.hh" +#include "utf8.hh" +#include "utf8_iterator.hh" +#include "vector.hh" + namespace Kakoune { -class StringView; +struct CompiledRegex +{ + enum Op : char + { + Match, + Literal, + LiteralIgnoreCase, + AnyChar, + Matcher, + Jump, + Split_PrioritizeParent, + Split_PrioritizeChild, + Save, + LineStart, + LineEnd, + WordBoundary, + NotWordBoundary, + SubjectBegin, + SubjectEnd, + LookAhead, + LookBehind, + NegativeLookAhead, + NegativeLookBehind, + }; -void validate_regex(StringView re); + using Offset = unsigned; + static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset); + + explicit operator bool() const { return not bytecode.empty(); } + + Vector bytecode; + Vector> matchers; + size_t save_count; +}; + +CompiledRegex compile_regex(StringView re); + +template +struct ThreadedRegexVM +{ + ThreadedRegexVM(const CompiledRegex& program) + : m_program{program} { kak_assert(m_program); } + + struct Thread + { + const char* inst; + Vector saves = {}; + }; + + enum class StepResult { Consumed, Matched, Failed }; + StepResult step(size_t thread_index) + { + const auto prog_start = m_program.bytecode.data(); + const auto prog_end = prog_start + m_program.bytecode.size(); + while (true) + { + auto& thread = m_threads[thread_index]; + const Codepoint cp = m_pos == m_end ? 0 : *m_pos; + const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++; + switch (op) + { + case CompiledRegex::Literal: + if (utf8::read_codepoint(thread.inst, prog_end) == cp) + return StepResult::Consumed; + return StepResult::Failed; + case CompiledRegex::LiteralIgnoreCase: + if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp)) + return StepResult::Consumed; + return StepResult::Failed; + case CompiledRegex::AnyChar: + return StepResult::Consumed; + case CompiledRegex::Jump: + { + auto inst = prog_start + *reinterpret_cast(thread.inst); + // if instruction is already going to be executed by another thread, drop this thread + if (std::find_if(m_threads.begin(), m_threads.end(), + [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end()) + return StepResult::Failed; + thread.inst = inst; + break; + } + case CompiledRegex::Split_PrioritizeParent: + { + add_thread(thread_index+1, *reinterpret_cast(thread.inst), thread.saves); + // thread is invalidated now, as we mutated the m_thread vector + m_threads[thread_index].inst += sizeof(CompiledRegex::Offset); + break; + } + case CompiledRegex::Split_PrioritizeChild: + { + add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves); + // thread is invalidated now, as we mutated the m_thread vector + m_threads[thread_index].inst = prog_start + *reinterpret_cast(m_threads[thread_index].inst); + break; + } + case CompiledRegex::Save: + { + const char index = *thread.inst++; + thread.saves[index] = m_pos.base(); + break; + } + case CompiledRegex::Matcher: + { + const int matcher_id = *thread.inst++; + return m_program.matchers[matcher_id](*m_pos) ? + StepResult::Consumed : StepResult::Failed; + } + case CompiledRegex::LineStart: + if (not is_line_start()) + return StepResult::Failed; + break; + case CompiledRegex::LineEnd: + if (not is_line_end()) + return StepResult::Failed; + break; + case CompiledRegex::WordBoundary: + if (not is_word_boundary()) + return StepResult::Failed; + break; + case CompiledRegex::NotWordBoundary: + if (is_word_boundary()) + return StepResult::Failed; + break; + case CompiledRegex::SubjectBegin: + if (m_pos != m_begin) + return StepResult::Failed; + break; + case CompiledRegex::SubjectEnd: + if (m_pos != m_end) + return StepResult::Failed; + break; + case CompiledRegex::LookAhead: + case CompiledRegex::NegativeLookAhead: + { + int count = *thread.inst++; + for (auto it = m_pos; count and it != m_end; ++it, --count) + if (*it != utf8::read(thread.inst)) + break; + if ((op == CompiledRegex::LookAhead and count != 0) or + (op == CompiledRegex::NegativeLookAhead and count == 0)) + return StepResult::Failed; + thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); + break; + } + case CompiledRegex::LookBehind: + case CompiledRegex::NegativeLookBehind: + { + int count = *thread.inst++; + for (auto it = m_pos-1; count and it >= m_begin; --it, --count) + if (*it != utf8::read(thread.inst)) + break; + if ((op == CompiledRegex::LookBehind and count != 0) or + (op == CompiledRegex::NegativeLookBehind and count == 0)) + return StepResult::Failed; + thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1}); + break; + } + case CompiledRegex::Match: + thread.inst = nullptr; + return StepResult::Matched; + } + } + return StepResult::Failed; + } + + bool exec(Iterator begin, Iterator end, bool match = true, bool longest = false) + { + bool found_match = false; + m_threads.clear(); + add_thread(0, match ? CompiledRegex::search_prefix_size : 0, + Vector(m_program.save_count, Iterator{})); + + m_begin = begin; + m_end = end; + + for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos) + { + for (int i = 0; i < m_threads.size(); ++i) + { + const auto res = step(i); + if (res == StepResult::Matched) + { + if (match) + continue; // We are not at end, this is not a full match + + m_captures = std::move(m_threads[i].saves); + found_match = true; + m_threads.resize(i); // remove this and lower priority threads + if (not longest) + return true; + } + else if (res == StepResult::Failed) + m_threads[i].inst = nullptr; + } + m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(), + [](const Thread& t) { return t.inst == nullptr; }), m_threads.end()); + if (m_threads.empty()) + return found_match; + } + + // Step remaining threads to see if they match without consuming anything else + for (int i = 0; i < m_threads.size(); ++i) + { + if (step(i) == StepResult::Matched) + { + m_captures = std::move(m_threads[i].saves); + found_match = true; + m_threads.resize(i); // remove this and lower priority threads + if (not longest) + return true; + } + } + return found_match; + } + + void add_thread(int index, CompiledRegex::Offset pos, Vector saves) + { + const char* inst = m_program.bytecode.data() + pos; + if (std::find_if(m_threads.begin(), m_threads.end(), + [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end()) + m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)}); + } + + bool is_line_start() const + { + return m_pos == m_begin or *(m_pos-1) == '\n'; + } + + bool is_line_end() const + { + return m_pos == m_end or *m_pos == '\n'; + } + + bool is_word_boundary() const + { + return m_pos == m_begin or m_pos == m_end or + is_word(*(m_pos-1)) != is_word(*m_pos); + } + + const CompiledRegex& m_program; + Vector m_threads; + + using Utf8It = utf8::iterator; + + Iterator m_begin; + Iterator m_end; + Utf8It m_pos; + + Vector m_captures; +}; + +template +bool regex_match(It begin, It end, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + return vm.exec(begin, end, true, false); +} + +template +bool regex_match(It begin, It end, Vector& captures, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + if (vm.exec(begin, end, true, true)) + { + captures = std::move(vm.m_captures); + return true; + } + return false; +} + +template +bool regex_search(It begin, It end, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + return vm.exec(begin, end, false, false); +} + +template +bool regex_search(It begin, It end, Vector& captures, const CompiledRegex& re) +{ + ThreadedRegexVM vm{re}; + if (vm.exec(begin, end, false, true)) + { + captures = std::move(vm.m_captures); + return true; + } + return false; +} }