Regex: validate that our custom impl gets the same results as boost regex

In addition to running boost regex, run our custom regex and compare the results to ensure the two regex engines agree.
2017-10-02 14:59:04 +08:00 · 2017-10-02 14:59:04 +08:00 · f02b2645da
commit f02b2645da
parent 76dcfd5c52
4 changed files with 360 additions and 258 deletions
--- a/src/regex.cc
+++ b/src/regex.cc
@ -1,7 +1,7 @@
 #include "regex.hh"
 #include "exception.hh"
-#include "regex_impl.hh"
+#include "buffer_utils.hh"
 namespace Kakoune
 {
@ -11,7 +11,7 @@ using Utf8It = RegexUtf8It<const char*>;
 Regex::Regex(StringView re, flag_type flags) try
    : RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()}
 {
-    validate_regex(re);
+    m_impl = compile_regex(re);
 } catch (std::runtime_error& err) { throw regex_error(err.what()); }
 String option_to_string(const Regex& re)
@ -24,4 +24,10 @@ void option_from_string(StringView str, Regex& re)
    re = Regex{str};
 }
 // Report, in the debug buffer, that the two regex engines disagreed on `re`.
 void regex_mismatch(const Regex& re)
 {
    const auto message = format("regex mismatch for '{}'", re.str());
    write_to_debug_buffer(message);
 }
 }
--- a/src/regex.hh
+++ b/src/regex.hh
@ -5,6 +5,7 @@
 #include "string_utils.hh"
 #include "exception.hh"
 #include "utf8_iterator.hh"
 #include "regex_impl.hh"
 #include <boost/regex.hpp>
@ -35,8 +36,11 @@ public:
    static constexpr const char* option_type_name = "regex";
    const CompiledRegex& impl() const { return m_impl; }
 private:
    String m_str;
    CompiledRegex m_impl;
 };
 template<typename It>
@ -102,12 +106,39 @@ inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow,
           (eow ? RegexConstant::match_default : RegexConstant::match_not_eow);
 }
 void regex_mismatch(const Regex& re);
 // Compare the sub-matches found by boost (res) with the captures recorded by
 // our own engine (captures) and call regex_mismatch(re) on every disagreement.
 //
 // captures stores two iterators per sub-match: captures[2*i] is the start of
 // sub-match i and captures[2*i+1] its end. A sub-match that did not
 // participate is expected to be left as a pair of default constructed
 // iterators.
 template<typename It>
 void check_captures(const Regex& re, const MatchResults<It>& res, const Vector<It>& captures)
 {
    // The loop below reads captures[2*i+1] for every i < res.size(), so
    // captures must hold at least two entries per boost sub-match. Too few
    // entries is itself a mismatch, and must not lead to an out of bounds read.
    if (captures.size() < res.size() * 2)
        return regex_mismatch(re);
    for (size_t i = 0; i < res.size(); ++i)
    {
        if (not res[i].matched)
        {
            // boost did not capture anything here, neither should we
            if (captures[i*2] != It{} or captures[i*2+1] != It{})
                regex_mismatch(re);
            continue;
        }
        if (res[i].first != captures[i*2])
            regex_mismatch(re);
        if (res[i].second != captures[i*2+1])
            regex_mismatch(re);
    }
 }
 template<typename It>
 bool regex_match(It begin, It end, const Regex& re)
 {
    try
    {
-        return boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
+        bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
        if (re.impl() and matched != regex_match(begin, end, re.impl()))
            regex_mismatch(re);
        return matched;
    }
    catch (std::runtime_error& err)
    {
@ -120,7 +151,13 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
 {
    try
    {
-        return boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
+        bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
        Vector<It> captures;
        if (re.impl() and matched != regex_match(begin, end, captures, re.impl()))
            regex_mismatch(re);
        if (re.impl() and matched)
            check_captures(re, res, captures);
        return matched;
    }
    catch (std::runtime_error& err)
    {
@ -134,7 +171,10 @@ bool regex_search(It begin, It end, const Regex& re,
 {
    try
    {
-        return boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
+        bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
        if (re.impl() and matched != regex_search(begin, end, re.impl()))
            regex_mismatch(re);
        return matched;
    }
    catch (std::runtime_error& err)
    {
@ -148,7 +188,13 @@ bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
 {
    try
    {
-        return boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
+        bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
        Vector<It> captures;
        if (re.impl() and matched != regex_search(begin, end, captures, re.impl()))
            regex_mismatch(re);
        if (re.impl() and matched)
            check_captures(re, res, captures);
        return matched;
    }
    catch (std::runtime_error& err)
    {
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@ -497,38 +497,6 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8]
    { 'H', nullptr, " \t", true },
 };
 // Bytecode program produced by RegexCompiler and interpreted by ThreadedRegexVM.
 struct CompiledRegex
 {
    // Opcodes stored in `bytecode`; some are followed by inline operands.
    enum Op : char
    {
        // the thread reports a match
        Match,
        // followed by an utf8 encoded codepoint, consumes the current
        // codepoint if it is equal to it
        Literal,
        // same as Literal, but compares against to_lower(current codepoint)
        LiteralIgnoreCase,
        // consumes the current codepoint unconditionally
        AnyChar,
        // followed by a one byte index into `matchers`, consumes the current
        // codepoint if that matcher accepts it
        Matcher,
        // followed by an Offset (from the start of the bytecode) to continue at
        Jump,
        // followed by an Offset: the current thread continues after the
        // operand, a new thread placed right after it starts at Offset
        Split_PrioritizeParent,
        // followed by an Offset: the current thread continues at Offset,
        // a new thread placed right after it starts after the operand
        Split_PrioritizeChild,
        // followed by a one byte index, stores the current position in that
        // slot of the thread's saves
        Save,
        // zero width assertions, the thread fails when they do not hold
        LineStart,
        LineEnd,
        WordBoundary,
        NotWordBoundary,
        SubjectBegin,
        SubjectEnd,
        // followed by a one byte character count and that many utf8 encoded
        // characters, compared with the input after (LookAhead) or before
        // (LookBehind) the current position
        LookAhead,
        LookBehind,
        NegativeLookAhead,
        NegativeLookBehind,
    };
    // Type of the jump/split targets embedded in the bytecode
    using Offset = unsigned;
    // Opcodes and their inline operands
    Vector<char> bytecode;
    // Predicates referenced by index from Matcher instructions
    Vector<std::function<bool (Codepoint)>> matchers;
    // Number of save slots each thread is created with
    size_t save_count;
 };
 struct RegexCompiler
 {
    RegexCompiler(const ParsedRegex& parsed_regex)
@ -544,7 +512,6 @@ struct RegexCompiler
    CompiledRegex get_compiled_regex() { return std::move(m_program); }
    using Offset = CompiledRegex::Offset;
    static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
    static CompiledRegex compile(StringView re)
    {
@ -697,7 +664,7 @@ private:
    {
        kak_assert(m_program.bytecode.empty());
        push_op(CompiledRegex::Split_PrioritizeChild);
-        get_offset(alloc_offset()) = search_prefix_size;
+        get_offset(alloc_offset()) = CompiledRegex::search_prefix_size;
        push_op(CompiledRegex::AnyChar);
        push_op(CompiledRegex::Split_PrioritizeParent);
        get_offset(alloc_offset()) = 1 + sizeof(Offset);
@ -830,230 +797,18 @@ void dump_regex(const CompiledRegex& program)
    }
 }
-template<typename Iterator>
+CompiledRegex compile_regex(StringView re)
 struct ThreadedRegexVM
 {
    ThreadedRegexVM(const CompiledRegex& program)
      : m_program{program} {}
    struct Thread
    {
        const char* inst;
        Vector<const char*> saves = {};
    };
    enum class StepResult { Consumed, Matched, Failed };
    StepResult step(size_t thread_index)
    {
        const auto prog_start = m_program.bytecode.data();
        const auto prog_end = prog_start + m_program.bytecode.size();
        while (true)
        {
            auto& thread = m_threads[thread_index];
            const Codepoint cp = m_pos == m_end ? 0 : *m_pos;
            const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
            switch (op)
            {
                case CompiledRegex::Literal:
                    if (utf8::read_codepoint(thread.inst, prog_end) == cp)
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::LiteralIgnoreCase:
                    if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::AnyChar:
                    return StepResult::Consumed;
                case CompiledRegex::Jump:
                {
                    auto inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
                    // if instruction is already going to be executed by another thread, drop this thread
                    if (std::find_if(m_threads.begin(), m_threads.end(),
                                     [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
                        return StepResult::Failed;
                    thread.inst = inst;
                    break;
                }
                case CompiledRegex::Split_PrioritizeParent:
                {
                    add_thread(thread_index+1, *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst), thread.saves);
                    // thread is invalidated now, as we mutated the m_thread vector
                    m_threads[thread_index].inst += sizeof(CompiledRegex::Offset);
                    break;
                }
                case CompiledRegex::Split_PrioritizeChild:
                {
                    add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves);
                    // thread is invalidated now, as we mutated the m_thread vector
                    m_threads[thread_index].inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(m_threads[thread_index].inst);
                    break;
                }
                case CompiledRegex::Save:
                {
                    const char index = *thread.inst++;
                    thread.saves[index] = m_pos.base();
                    break;
                }
                case CompiledRegex::Matcher:
                {
                    const int matcher_id = *thread.inst++;
                    return m_program.matchers[matcher_id](*m_pos) ?
                        StepResult::Consumed : StepResult::Failed;
                }
                case CompiledRegex::LineStart:
                    if (not is_line_start())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::LineEnd:
                    if (not is_line_end())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::WordBoundary:
                    if (not is_word_boundary())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::NotWordBoundary:
                    if (is_word_boundary())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::SubjectBegin:
                    if (m_pos != m_begin)
                        return StepResult::Failed;
                    break;
                case CompiledRegex::SubjectEnd:
                    if (m_pos != m_end)
                        return StepResult::Failed;
                    break;
                case CompiledRegex::LookAhead:
                case CompiledRegex::NegativeLookAhead:
                {
                    int count = *thread.inst++;
                    for (auto it = m_pos; count and it != m_end; ++it, --count)
                        if (*it != utf8::read(thread.inst))
                            break;
                    if ((op == CompiledRegex::LookAhead and count != 0) or
                        (op == CompiledRegex::NegativeLookAhead and count == 0))
                        return StepResult::Failed;
                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
                    break;
                }
                case CompiledRegex::LookBehind:
                case CompiledRegex::NegativeLookBehind:
                {
                    int count = *thread.inst++;
                    for (auto it = m_pos-1; count and it >= m_begin; --it, --count)
                        if (*it != utf8::read(thread.inst))
                            break;
                    if ((op == CompiledRegex::LookBehind and count != 0) or
                        (op == CompiledRegex::NegativeLookBehind and count == 0))
                        return StepResult::Failed;
                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
                    break;
                }
                case CompiledRegex::Match:
                    thread.inst = nullptr;
                    return StepResult::Matched;
            }
        }
        return StepResult::Failed;
    }
    bool exec(StringView data, bool match = true, bool longest = false)
    {
        bool found_match = false;
        m_threads.clear();
        add_thread(0, match ? RegexCompiler::search_prefix_size : 0,
                   Vector<const char*>(m_program.save_count, nullptr));
        m_begin = data.begin();
        m_end = data.end();
        for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
        {
            for (int i = 0; i < m_threads.size(); ++i)
            {
                const auto res = step(i);
                if (res == StepResult::Matched)
                {
                    if (match)
                        continue; // We are not at end, this is not a full match
                    m_captures = std::move(m_threads[i].saves);
                    found_match = true;
                    m_threads.resize(i); // remove this and lower priority threads
                    if (not longest)
                        return true;
                }
                else if (res == StepResult::Failed)
                    m_threads[i].inst = nullptr;
            }
            m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
                                           [](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
            if (m_threads.empty())
                return found_match;
        }
        // Step remaining threads to see if they match without consuming anything else
        for (int i = 0; i < m_threads.size(); ++i)
        {
            if (step(i) == StepResult::Matched)
            {
                m_captures = std::move(m_threads[i].saves);
                found_match = true;
                m_threads.resize(i); // remove this and lower priority threads
                if (not longest)
                    return true;
            }
        }
        return found_match;
    }
    void add_thread(int index, CompiledRegex::Offset pos, Vector<const char*> saves)
    {
        const char* inst = m_program.bytecode.data() + pos;
        if (std::find_if(m_threads.begin(), m_threads.end(),
                         [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end())
            m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)});
    }
    bool is_line_start() const
    {
        return m_pos == m_begin or *(m_pos-1) == '\n';
    }
    bool is_line_end() const
    {
        return m_pos == m_end or *m_pos == '\n';
    }
    bool is_word_boundary() const
    {
        return m_pos == m_begin or m_pos == m_end or
               is_word(*(m_pos-1)) != is_word(*m_pos);
    }
    const CompiledRegex& m_program;
    Vector<Thread> m_threads;
    using Utf8It = utf8::iterator<Iterator>;
    Iterator m_begin;
    Iterator m_end;
    Utf8It m_pos;
    Vector<const char*> m_captures;
 };
 void validate_regex(StringView re)
 {
    CompiledRegex res;
    try
    {
-        RegexParser{re};
+        res = RegexCompiler::compile(re);
    }
    catch (runtime_error& err)
    {
        write_to_debug_buffer(err.what());
    }
    return std::move(res);
 }
 auto test_regex = UnitTest{[]{
@ -1064,6 +819,11 @@ auto test_regex = UnitTest{[]{
              m_program{RegexCompiler::compile(re)}
        { if (dump) dump_regex(m_program); }
        bool exec(StringView re, bool match = true, bool longest = false)
        {
            return ThreadedRegexVM::exec(re.begin(), re.end(), match, longest);
        }
        CompiledRegex m_program;
    };
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@ -1,12 +1,302 @@
 #ifndef regex_impl_hh_INCLUDED
 #define regex_impl_hh_INCLUDED
 #include "unicode.hh"
 #include "utf8.hh"
 #include "utf8_iterator.hh"
 #include "vector.hh"
 namespace Kakoune
 {
-class StringView;
+struct CompiledRegex
 {
    // Bytecode program interpreted by ThreadedRegexVM.
    // Opcodes stored in `bytecode`; some are followed by inline operands.
    enum Op : char
    {
        // the thread reports a match
        Match,
        // followed by an utf8 encoded codepoint, consumes the current
        // codepoint if it is equal to it
        Literal,
        // same as Literal, but compares against to_lower(current codepoint)
        LiteralIgnoreCase,
        // consumes the current codepoint unconditionally
        AnyChar,
        // followed by a one byte index into `matchers`, consumes the current
        // codepoint if that matcher accepts it
        Matcher,
        // followed by an Offset (from the start of the bytecode) to continue at
        Jump,
        // followed by an Offset: the current thread continues after the
        // operand, a new thread placed right after it starts at Offset
        Split_PrioritizeParent,
        // followed by an Offset: the current thread continues at Offset,
        // a new thread placed right after it starts after the operand
        Split_PrioritizeChild,
        // followed by a one byte index, stores the current position in that
        // slot of the thread's saves
        Save,
        // zero width assertions, the thread fails when they do not hold
        LineStart,
        LineEnd,
        WordBoundary,
        NotWordBoundary,
        SubjectBegin,
        SubjectEnd,
        // followed by a one byte character count and that many utf8 encoded
        // characters, compared with the input after (LookAhead) or before
        // (LookBehind) the current position
        LookAhead,
        LookBehind,
        NegativeLookAhead,
        NegativeLookBehind,
    };
-void validate_regex(StringView re);
+    using Offset = unsigned;
    // Size of the prefix emitted at the start of the bytecode (3 opcodes and
    // 2 offsets); ThreadedRegexVM::exec starts after it when a full match is
    // requested and at offset 0 otherwise.
    static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
    // True when the program contains bytecode
    explicit operator bool() const { return not bytecode.empty(); }
    // Opcodes and their inline operands
    Vector<char> bytecode;
    // Predicates referenced by index from Matcher instructions
    Vector<std::function<bool (Codepoint)>> matchers;
    // Number of save slots each thread is created with
    size_t save_count;
 };
 CompiledRegex compile_regex(StringView re);
 template<typename Iterator>
 struct ThreadedRegexVM
 {
    ThreadedRegexVM(const CompiledRegex& program)
      : m_program{program} { kak_assert(m_program); }
    struct Thread
    {
        const char* inst;
        Vector<Iterator> saves = {};
    };
    enum class StepResult { Consumed, Matched, Failed };
    StepResult step(size_t thread_index)
    {
        const auto prog_start = m_program.bytecode.data();
        const auto prog_end = prog_start + m_program.bytecode.size();
        while (true)
        {
            auto& thread = m_threads[thread_index];
            const Codepoint cp = m_pos == m_end ? 0 : *m_pos;
            const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
            switch (op)
            {
                case CompiledRegex::Literal:
                    if (utf8::read_codepoint(thread.inst, prog_end) == cp)
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::LiteralIgnoreCase:
                    if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::AnyChar:
                    return StepResult::Consumed;
                case CompiledRegex::Jump:
                {
                    auto inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
                    // if instruction is already going to be executed by another thread, drop this thread
                    if (std::find_if(m_threads.begin(), m_threads.end(),
                                     [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
                        return StepResult::Failed;
                    thread.inst = inst;
                    break;
                }
                case CompiledRegex::Split_PrioritizeParent:
                {
                    add_thread(thread_index+1, *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst), thread.saves);
                    // thread is invalidated now, as we mutated the m_thread vector
                    m_threads[thread_index].inst += sizeof(CompiledRegex::Offset);
                    break;
                }
                case CompiledRegex::Split_PrioritizeChild:
                {
                    add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves);
                    // thread is invalidated now, as we mutated the m_thread vector
                    m_threads[thread_index].inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(m_threads[thread_index].inst);
                    break;
                }
                case CompiledRegex::Save:
                {
                    const char index = *thread.inst++;
                    thread.saves[index] = m_pos.base();
                    break;
                }
                case CompiledRegex::Matcher:
                {
                    const int matcher_id = *thread.inst++;
                    return m_program.matchers[matcher_id](*m_pos) ?
                        StepResult::Consumed : StepResult::Failed;
                }
                case CompiledRegex::LineStart:
                    if (not is_line_start())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::LineEnd:
                    if (not is_line_end())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::WordBoundary:
                    if (not is_word_boundary())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::NotWordBoundary:
                    if (is_word_boundary())
                        return StepResult::Failed;
                    break;
                case CompiledRegex::SubjectBegin:
                    if (m_pos != m_begin)
                        return StepResult::Failed;
                    break;
                case CompiledRegex::SubjectEnd:
                    if (m_pos != m_end)
                        return StepResult::Failed;
                    break;
                case CompiledRegex::LookAhead:
                case CompiledRegex::NegativeLookAhead:
                {
                    int count = *thread.inst++;
                    for (auto it = m_pos; count and it != m_end; ++it, --count)
                        if (*it != utf8::read(thread.inst))
                            break;
                    if ((op == CompiledRegex::LookAhead and count != 0) or
                        (op == CompiledRegex::NegativeLookAhead and count == 0))
                        return StepResult::Failed;
                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
                    break;
                }
                case CompiledRegex::LookBehind:
                case CompiledRegex::NegativeLookBehind:
                {
                    int count = *thread.inst++;
                    for (auto it = m_pos-1; count and it >= m_begin; --it, --count)
                        if (*it != utf8::read(thread.inst))
                            break;
                    if ((op == CompiledRegex::LookBehind and count != 0) or
                        (op == CompiledRegex::NegativeLookBehind and count == 0))
                        return StepResult::Failed;
                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
                    break;
                }
                case CompiledRegex::Match:
                    thread.inst = nullptr;
                    return StepResult::Matched;
            }
        }
        return StepResult::Failed;
    }
    bool exec(Iterator begin, Iterator end, bool match = true, bool longest = false)
    {
        bool found_match = false;
        m_threads.clear();
        add_thread(0, match ? CompiledRegex::search_prefix_size : 0,
                   Vector<Iterator>(m_program.save_count, Iterator{}));
        m_begin = begin;
        m_end = end;
        for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
        {
            for (int i = 0; i < m_threads.size(); ++i)
            {
                const auto res = step(i);
                if (res == StepResult::Matched)
                {
                    if (match)
                        continue; // We are not at end, this is not a full match
                    m_captures = std::move(m_threads[i].saves);
                    found_match = true;
                    m_threads.resize(i); // remove this and lower priority threads
                    if (not longest)
                        return true;
                }
                else if (res == StepResult::Failed)
                    m_threads[i].inst = nullptr;
            }
            m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
                                           [](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
            if (m_threads.empty())
                return found_match;
        }
        // Step remaining threads to see if they match without consuming anything else
        for (int i = 0; i < m_threads.size(); ++i)
        {
            if (step(i) == StepResult::Matched)
            {
                m_captures = std::move(m_threads[i].saves);
                found_match = true;
                m_threads.resize(i); // remove this and lower priority threads
                if (not longest)
                    return true;
            }
        }
        return found_match;
    }
    void add_thread(int index, CompiledRegex::Offset pos, Vector<Iterator> saves)
    {
        const char* inst = m_program.bytecode.data() + pos;
        if (std::find_if(m_threads.begin(), m_threads.end(),
                         [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end())
            m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)});
    }
    bool is_line_start() const
    {
        return m_pos == m_begin or *(m_pos-1) == '\n';
    }
    bool is_line_end() const
    {
        return m_pos == m_end or *m_pos == '\n';
    }
    bool is_word_boundary() const
    {
        return m_pos == m_begin or m_pos == m_end or
               is_word(*(m_pos-1)) != is_word(*m_pos);
    }
    const CompiledRegex& m_program;
    Vector<Thread> m_threads;
    using Utf8It = utf8::iterator<Iterator>;
    Iterator m_begin;
    Iterator m_end;
    Utf8It m_pos;
    Vector<Iterator> m_captures;
 };
 template<typename It>
 bool regex_match(It begin, It end, const CompiledRegex& re)
 {
    ThreadedRegexVM<It> vm{re};
    return vm.exec(begin, end, true, false);
 }
 template<typename It>
 bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re)
 {
    ThreadedRegexVM<It> vm{re};
    if (vm.exec(begin, end, true, true))
    {
        captures = std::move(vm.m_captures);
        return true;
    }
    return false;
 }
 template<typename It>
 bool regex_search(It begin, It end, const CompiledRegex& re)
 {
    ThreadedRegexVM<It> vm{re};
    return vm.exec(begin, end, false, false);
 }
 template<typename It>
 bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re)
 {
    ThreadedRegexVM<It> vm{re};
    if (vm.exec(begin, end, false, true))
    {
        captures = std::move(vm.m_captures);
        return true;
    }
    return false;
 }
 }