From f02b2645dacc8788d624e94994584e9d4f31204e Mon Sep 17 00:00:00 2001
From: Maxime Coste <mawww@kakoune.org>
Date: Mon, 2 Oct 2017 14:59:04 +0800
Subject: [PATCH] Regex: validate that our custom impl gets the same results as
 boost regex

In addition to running boost regex, run our custom regex and compare
the results to ensure the two regex engines agree.
---
 src/regex.cc      |  10 +-
 src/regex.hh      |  54 ++++++++-
 src/regex_impl.cc | 260 ++--------------------------------------
 src/regex_impl.hh | 294 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 360 insertions(+), 258 deletions(-)
diff --git a/src/regex.cc b/src/regex.cc
index 77e01d89..749dced8 100644
--- a/src/regex.cc
+++ b/src/regex.cc
@@ -1,7 +1,7 @@
 #include "regex.hh"
 
 #include "exception.hh"
-#include "regex_impl.hh"
+#include "buffer_utils.hh"
 
 namespace Kakoune
 {
@@ -11,7 +11,7 @@ using Utf8It = RegexUtf8It<const char*>;
 Regex::Regex(StringView re, flag_type flags) try
     : RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()}
 {
-    validate_regex(re);
+    m_impl = compile_regex(re); // m_impl may stay empty: compile_regex only logs the runtime_error it catches
 } catch (std::runtime_error& err) { throw regex_error(err.what()); }
 
 String option_to_string(const Regex& re)
@@ -24,4 +24,10 @@ void option_from_string(StringView str, Regex& re)
     re = Regex{str};
 }
 
+
+void regex_mismatch(const Regex& re) // called when boost and the custom engine disagree; writes the pattern to the debug buffer
+{
+    write_to_debug_buffer(format("regex mismatch for '{}'", re.str()));
+}
+
 }
diff --git a/src/regex.hh b/src/regex.hh
index 5e8b13fe..cab09267 100644
--- a/src/regex.hh
+++ b/src/regex.hh
@@ -5,6 +5,7 @@
 #include "string_utils.hh"
 #include "exception.hh"
 #include "utf8_iterator.hh"
+#include "regex_impl.hh"
 
 #include <boost/regex.hpp>
 
@@ -35,8 +36,11 @@ public:
 
     static constexpr const char* option_type_name = "regex";
 
+    const CompiledRegex& impl() const { return m_impl; }
+
 private:
     String m_str;
+    CompiledRegex m_impl;
 };
 
 template<typename It>
@@ -102,12 +106,39 @@ inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow,
            (eow ? RegexConstant::match_default : RegexConstant::match_not_eow);
 }
 
+void regex_mismatch(const Regex& re);
+
+template<typename It> // compares boost's sub-matches in res with the custom engine's captures, reporting any difference
+void check_captures(const Regex& re, const MatchResults<It>& res, const Vector<It>& captures)
+{
+    if (res.size() * 2 > captures.size()) // captures is read as one (begin, end) pair per sub-match below
+        return regex_mismatch(re);
+
+    for (size_t i = 0; i < res.size(); ++i)
+    {
+        if (not res[i].matched)
+        {
+            if (captures[i*2] != It{} or captures[i*2+1] != It{}) // an unmatched group must have left both slots at It{}
+                regex_mismatch(re);
+            continue;
+        }
+
+        if (res[i].first != captures[i*2])
+            regex_mismatch(re);
+        if (res[i].second != captures[i*2+1])
+            regex_mismatch(re);
+    }
+}
+
 template<typename It>
 bool regex_match(It begin, It end, const Regex& re)
 {
     try
     {
-        return boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
+        bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
+        if (re.impl() and matched != regex_match(begin, end, re.impl()))
+            regex_mismatch(re);
+        return matched;
     }
     catch (std::runtime_error& err)
     {
@@ -120,7 +151,13 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
 {
     try
     {
-        return boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
+        bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
+        Vector<It> captures;
+        if (re.impl() and matched != regex_match(begin, end, captures, re.impl()))
+            regex_mismatch(re);
+        if (re.impl() and matched)
+            check_captures(re, res, captures);
+        return matched;
     }
     catch (std::runtime_error& err)
     {
@@ -134,7 +171,10 @@ bool regex_search(It begin, It end, const Regex& re,
 {
     try
     {
-        return boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
+        bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
+        if (re.impl() and matched != regex_search(begin, end, re.impl()))
+            regex_mismatch(re);
+        return matched;
     }
     catch (std::runtime_error& err)
     {
@@ -148,7 +188,13 @@ bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
 {
     try
     {
-        return boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
+        bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
+        Vector<It> captures;
+        if (re.impl() and matched != regex_search(begin, end, captures, re.impl()))
+            regex_mismatch(re);
+        if (re.impl() and matched)
+            check_captures(re, res, captures);
+        return matched;
     }
     catch (std::runtime_error& err)
     {
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index ea8b22e5..c8c5bc05 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -497,38 +497,6 @@ const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8]
     { 'H', nullptr, " \t", true },
 };
 
-struct CompiledRegex
-{
-    enum Op : char
-    {
-        Match,
-        Literal,
-        LiteralIgnoreCase,
-        AnyChar,
-        Matcher,
-        Jump,
-        Split_PrioritizeParent,
-        Split_PrioritizeChild,
-        Save,
-        LineStart,
-        LineEnd,
-        WordBoundary,
-        NotWordBoundary,
-        SubjectBegin,
-        SubjectEnd,
-        LookAhead,
-        LookBehind,
-        NegativeLookAhead,
-        NegativeLookBehind,
-    };
-
-    using Offset = unsigned;
-
-    Vector<char> bytecode;
-    Vector<std::function<bool (Codepoint)>> matchers;
-    size_t save_count;
-};
-
 struct RegexCompiler
 {
     RegexCompiler(const ParsedRegex& parsed_regex)
@@ -544,7 +512,6 @@ struct RegexCompiler
     CompiledRegex get_compiled_regex() { return std::move(m_program); }
 
     using Offset = CompiledRegex::Offset;
-    static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
 
     static CompiledRegex compile(StringView re)
     {
@@ -697,7 +664,7 @@ private:
     {
         kak_assert(m_program.bytecode.empty());
         push_op(CompiledRegex::Split_PrioritizeChild);
-        get_offset(alloc_offset()) = search_prefix_size;
+        get_offset(alloc_offset()) = CompiledRegex::search_prefix_size;
         push_op(CompiledRegex::AnyChar);
         push_op(CompiledRegex::Split_PrioritizeParent);
         get_offset(alloc_offset()) = 1 + sizeof(Offset);
@@ -830,230 +797,18 @@ void dump_regex(const CompiledRegex& program)
     }
 }
 
-template<typename Iterator>
-struct ThreadedRegexVM
-{
-    ThreadedRegexVM(const CompiledRegex& program)
-      : m_program{program} {}
-
-    struct Thread
-    {
-        const char* inst;
-        Vector<const char*> saves = {};
-    };
-
-    enum class StepResult { Consumed, Matched, Failed };
-    StepResult step(size_t thread_index)
-    {
-        const auto prog_start = m_program.bytecode.data();
-        const auto prog_end = prog_start + m_program.bytecode.size();
-        while (true)
-        {
-            auto& thread = m_threads[thread_index];
-            const Codepoint cp = m_pos == m_end ? 0 : *m_pos;
-            const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
-            switch (op)
-            {
-                case CompiledRegex::Literal:
-                    if (utf8::read_codepoint(thread.inst, prog_end) == cp)
-                        return StepResult::Consumed;
-                    return StepResult::Failed;
-                case CompiledRegex::LiteralIgnoreCase:
-                    if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
-                        return StepResult::Consumed;
-                    return StepResult::Failed;
-                case CompiledRegex::AnyChar:
-                    return StepResult::Consumed;
-                case CompiledRegex::Jump:
-                {
-                    auto inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
-                    // if instruction is already going to be executed by another thread, drop this thread
-                    if (std::find_if(m_threads.begin(), m_threads.end(),
-                                     [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
-                        return StepResult::Failed;
-                    thread.inst = inst;
-                    break;
-                }
-                case CompiledRegex::Split_PrioritizeParent:
-                {
-                    add_thread(thread_index+1, *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst), thread.saves);
-                    // thread is invalidated now, as we mutated the m_thread vector
-                    m_threads[thread_index].inst += sizeof(CompiledRegex::Offset);
-                    break;
-                }
-                case CompiledRegex::Split_PrioritizeChild:
-                {
-                    add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves);
-                    // thread is invalidated now, as we mutated the m_thread vector
-                    m_threads[thread_index].inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(m_threads[thread_index].inst);
-                    break;
-                }
-                case CompiledRegex::Save:
-                {
-                    const char index = *thread.inst++;
-                    thread.saves[index] = m_pos.base();
-                    break;
-                }
-                case CompiledRegex::Matcher:
-                {
-                    const int matcher_id = *thread.inst++;
-                    return m_program.matchers[matcher_id](*m_pos) ?
-                        StepResult::Consumed : StepResult::Failed;
-                }
-                case CompiledRegex::LineStart:
-                    if (not is_line_start())
-                        return StepResult::Failed;
-                    break;
-                case CompiledRegex::LineEnd:
-                    if (not is_line_end())
-                        return StepResult::Failed;
-                    break;
-                case CompiledRegex::WordBoundary:
-                    if (not is_word_boundary())
-                        return StepResult::Failed;
-                    break;
-                case CompiledRegex::NotWordBoundary:
-                    if (is_word_boundary())
-                        return StepResult::Failed;
-                    break;
-                case CompiledRegex::SubjectBegin:
-                    if (m_pos != m_begin)
-                        return StepResult::Failed;
-                    break;
-                case CompiledRegex::SubjectEnd:
-                    if (m_pos != m_end)
-                        return StepResult::Failed;
-                    break;
-                case CompiledRegex::LookAhead:
-                case CompiledRegex::NegativeLookAhead:
-                {
-                    int count = *thread.inst++;
-                    for (auto it = m_pos; count and it != m_end; ++it, --count)
-                        if (*it != utf8::read(thread.inst))
-                            break;
-                    if ((op == CompiledRegex::LookAhead and count != 0) or
-                        (op == CompiledRegex::NegativeLookAhead and count == 0))
-                        return StepResult::Failed;
-                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
-                    break;
-                }
-                case CompiledRegex::LookBehind:
-                case CompiledRegex::NegativeLookBehind:
-                {
-                    int count = *thread.inst++;
-                    for (auto it = m_pos-1; count and it >= m_begin; --it, --count)
-                        if (*it != utf8::read(thread.inst))
-                            break;
-                    if ((op == CompiledRegex::LookBehind and count != 0) or
-                        (op == CompiledRegex::NegativeLookBehind and count == 0))
-                        return StepResult::Failed;
-                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
-                    break;
-                }
-                case CompiledRegex::Match:
-                    thread.inst = nullptr;
-                    return StepResult::Matched;
-            }
-        }
-        return StepResult::Failed;
-    }
-
-    bool exec(StringView data, bool match = true, bool longest = false)
-    {
-        bool found_match = false;
-        m_threads.clear();
-        add_thread(0, match ? RegexCompiler::search_prefix_size : 0,
-                   Vector<const char*>(m_program.save_count, nullptr));
-
-        m_begin = data.begin();
-        m_end = data.end();
-
-        for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
-        {
-            for (int i = 0; i < m_threads.size(); ++i)
-            {
-                const auto res = step(i);
-                if (res == StepResult::Matched)
-                {
-                    if (match)
-                        continue; // We are not at end, this is not a full match
-
-                    m_captures = std::move(m_threads[i].saves);
-                    found_match = true;
-                    m_threads.resize(i); // remove this and lower priority threads
-                    if (not longest)
-                        return true;
-                }
-                else if (res == StepResult::Failed)
-                    m_threads[i].inst = nullptr;
-            }
-            m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
-                                           [](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
-            if (m_threads.empty())
-                return found_match;
-        }
-
-        // Step remaining threads to see if they match without consuming anything else
-        for (int i = 0; i < m_threads.size(); ++i)
-        {
-            if (step(i) == StepResult::Matched)
-            {
-                m_captures = std::move(m_threads[i].saves);
-                found_match = true;
-                m_threads.resize(i); // remove this and lower priority threads
-                if (not longest)
-                    return true;
-            }
-        }
-        return found_match;
-    }
-
-    void add_thread(int index, CompiledRegex::Offset pos, Vector<const char*> saves)
-    {
-        const char* inst = m_program.bytecode.data() + pos;
-        if (std::find_if(m_threads.begin(), m_threads.end(),
-                         [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end())
-            m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)});
-    }
-
-    bool is_line_start() const
-    {
-        return m_pos == m_begin or *(m_pos-1) == '\n';
-    }
-
-    bool is_line_end() const
-    {
-        return m_pos == m_end or *m_pos == '\n';
-    }
-
-    bool is_word_boundary() const
-    {
-        return m_pos == m_begin or m_pos == m_end or
-               is_word(*(m_pos-1)) != is_word(*m_pos);
-    }
-
-    const CompiledRegex& m_program;
-    Vector<Thread> m_threads;
-
-    using Utf8It = utf8::iterator<Iterator>;
-
-    Iterator m_begin;
-    Iterator m_end;
-    Utf8It m_pos;
-
-    Vector<const char*> m_captures;
-};
-
-void validate_regex(StringView re)
+CompiledRegex compile_regex(StringView re) // compiles re; a caught runtime_error is logged and an empty program returned
 {
+    CompiledRegex res;
     try
     {
-        RegexParser{re};
+        res = RegexCompiler::compile(re);
     }
     catch (runtime_error& err)
     {
         write_to_debug_buffer(err.what());
     }
+    return res; // plain return of the local: wrapping it in std::move would only inhibit copy elision
 }
 
 auto test_regex = UnitTest{[]{
@@ -1064,6 +819,11 @@ auto test_regex = UnitTest{[]{
               m_program{RegexCompiler::compile(re)}
         { if (dump) dump_regex(m_program); }
 
+        bool exec(StringView re, bool match = true, bool longest = false)
+        {
+            return ThreadedRegexVM::exec(re.begin(), re.end(), match, longest);
+        }
+
         CompiledRegex m_program;
     };
 
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 8fafcacd..322b60a4 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -1,12 +1,302 @@
 #ifndef regex_impl_hh_INCLUDED
 #define regex_impl_hh_INCLUDED
 
+#include "unicode.hh"
+#include "utf8.hh"
+#include "utf8_iterator.hh"
+#include "vector.hh"
+
 namespace Kakoune
 {
 
-class StringView;
+struct CompiledRegex // bytecode program plus the side tables the VM needs to run it
+{
+    enum Op : char
+    {
+        Match,
+        Literal,
+        LiteralIgnoreCase,
+        AnyChar,
+        Matcher,
+        Jump,
+        Split_PrioritizeParent,
+        Split_PrioritizeChild,
+        Save,
+        LineStart,
+        LineEnd,
+        WordBoundary,
+        NotWordBoundary,
+        SubjectBegin,
+        SubjectEnd,
+        LookAhead,
+        LookBehind,
+        NegativeLookAhead,
+        NegativeLookBehind,
+    };
 
-void validate_regex(StringView re);
+    using Offset = unsigned;
+    static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset); // bytes of search prelude: three ops and two offsets
+
+    explicit operator bool() const { return not bytecode.empty(); } // false while no bytecode has been generated
+
+    Vector<char> bytecode;
+    Vector<std::function<bool (Codepoint)>> matchers; // indexed by the operand of the Matcher op
+    size_t save_count = 0; // number of save slots per thread; zero-initialised so an empty program holds no indeterminate value
+};
+
+CompiledRegex compile_regex(StringView re);
+
+template<typename Iterator>
+struct ThreadedRegexVM // executes a CompiledRegex over [begin, end) using a prioritised list of threads
+{
+    ThreadedRegexVM(const CompiledRegex& program)
+      : m_program{program} { kak_assert(m_program); }
+
+    struct Thread
+    {
+        const char* inst;
+        Vector<Iterator> saves = {};
+    };
+
+    enum class StepResult { Consumed, Matched, Failed };
+    StepResult step(size_t thread_index) // runs one thread until it consumes the current codepoint, matches or fails
+    {
+        const auto prog_start = m_program.bytecode.data();
+        const auto prog_end = prog_start + m_program.bytecode.size();
+        while (true)
+        {
+            auto& thread = m_threads[thread_index];
+            const Codepoint cp = m_pos == m_end ? 0 : *m_pos;
+            const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
+            switch (op)
+            {
+                case CompiledRegex::Literal:
+                    if (utf8::read_codepoint(thread.inst, prog_end) == cp)
+                        return StepResult::Consumed;
+                    return StepResult::Failed;
+                case CompiledRegex::LiteralIgnoreCase:
+                    if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
+                        return StepResult::Consumed;
+                    return StepResult::Failed;
+                case CompiledRegex::AnyChar:
+                    return StepResult::Consumed;
+                case CompiledRegex::Jump:
+                {
+                    auto inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst);
+                    // if instruction is already going to be executed by another thread, drop this thread
+                    if (std::find_if(m_threads.begin(), m_threads.end(),
+                                     [inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
+                        return StepResult::Failed;
+                    thread.inst = inst;
+                    break;
+                }
+                case CompiledRegex::Split_PrioritizeParent:
+                {
+                    add_thread(thread_index+1, *reinterpret_cast<const CompiledRegex::Offset*>(thread.inst), thread.saves);
+                    // thread is invalidated now, as we mutated the m_thread vector
+                    m_threads[thread_index].inst += sizeof(CompiledRegex::Offset);
+                    break;
+                }
+                case CompiledRegex::Split_PrioritizeChild:
+                {
+                    add_thread(thread_index+1, thread.inst + sizeof(CompiledRegex::Offset) - prog_start, thread.saves);
+                    // thread is invalidated now, as we mutated the m_thread vector
+                    m_threads[thread_index].inst = prog_start + *reinterpret_cast<const CompiledRegex::Offset*>(m_threads[thread_index].inst);
+                    break;
+                }
+                case CompiledRegex::Save:
+                {
+                    const char index = *thread.inst++;
+                    thread.saves[index] = m_pos.base();
+                    break;
+                }
+                case CompiledRegex::Matcher:
+                {
+                    const int matcher_id = *thread.inst++; // step() also runs at m_end, where m_pos must not be dereferenced
+                    return (m_pos != m_end and m_program.matchers[matcher_id](*m_pos)) ?
+                        StepResult::Consumed : StepResult::Failed;
+                }
+                case CompiledRegex::LineStart:
+                    if (not is_line_start())
+                        return StepResult::Failed;
+                    break;
+                case CompiledRegex::LineEnd:
+                    if (not is_line_end())
+                        return StepResult::Failed;
+                    break;
+                case CompiledRegex::WordBoundary:
+                    if (not is_word_boundary())
+                        return StepResult::Failed;
+                    break;
+                case CompiledRegex::NotWordBoundary:
+                    if (is_word_boundary())
+                        return StepResult::Failed;
+                    break;
+                case CompiledRegex::SubjectBegin:
+                    if (m_pos != m_begin)
+                        return StepResult::Failed;
+                    break;
+                case CompiledRegex::SubjectEnd:
+                    if (m_pos != m_end)
+                        return StepResult::Failed;
+                    break;
+                case CompiledRegex::LookAhead:
+                case CompiledRegex::NegativeLookAhead:
+                {
+                    int count = *thread.inst++;
+                    for (auto it = m_pos; count and it != m_end; ++it, --count)
+                        if (*it != utf8::read(thread.inst))
+                            break;
+                    if ((op == CompiledRegex::LookAhead and count != 0) or
+                        (op == CompiledRegex::NegativeLookAhead and count == 0))
+                        return StepResult::Failed;
+                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
+                    break;
+                }
+                case CompiledRegex::LookBehind:
+                case CompiledRegex::NegativeLookBehind:
+                {
+                    int count = *thread.inst++;
+                    for (auto it = m_pos-1; count and it >= m_begin; --it, --count) // NOTE(review): m_pos-1 is formed even when m_pos == m_begin; confirm the iterator tolerates it
+                        if (*it != utf8::read(thread.inst))
+                            break;
+                    if ((op == CompiledRegex::LookBehind and count != 0) or
+                        (op == CompiledRegex::NegativeLookBehind and count == 0))
+                        return StepResult::Failed;
+                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
+                    break;
+                }
+                case CompiledRegex::Match:
+                    thread.inst = nullptr;
+                    return StepResult::Matched;
+            }
+        }
+        return StepResult::Failed;
+    }
+
+    bool exec(Iterator begin, Iterator end, bool match = true, bool longest = false) // match: whole range must match; longest: keep running after a first match
+    {
+        bool found_match = false;
+        m_threads.clear();
+        add_thread(0, match ? CompiledRegex::search_prefix_size : 0, // match mode starts past the search prelude
+                   Vector<Iterator>(m_program.save_count, Iterator{}));
+
+        m_begin = begin;
+        m_end = end;
+
+        for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
+        {
+            for (int i = 0; i < m_threads.size(); ++i)
+            {
+                const auto res = step(i);
+                if (res == StepResult::Matched)
+                {
+                    if (match)
+                        continue; // We are not at end, this is not a full match
+
+                    m_captures = std::move(m_threads[i].saves);
+                    found_match = true;
+                    m_threads.resize(i); // remove this and lower priority threads
+                    if (not longest)
+                        return true;
+                }
+                else if (res == StepResult::Failed)
+                    m_threads[i].inst = nullptr;
+            }
+            m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
+                                           [](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
+            if (m_threads.empty())
+                return found_match;
+        }
+
+        // Step remaining threads to see if they match without consuming anything else
+        for (int i = 0; i < m_threads.size(); ++i)
+        {
+            if (step(i) == StepResult::Matched)
+            {
+                m_captures = std::move(m_threads[i].saves);
+                found_match = true;
+                m_threads.resize(i); // remove this and lower priority threads
+                if (not longest)
+                    return true;
+            }
+        }
+        return found_match;
+    }
+
+    void add_thread(int index, CompiledRegex::Offset pos, Vector<Iterator> saves) // inserts at index unless a thread already sits on that instruction
+    {
+        const char* inst = m_program.bytecode.data() + pos;
+        if (std::find_if(m_threads.begin(), m_threads.end(),
+                         [inst](const Thread& t) { return t.inst == inst; }) == m_threads.end())
+            m_threads.insert(m_threads.begin() + index, {inst, std::move(saves)});
+    }
+
+    bool is_line_start() const
+    {
+        return m_pos == m_begin or *(m_pos-1) == '\n';
+    }
+
+    bool is_line_end() const
+    {
+        return m_pos == m_end or *m_pos == '\n';
+    }
+
+    bool is_word_boundary() const
+    {
+        return m_pos == m_begin or m_pos == m_end or
+               is_word(*(m_pos-1)) != is_word(*m_pos);
+    }
+
+    const CompiledRegex& m_program;
+    Vector<Thread> m_threads;
+
+    using Utf8It = utf8::iterator<Iterator>;
+
+    Iterator m_begin;
+    Iterator m_end;
+    Utf8It m_pos;
+
+    Vector<Iterator> m_captures; // save slots of the last accepted thread, filled by exec()
+};
+
+template<typename It>
+bool regex_match(It begin, It end, const CompiledRegex& re)
+{
+    // Whole-range match on a throwaway VM; no captures are requested, so longest stays off.
+    return ThreadedRegexVM<It>{re}.exec(begin, end, /*match=*/true, /*longest=*/false);
+}
+
+template<typename It>
+bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re)
+{
+    ThreadedRegexVM<It> machine{re};
+    // Whole-range match in longest mode; captures is only written on success.
+    if (not machine.exec(begin, end, /*match=*/true, /*longest=*/true))
+        return false;
+
+    captures = std::move(machine.m_captures);
+    return true;
+}
+
+template<typename It>
+bool regex_search(It begin, It end, const CompiledRegex& re)
+{
+    // Unanchored search on a throwaway VM; returns as soon as any thread matches.
+    return ThreadedRegexVM<It>{re}.exec(begin, end, /*match=*/false, /*longest=*/false);
+}
+
+template<typename It>
+bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re)
+{
+    ThreadedRegexVM<It> machine{re};
+    // Unanchored search in longest mode; captures is only written on success.
+    if (not machine.exec(begin, end, /*match=*/false, /*longest=*/true))
+        return false;
+
+    captures = std::move(machine.m_captures);
+    return true;
+}
 
 }