Remove use of utf8::iterator in regex execution

This avoids having two copies of the subject string bounds, one in the ExecConfig and one in the utf8 iterator.
2018-11-05 08:17:50 +11:00 · 2018-11-05 08:17:50 +11:00 · 7463a0d449
commit 7463a0d449
parent b4571bd172
1 changed files with 53 additions and 47 deletions
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@ -6,7 +6,6 @@
 #include "ref_ptr.hh"
 #include "unicode.hh"
 #include "utf8.hh"
-#include "utf8_iterator.hh"
 #include "vector.hh"
 #include "utils.hh"

@ -167,7 +166,7 @@ public:
    ThreadedRegexVM(const CompiledRegex& program)
      : m_program{program}
    {
-        kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or
+        kak_assert((forward and program.first_backward_inst != 0) or
                   (direction == MatchDirection::Backward and program.first_backward_inst != -1));
    }

@ -195,15 +194,13 @@ public:
        const bool search = (flags & RegexExecFlags::Search);

        ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
-        if (direction == MatchDirection::Forward)
+        if (forward)
            instructions = instructions.subrange(0, m_program.first_backward_inst);
        else
            instructions = instructions.subrange(m_program.first_backward_inst);
        if (not search)
            instructions = instructions.subrange(CompiledRegex::search_prefix_size);

-        constexpr bool forward = direction == MatchDirection::Forward;
-
        const ExecConfig config{
            Sentinel{forward ? begin : end},
            Sentinel{forward ? end : begin},
@ -213,24 +210,22 @@ public:
            instructions
        };

-        Utf8It start{Utf8It{
-            forward ? begin : end,
-            Sentinel{subject_begin},
-            Sentinel{subject_end}
-        }};
-
+        Iterator start = forward ? begin : end;
        if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc)
        {
            if (search)
            {
-                to_next_start(start, config.end, *start_desc);
+                to_next_start(start, config, *start_desc);
                if (start == config.end) // If start_desc is not null, it means we consume at least one char
                    return false;
            }
-            else if (start != config.end and
-                     not start_desc->map[codepoint(start) < StartDesc::count ? codepoint(start) : StartDesc::other])
+            else if (start != config.end)
+            {
+                const Codepoint cp = codepoint(start, config);
+                 if (not start_desc->map[cp < StartDesc::count ? cp : StartDesc::other])
                    return false;
            }
+        }

        return exec_program(std::move(start), config);
    }
@ -297,7 +292,6 @@ private:

    using StartDesc = CompiledRegex::StartDesc;
    using Sentinel = typename SentinelType<Iterator>::Type;
-    using Utf8It = utf8::iterator<Iterator, Sentinel>;
    struct ExecConfig
    {
        const Sentinel begin;
@ -311,7 +305,7 @@ private:
    enum class StepResult { Consumed, Matched, Failed, FindNextStart };

    // Steps a thread until it consumes the current character, matches or fail
-    StepResult step_thread(const Utf8It& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
+    StepResult step_thread(const Iterator& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
    {
        const bool no_saves = (config.flags & RegexExecFlags::NoSaves);
        auto* instructions = m_program.instructions.data();
@ -327,17 +321,17 @@ private:
            switch (inst.op)
            {
                case CompiledRegex::Literal:
-                    if (pos != config.end and inst.param == codepoint(pos))
+                    if (pos != config.end and inst.param == codepoint(pos, config))
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::Literal_IgnoreCase:
-                    if (pos != config.end and inst.param == to_lower(codepoint(pos)))
+                    if (pos != config.end and inst.param == to_lower(codepoint(pos, config)))
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::AnyChar:
                    return StepResult::Consumed;
                case CompiledRegex::AnyCharExceptNewLine:
-                    if (pos != config.end and codepoint(pos) != '\n')
+                    if (pos != config.end and codepoint(pos, config) != '\n')
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::Jump:
@ -369,18 +363,18 @@ private:
                        --m_saves[thread.saves]->refcount;
                        thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
                    }
-                    m_saves[thread.saves]->pos[inst.param] = pos.base();
+                    m_saves[thread.saves]->pos[inst.param] = pos;
                    break;
                }
                case CompiledRegex::Class:
                    if (pos == config.end)
                        return StepResult::Failed;
-                    return is_character_class(m_program.character_classes[inst.param], codepoint(pos)) ?
+                    return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ?
                        StepResult::Consumed : StepResult::Failed;
                case CompiledRegex::CharacterType:
                    if (pos == config.end)
                        return StepResult::Failed;
-                    return is_ctype((CharacterType)inst.param, codepoint(pos)) ?
+                    return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ?
                        StepResult::Consumed : StepResult::Failed;
                case CompiledRegex::LineStart:
                    if (not is_line_start(pos, config))
@ -442,15 +436,14 @@ private:
        return StepResult::Failed;
    }

-    bool exec_program(Utf8It pos, const ExecConfig& config)
+    bool exec_program(Iterator pos, const ExecConfig& config)
    {
        kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty());
        release_saves(m_captures);
        m_captures = -1;
        m_threads.push_current({static_cast<int16_t>(&config.instructions[0] - &m_program.instructions[0]), -1});

-        const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
-                                                                      : m_program.backward_start_desc;
+        const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc;

        uint16_t current_step = -1;
        bool found_match = false;
@ -517,28 +510,30 @@ private:
            }

            m_threads.swap_next();
-            (direction == MatchDirection::Forward) ? ++pos : --pos;
+            forward ? utf8::to_next(pos, config.subject_end)
+                    : utf8::to_previous(pos, config.subject_begin);

            if (find_next_start and start_desc)
-                to_next_start(pos, config.end, *start_desc);
+                to_next_start(pos, config, *start_desc);
        }
    }

-    void to_next_start(Utf8It& start, const Sentinel& end, const StartDesc& start_desc)
+    void to_next_start(Iterator& start, const ExecConfig& config, const StartDesc& start_desc)
    {
-        while (start != end)
+        while (start != config.end)
        {
-            const Codepoint cp = read_codepoint(start);
+            const Codepoint cp = read_codepoint(start, config);
            if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
            {
-                (direction == MatchDirection::Forward) ? --start : ++start;
+                forward ? utf8::to_previous(start, config.subject_begin)
+                        : utf8::to_next(start, config.subject_end);
                return;
            }
        }
    }

    template<MatchDirection look_direction, bool ignore_case>
-    bool lookaround(uint32_t index, Utf8It pos, const ExecConfig& config) const
+    bool lookaround(uint32_t index, Iterator pos, const ExecConfig& config) const
    {
        using Lookaround = CompiledRegex::Lookaround;

@ -546,7 +541,7 @@ private:
        {
            if (pos == config.subject_begin)
                return m_program.lookarounds[index] == Lookaround::EndOfLookaround;
-            --pos;
+            utf8::to_previous(pos, config.subject_begin);
        }

        for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
@ -554,7 +549,7 @@ private:
            if (look_direction == MatchDirection::Forward and pos == config.subject_end)
                return false;

-            Codepoint cp = *pos;
+            Codepoint cp = utf8::codepoint(pos, config.subject_end);
            if (ignore_case)
                cp = to_lower(cp);

@ -584,43 +579,52 @@ private:
            if (look_direction == MatchDirection::Backward and pos == config.subject_begin)
                return *++it == Lookaround::EndOfLookaround;

-            (look_direction == MatchDirection::Forward) ? ++pos : --pos;
+            (look_direction == MatchDirection::Forward) ? utf8::to_next(pos, config.subject_end)
+                                                        : utf8::to_previous(pos, config.subject_begin);
        }
        return true;
    }

-    static bool is_line_start(const Utf8It& pos, const ExecConfig& config)
+    static bool is_line_start(const Iterator& pos, const ExecConfig& config)
    {
        if (pos == config.subject_begin)
            return not (config.flags & RegexExecFlags::NotBeginOfLine);
-        return *(pos-1) == '\n';
+        return utf8::codepoint(utf8::previous(pos, config.subject_begin), config.subject_end) == '\n';
    }

-    static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
+    static bool is_line_end(const Iterator& pos, const ExecConfig& config)
    {
        if (pos == config.subject_end)
            return not (config.flags & RegexExecFlags::NotEndOfLine);
-        return *pos == '\n';
+        return utf8::codepoint(pos, config.subject_end) == '\n';
    }

-    static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
+    static bool is_word_boundary(const Iterator& pos, const ExecConfig& config)
    {
        if (pos == config.subject_begin)
            return not (config.flags & RegexExecFlags::NotBeginOfWord);
        if (pos == config.subject_end)
            return not (config.flags & RegexExecFlags::NotEndOfWord);
-        return is_word(*(pos-1)) != is_word(*pos);
+        return is_word(utf8::codepoint(utf8::previous(pos, config.subject_begin), config.subject_end)) !=
+               is_word(utf8::codepoint(pos, config.subject_end));
    }

-    static Codepoint read_codepoint(Utf8It& it)
+    static Codepoint read_codepoint(Iterator& it, const ExecConfig& config)
    {
-        if (direction == MatchDirection::Forward)
-            return it.read();
+        if (forward)
+            return utf8::read_codepoint(it, config.subject_end);
        else
-            return *--it;
+        {
+            utf8::to_previous(it, config.subject_begin);
+            return utf8::codepoint(it, config.subject_end);
+        }
    }

-    static Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
+    static Codepoint codepoint(const Iterator& it, const ExecConfig& config)
+    {
+        return utf8::codepoint(forward ? it : utf8::previous(it, config.subject_begin),
+                               config.subject_end);
+    }

    const CompiledRegex& m_program;

@ -664,6 +668,8 @@ private:
        int32_t m_next = 0;
    };

+    static constexpr bool forward = direction == MatchDirection::Forward;
+
    DualThreadStack m_threads;
    Vector<Saves*, MemoryDomain::Regex> m_saves;
    int16_t m_first_free = -1;