Speed up WordSplitter

Decode each UTF-8 codepoint only once instead of twice, and limit word length by byte count instead of codepoint count.
2023-10-25 12:52:14 +11:00 · 2023-10-25 12:52:14 +11:00 · be33dee211
commit be33dee211
parent b33b673f10
2 changed files with 11 additions and 20 deletions
--- a/src/word_db.cc
+++ b/src/word_db.cc
@@ -34,30 +34,21 @@ struct WordSplitter
            const auto* end = m_splitter->m_content.end();
            auto extra_chars = m_splitter->m_extra_word_chars;

-            while (true)
+            do
            {
-                m_word_begin = m_word_end;
-                while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars))
-                    utf8::to_next(m_word_begin, end);
-                m_word_end = m_word_begin;
-                CharCount word_len = 0;
-                while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars))
-                {
-                    utf8::to_next(m_word_end, end);
-                    ++word_len;
-                }
-                if (m_word_begin == end or word_len < WordDB::max_word_len)
-                    break;
-            }
+                auto it = m_word_begin = m_word_end;
+                while (it != end and not is_word(utf8::read_codepoint(it, end), extra_chars))
+                    m_word_begin = it;
+
+                m_word_end = it;
+                while (it != end and is_word(utf8::read_codepoint(it, end), extra_chars))
+                    m_word_end = it;
+            } while (m_word_begin != end and (m_word_end - m_word_begin) > WordDB::max_word_len);

            return *this;
        }

-        friend bool operator==(const Iterator& lhs, const Iterator& rhs)
-        { return lhs.m_word_begin == rhs.m_word_begin and lhs.m_word_end == rhs.m_word_end; }
-
-        friend bool operator!=(const Iterator& lhs, const Iterator& rhs)
-        { return not (lhs == rhs); }
+        friend bool operator==(const Iterator& lhs, const Iterator& rhs) = default;

        const char* m_word_begin;
        const char* m_word_end;
--- a/src/word_db.hh
+++ b/src/word_db.hh
@@ -18,7 +18,7 @@ class Buffer;
 class WordDB : public OptionManagerWatcher
 {
 public:
-    static constexpr CharCount max_word_len = 50;
+    static constexpr ByteCount max_word_len = 50;

    WordDB(const Buffer& buffer);
    ~WordDB();