From ad882c33707c65344ca05d421ea3a29c95168eeb Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Fri, 22 Mar 2019 18:03:49 +1100 Subject: [PATCH] Limit WordDB word length to fewer than 50 characters. Words of 50 or more characters are skipped, which should improve both performance and relevancy of the word completions. --- src/word_db.cc | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/word_db.cc b/src/word_db.cc index a9f0b751..9a7680a5 100644 --- a/src/word_db.cc +++ b/src/word_db.cc @@ -21,6 +21,8 @@ WordDB& get_word_db(const Buffer& buffer) struct WordSplitter { + static constexpr CharCount max_word_len = 50; + struct Iterator { Iterator(const char* begin, const WordSplitter& splitter) @@ -34,12 +36,22 @@ struct WordSplitter const auto* end = m_splitter->m_content.end(); auto extra_chars = m_splitter->m_extra_word_chars; - m_word_begin = m_word_end; - while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars)) - utf8::to_next(m_word_begin, end); - m_word_end = m_word_begin; - while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars)) - utf8::to_next(m_word_end, end); + while (true) + { + m_word_begin = m_word_end; + while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars)) + utf8::to_next(m_word_begin, end); + m_word_end = m_word_begin; + CharCount word_len = 0; + while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars)) + { + utf8::to_next(m_word_end, end); + ++word_len; + } + if (m_word_begin == end or word_len < max_word_len) + break; + } + return *this; }