Limit WordDB word length to under 50 characters

This should improve both the performance and the relevance of word completions.
This commit is contained in:
Maxime Coste 2019-03-22 18:03:49 +11:00
parent b9c1fa61a0
commit ad882c3370

View File

@ -21,6 +21,8 @@ WordDB& get_word_db(const Buffer& buffer)
struct WordSplitter struct WordSplitter
{ {
static constexpr CharCount max_word_len = 50;
struct Iterator struct Iterator
{ {
Iterator(const char* begin, const WordSplitter& splitter) Iterator(const char* begin, const WordSplitter& splitter)
@ -34,12 +36,22 @@ struct WordSplitter
const auto* end = m_splitter->m_content.end(); const auto* end = m_splitter->m_content.end();
auto extra_chars = m_splitter->m_extra_word_chars; auto extra_chars = m_splitter->m_extra_word_chars;
m_word_begin = m_word_end; while (true)
while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars)) {
utf8::to_next(m_word_begin, end); m_word_begin = m_word_end;
m_word_end = m_word_begin; while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars))
while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars)) utf8::to_next(m_word_begin, end);
utf8::to_next(m_word_end, end); m_word_end = m_word_begin;
CharCount word_len = 0;
while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars))
{
utf8::to_next(m_word_end, end);
++word_len;
}
if (m_word_begin == end or word_len < max_word_len)
break;
}
return *this; return *this;
} }