Speed up WordSplitter

Decode each UTF-8 codepoint only once instead of twice, and limit
the word length in bytes instead of in codepoints.
This commit is contained in:
Maxime Coste 2023-10-25 12:52:14 +11:00
parent b33b673f10
commit be33dee211
2 changed files with 11 additions and 20 deletions

View File

@ -34,30 +34,21 @@ struct WordSplitter
const auto* end = m_splitter->m_content.end(); const auto* end = m_splitter->m_content.end();
auto extra_chars = m_splitter->m_extra_word_chars; auto extra_chars = m_splitter->m_extra_word_chars;
while (true) do
{ {
m_word_begin = m_word_end; auto it = m_word_begin = m_word_end;
while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars)) while (it != end and not is_word(utf8::read_codepoint(it, end), extra_chars))
utf8::to_next(m_word_begin, end); m_word_begin = it;
m_word_end = m_word_begin;
CharCount word_len = 0; m_word_end = it;
while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars)) while (it != end and is_word(utf8::read_codepoint(it, end), extra_chars))
{ m_word_end = it;
utf8::to_next(m_word_end, end); } while (m_word_begin != end and (m_word_end - m_word_begin) > WordDB::max_word_len);
++word_len;
}
if (m_word_begin == end or word_len < WordDB::max_word_len)
break;
}
return *this; return *this;
} }
friend bool operator==(const Iterator& lhs, const Iterator& rhs) friend bool operator==(const Iterator& lhs, const Iterator& rhs) = default;
{ return lhs.m_word_begin == rhs.m_word_begin and lhs.m_word_end == rhs.m_word_end; }
friend bool operator!=(const Iterator& lhs, const Iterator& rhs)
{ return not (lhs == rhs); }
const char* m_word_begin; const char* m_word_begin;
const char* m_word_end; const char* m_word_end;

View File

@ -18,7 +18,7 @@ class Buffer;
class WordDB : public OptionManagerWatcher class WordDB : public OptionManagerWatcher
{ {
public: public:
static constexpr CharCount max_word_len = 50; static constexpr ByteCount max_word_len = 50;
WordDB(const Buffer& buffer); WordDB(const Buffer& buffer);
~WordDB(); ~WordDB();