home/src/word_db.cc
2015-10-21 20:16:36 +01:00

254 lines
7.1 KiB
C++

#include "word_db.hh"
#include "utils.hh"
#include "line_modification.hh"
#include "utf8_iterator.hh"
#include "unit_tests.hh"
namespace Kakoune
{
UsedLetters used_letters(StringView str)
{
UsedLetters res;
for (auto c : str)
{
if (c >= 'a' and c <= 'z')
res.set(c - 'a');
else if (c >= 'A' and c <= 'Z')
res.set(c - 'A' + 26);
else if (c == '_')
res.set(53);
else if (c == '-')
res.set(54);
else
res.set(63);
}
return res;
}
constexpr UsedLetters upper_mask = 0xFFFFFFC000000;
UsedLetters to_lower(UsedLetters letters)
{
return ((letters & upper_mask) >> 26) | (letters & (~upper_mask));
}
using WordList = Vector<StringView>;
static WordList get_words(const SharedString& content)
{
WordList res;
using Utf8It = utf8::iterator<const char*, utf8::InvalidPolicy::Pass>;
const char* word_start = content.begin();
bool in_word = false;
for (Utf8It it{word_start, content}, end{content.end(), content}; it != end; ++it)
{
Codepoint c = *it;
const bool word = is_word(c);
if (not in_word and word)
{
word_start = it.base();
in_word = true;
}
else if (in_word and not word)
{
const ByteCount start = word_start - content.begin();
const ByteCount length = it.base() - word_start;
res.push_back(content.acquire_substr(start, length));
in_word = false;
}
}
return res;
}
void WordDB::add_words(const SharedString& line)
{
for (auto& w : get_words(line))
{
WordDB::WordInfo& info = m_words[intern(w)];
++info.refcount;
if (info.letters.none())
info.letters = used_letters(w);
}
}
void WordDB::remove_words(const SharedString& line)
{
for (auto& w : get_words(line))
{
auto it = m_words.find({w, SharedString::NoCopy()});
kak_assert(it != m_words.end() and it->second.refcount > 0);
if (--it->second.refcount == 0)
m_words.erase(it);
}
}
WordDB::WordDB(const Buffer& buffer)
: m_buffer{&buffer}, m_timestamp{buffer.timestamp()}
{
m_lines.reserve((int)buffer.line_count());
for (auto line = 0_line, end = buffer.line_count(); line < end; ++line)
{
m_lines.push_back(buffer.line_storage(line));
add_words(SharedString{m_lines.back()});
}
}
void WordDB::update_db()
{
auto& buffer = *m_buffer;
auto modifs = compute_line_modifications(buffer, m_timestamp);
m_timestamp = buffer.timestamp();
if (modifs.empty())
return;
Lines new_lines;
new_lines.reserve((int)buffer.line_count());
auto old_line = 0_line;
for (auto& modif : modifs)
{
kak_assert(0_line <= modif.new_line and modif.new_line <= buffer.line_count());
kak_assert(modif.new_line < buffer.line_count() or modif.num_added == 0);
kak_assert(old_line <= modif.old_line);
while (old_line < modif.old_line)
new_lines.push_back(std::move(m_lines[(int)old_line++]));
kak_assert((int)new_lines.size() == (int)modif.new_line);
while (old_line < modif.old_line + modif.num_removed)
{
kak_assert(old_line < m_lines.size());
remove_words(SharedString{m_lines[(int)old_line++]});
}
for (auto l = 0_line; l < modif.num_added; ++l)
{
new_lines.push_back(buffer.line_storage(modif.new_line + l));
add_words(SharedString{new_lines.back()});
}
}
while (old_line != (int)m_lines.size())
new_lines.push_back(std::move(m_lines[(int)old_line++]));
m_lines = std::move(new_lines);
}
int WordDB::get_word_occurences(StringView word) const
{
auto it = m_words.find({word, SharedString::NoCopy()});
if (it != m_words.end())
return it->second.refcount;
return 0;
}
WordDB::RankedWordList WordDB::find_matching(StringView query)
{
auto match_rank = [](StringView candidate, StringView query)
{
int rank = 0;
auto it = candidate.begin();
char prev = 0;
for (auto c : query)
{
if (it == candidate.end())
return 0;
const bool islow = islower(c);
auto eq_c = [islow, c](char ch) { return islow ? tolower(ch) == c : ch == c; };
if (eq_c(*it)) // improve rank on contiguous
++rank;
while (!eq_c(*it))
{
prev = *it;
if (++it == candidate.end())
return 0;
}
// Improve rank on word boundaries
if (prev == 0 or prev == '_' or
(islower(prev) and isupper(*it)))
rank += 5;
prev = c;
++rank;
++it;
}
return rank;
};
auto matches = [](UsedLetters query, UsedLetters letters)
{
return (query & letters) == query;
};
update_db();
const UsedLetters letters = used_letters(query);
RankedWordList res;
for (auto&& word : m_words)
{
if (query.empty())
{
res.push_back({word.first, 1 });
continue;
}
UsedLetters word_letters = word.second.letters;
if (not matches(to_lower(letters), to_lower(word_letters)) or
not matches(letters & upper_mask, word_letters & upper_mask))
continue;
if (int rank = match_rank(word.first, query))
res.push_back({ word.first, rank });
}
return res;
}
UnitTest test_word_db{[]()
{
auto cmp_words = [](const WordDB::RankedWord& lhs, const WordDB::RankedWord& rhs) {
return lhs.word < rhs.word;
};
auto eq = [](ArrayView<const WordDB::RankedWord> lhs, const WordList& rhs) {
return lhs.size() == rhs.size() and
std::equal(lhs.begin(), lhs.end(), rhs.begin(),
[](const WordDB::RankedWord& lhs, const StringView& rhs) {
return lhs.word == rhs;
});
};
Buffer buffer("test", Buffer::Flags::None,
"tchou mutch\n"
"tchou kanaky tchou\n"
"\n"
"tchaa tchaa\n"
"allo\n");
WordDB word_db(buffer);
auto res = word_db.find_matching("");
std::sort(res.begin(), res.end(), cmp_words);
kak_assert(eq(res, WordList{ "allo" COMMA "kanaky" COMMA "mutch" COMMA "tchaa" COMMA "tchou" }));
kak_assert(word_db.get_word_occurences("tchou") == 3);
kak_assert(word_db.get_word_occurences("allo") == 1);
buffer.erase(buffer.iterator_at({1, 6}), buffer.iterator_at({4, 0}));
res = word_db.find_matching("");
std::sort(res.begin(), res.end(), cmp_words);
kak_assert(eq(res, WordList{ "allo" COMMA "mutch" COMMA "tchou" }));
buffer.insert(buffer.iterator_at({1, 0}), "re");
res = word_db.find_matching("");
std::sort(res.begin(), res.end(), cmp_words);
kak_assert(eq(res, WordList{ "allo" COMMA "mutch" COMMA "retchou" COMMA "tchou" }));
}};
UnitTest test_used_letters{[]()
{
kak_assert(used_letters("abcd") == to_lower(used_letters("abcdABCD")));
}};
}