2014-01-16 23:07:42 +01:00
|
|
|
#include "word_db.hh"
|
|
|
|
|
|
|
|
#include "utils.hh"
|
2014-05-26 22:00:26 +02:00
|
|
|
#include "line_modification.hh"
|
2014-01-16 23:07:42 +01:00
|
|
|
#include "utf8_iterator.hh"
|
2015-05-22 14:58:56 +02:00
|
|
|
#include "unit_tests.hh"
|
2014-01-16 23:07:42 +01:00
|
|
|
|
|
|
|
namespace Kakoune
|
|
|
|
{
|
|
|
|
|
2014-12-23 20:32:42 +01:00
|
|
|
// Compute the set of "letter classes" appearing in str.
// Bit layout of the result:
//   0..25  -> 'a'..'z'
//   26..51 -> 'A'..'Z'
//   53     -> '_'
//   54     -> '-'
//   63     -> any other byte
// Bit 52 is never set by this function.
UsedLetters used_letters(StringView str)
{
    UsedLetters letters;
    for (auto ch : str)
    {
        // Start from the catch-all bit, then refine for the known classes
        int bit = 63;
        if ('a' <= ch and ch <= 'z')
            bit = ch - 'a';
        else if ('A' <= ch and ch <= 'Z')
            bit = ch - 'A' + 26;
        else if (ch == '_')
            bit = 53;
        else if (ch == '-')
            bit = 54;
        letters.set(bit);
    }
    return letters;
}
|
|
|
|
|
2015-10-18 17:55:21 +02:00
|
|
|
// Mask with bits 26..51 set, i.e. the bits used_letters() assigns to 'A'..'Z'.
constexpr UsedLetters upper_mask = 0xFFFFFFC000000;

// Fold the uppercase bits of letters onto their lowercase counterparts
// (26 positions lower) and keep every non-uppercase bit unchanged.
UsedLetters to_lower(UsedLetters letters)
{
    const UsedLetters folded_upper = (letters & upper_mask) >> 26;
    const UsedLetters everything_else = letters & (~upper_mask);
    return folded_upper | everything_else;
}
|
|
|
|
|
2015-10-21 21:16:36 +02:00
|
|
|
// List of words, each one held as a StringView.
using WordList = Vector<StringView>;
|
|
|
|
|
|
|
|
|
2016-02-05 10:36:07 +01:00
|
|
|
// Split content into words, a word being a maximal run of codepoints for
// which is_word() returns true. Each entry of the result is obtained with
// content.substr(), in order of appearance.
//
// A word that extends up to the very end of content (no trailing non-word
// codepoint) is returned as well.
static WordList get_words(StringView content)
{
    WordList res;
    using Utf8It = utf8::iterator<const char*, utf8::InvalidPolicy::Pass>;
    const char* word_start = content.begin();
    bool in_word = false;

    // Append the word spanning [word_start, word_end) to the result
    auto push_word = [&](const char* word_end) {
        const ByteCount start = word_start - content.begin();
        const ByteCount length = word_end - word_start;
        res.push_back(content.substr(start, length));
    };

    for (Utf8It it{word_start, content}, end{content.end(), content}; it != end; ++it)
    {
        Codepoint c = *it;
        const bool word = is_word(c);
        if (not in_word and word)
        {
            word_start = it.base();
            in_word = true;
        }
        else if (in_word and not word)
        {
            push_word(it.base());
            in_word = false;
        }
    }

    // The loop only emits a word once it sees the codepoint that follows it;
    // flush a word still pending when the input ends.
    if (in_word)
        push_word(content.end());

    return res;
}
|
|
|
|
|
2016-02-05 10:36:07 +01:00
|
|
|
// Record one occurrence of every word found in line. A word not yet in
// m_words is interned and gets a new WordInfo (interned string, letter
// mask, refcount incremented once); a known word only has its refcount
// incremented.
void WordDB::add_words(StringView line)
{
    for (auto& found : get_words(line))
    {
        auto entry = m_words.find(found);
        if (entry != m_words.end())
        {
            ++entry->second.refcount;
            continue;
        }

        auto interned = intern(found);
        // The map is keyed on the interned string's view rather than on
        // found -- presumably so the key does not depend on line's storage.
        WordDB::WordInfo& info = m_words[interned->strview()];
        info.word = interned;
        info.letters = used_letters(found);
        ++info.refcount;
    }
}
|
2014-01-16 23:07:42 +01:00
|
|
|
|
2016-02-05 10:36:07 +01:00
|
|
|
// Drop one occurrence of every word found in line, erasing the entry from
// m_words once its refcount reaches zero. Every word is asserted to be
// present with a positive refcount.
void WordDB::remove_words(StringView line)
{
    for (auto& found : get_words(line))
    {
        auto entry = m_words.find(found);
        kak_assert(entry != m_words.end() and entry->second.refcount > 0);

        if (--entry->second.refcount != 0)
            continue;
        m_words.erase(entry);
    }
}
|
|
|
|
|
2014-01-24 01:56:33 +01:00
|
|
|
WordDB::WordDB(const Buffer& buffer)
|
2014-05-14 22:19:19 +02:00
|
|
|
: m_buffer{&buffer}, m_timestamp{buffer.timestamp()}
|
2014-01-16 23:07:42 +01:00
|
|
|
{
|
2015-01-15 14:58:55 +01:00
|
|
|
m_lines.reserve((int)buffer.line_count());
|
2014-01-24 01:56:33 +01:00
|
|
|
for (auto line = 0_line, end = buffer.line_count(); line < end; ++line)
|
2014-01-16 23:07:42 +01:00
|
|
|
{
|
2015-01-27 14:11:32 +01:00
|
|
|
m_lines.push_back(buffer.line_storage(line));
|
2016-02-05 10:36:07 +01:00
|
|
|
add_words(m_lines.back()->strview());
|
2014-01-16 23:07:42 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-24 01:56:33 +01:00
|
|
|
void WordDB::update_db()
|
2014-01-16 23:07:42 +01:00
|
|
|
{
|
2014-05-14 22:19:19 +02:00
|
|
|
auto& buffer = *m_buffer;
|
|
|
|
|
2014-05-26 22:00:26 +02:00
|
|
|
auto modifs = compute_line_modifications(buffer, m_timestamp);
|
2014-05-14 22:19:19 +02:00
|
|
|
m_timestamp = buffer.timestamp();
|
|
|
|
|
2014-01-24 01:56:33 +01:00
|
|
|
if (modifs.empty())
|
|
|
|
return;
|
2014-01-16 23:07:42 +01:00
|
|
|
|
2015-01-15 14:58:55 +01:00
|
|
|
Lines new_lines;
|
2014-01-24 01:56:33 +01:00
|
|
|
new_lines.reserve((int)buffer.line_count());
|
|
|
|
|
|
|
|
auto old_line = 0_line;
|
|
|
|
for (auto& modif : modifs)
|
|
|
|
{
|
2015-02-04 00:39:04 +01:00
|
|
|
kak_assert(0_line <= modif.new_line and modif.new_line <= buffer.line_count());
|
|
|
|
kak_assert(modif.new_line < buffer.line_count() or modif.num_added == 0);
|
2014-05-26 22:00:26 +02:00
|
|
|
kak_assert(old_line <= modif.old_line);
|
|
|
|
while (old_line < modif.old_line)
|
2015-01-15 14:58:55 +01:00
|
|
|
new_lines.push_back(std::move(m_lines[(int)old_line++]));
|
2014-01-24 01:56:33 +01:00
|
|
|
|
2014-05-26 22:00:26 +02:00
|
|
|
kak_assert((int)new_lines.size() == (int)modif.new_line);
|
|
|
|
|
2015-02-01 00:50:24 +01:00
|
|
|
while (old_line < modif.old_line + modif.num_removed)
|
2014-05-26 22:00:26 +02:00
|
|
|
{
|
2015-01-15 14:58:55 +01:00
|
|
|
kak_assert(old_line < m_lines.size());
|
2016-02-05 10:36:07 +01:00
|
|
|
remove_words(m_lines[(int)old_line++]->strview());
|
2014-05-26 22:00:26 +02:00
|
|
|
}
|
2014-01-24 01:56:33 +01:00
|
|
|
|
2015-02-01 00:50:24 +01:00
|
|
|
for (auto l = 0_line; l < modif.num_added; ++l)
|
2014-01-24 01:56:33 +01:00
|
|
|
{
|
2015-01-27 14:11:32 +01:00
|
|
|
new_lines.push_back(buffer.line_storage(modif.new_line + l));
|
2016-02-05 10:36:07 +01:00
|
|
|
add_words(new_lines.back()->strview());
|
2014-01-24 01:56:33 +01:00
|
|
|
}
|
|
|
|
}
|
2015-01-15 14:58:55 +01:00
|
|
|
while (old_line != (int)m_lines.size())
|
|
|
|
new_lines.push_back(std::move(m_lines[(int)old_line++]));
|
2014-01-24 01:56:33 +01:00
|
|
|
|
2015-01-15 14:58:55 +01:00
|
|
|
m_lines = std::move(new_lines);
|
2014-01-16 23:07:42 +01:00
|
|
|
}
|
|
|
|
|
2014-10-01 01:20:12 +02:00
|
|
|
// Return the refcount stored in m_words for word, or 0 when word has no
// entry. This does not call update_db().
int WordDB::get_word_occurences(StringView word) const
{
    auto entry = m_words.find(word);
    return entry != m_words.end() ? entry->second.refcount : 0;
}
|
|
|
|
|
2015-10-22 20:49:08 +02:00
|
|
|
// Update the database, then collect a RankedMatch for the words matching
// query.
//  - An empty query yields a RankedMatch for every known word.
//  - Otherwise words are first filtered on their letter mask: ignoring
//    case, the word must contain every letter class of the query, and it
//    must contain every uppercase letter of the query as uppercase. The
//    words that pass are kept only if the RankedMatch built from them
//    converts to true.
RankedMatchList WordDB::find_matching(StringView query)
{
    update_db();

    RankedMatchList res;
    if (query.empty())
    {
        for (auto&& word : m_words)
            res.push_back(RankedMatch{word.first, query});
        return res;
    }

    // True when every bit of wanted is also set in available
    auto contains_all = [](UsedLetters wanted, UsedLetters available)
    {
        return (wanted & available) == wanted;
    };

    const UsedLetters letters = used_letters(query);
    const UsedLetters query_lower = to_lower(letters);
    const UsedLetters query_upper = letters & upper_mask;

    for (auto&& word : m_words)
    {
        const UsedLetters word_letters = word.second.letters;
        if (not contains_all(query_lower, to_lower(word_letters)))
            continue;
        if (not contains_all(query_upper, word_letters & upper_mask))
            continue;

        if (RankedMatch match{word.first, query})
            res.push_back(match);
    }

    return res;
}
|
|
|
|
|
2015-05-22 14:58:56 +02:00
|
|
|
// Checks that a WordDB built on a small buffer lists the expected words and
// occurrence counts, and that it follows an erase and then an insert done
// on the buffer.
UnitTest test_word_db{[]()
{
    auto by_candidate = [](const RankedMatch& a, const RankedMatch& b) {
        return a.candidate() < b.candidate();
    };

    // True when matches and expected have the same candidates, in the same order
    auto same_words = [](ArrayView<const RankedMatch> matches, const WordList& expected) {
        return matches.size() == expected.size() and
            std::equal(matches.begin(), matches.end(), expected.begin(),
                       [](const RankedMatch& match, const StringView& word) {
                           return match.candidate() == word;
                       });
    };

    Buffer buffer("test", Buffer::Flags::None,
                  "tchou mutch\n"
                  "tchou kanaky tchou\n"
                  "\n"
                  "tchaa tchaa\n"
                  "allo\n");
    WordDB word_db(buffer);

    // All words currently known to the database, sorted by candidate
    auto all_words_sorted = [&]() -> RankedMatchList {
        auto found = word_db.find_matching("");
        std::sort(found.begin(), found.end(), by_candidate);
        return found;
    };

    auto res = all_words_sorted();
    kak_assert(same_words(res, WordList{ "allo" COMMA "kanaky" COMMA "mutch" COMMA "tchaa" COMMA "tchou" }));
    kak_assert(word_db.get_word_occurences("tchou") == 3);
    kak_assert(word_db.get_word_occurences("allo") == 1);

    buffer.erase(buffer.iterator_at({1, 6}), buffer.iterator_at({4, 0}));
    res = all_words_sorted();
    kak_assert(same_words(res, WordList{ "allo" COMMA "mutch" COMMA "tchou" }));

    buffer.insert(buffer.iterator_at({1, 0}), "re");
    res = all_words_sorted();
    kak_assert(same_words(res, WordList{ "allo" COMMA "mutch" COMMA "retchou" COMMA "tchou" }));
}};
|
|
|
|
|
2015-10-18 17:55:21 +02:00
|
|
|
// Checks that to_lower() maps the uppercase letters of a mask onto the
// same bits as their lowercase counterparts.
UnitTest test_used_letters{[]()
{
    const UsedLetters lower_only = used_letters("abcd");
    const UsedLetters mixed_case = used_letters("abcdABCD");
    kak_assert(lower_only == to_lower(mixed_case));
}};
|
|
|
|
|
2014-01-16 23:07:42 +01:00
|
|
|
}
|