Add support for interned strings

Use interned strings for Modification contents and word database.
Interned strings are guaranteed not to move in memory and are
reference counted.
This commit is contained in:
Maxime Coste 2014-10-01 00:20:12 +01:00
parent d9e462851c
commit d55d041c6a
10 changed files with 270 additions and 26 deletions

View File

@ -2,11 +2,12 @@
#include "assert.hh" #include "assert.hh"
#include "buffer_manager.hh" #include "buffer_manager.hh"
#include "client.hh"
#include "context.hh" #include "context.hh"
#include "file.hh" #include "file.hh"
#include "interned_string.hh"
#include "utils.hh" #include "utils.hh"
#include "window.hh" #include "window.hh"
#include "client.hh"
#include <algorithm> #include <algorithm>
@ -170,9 +171,9 @@ struct Buffer::Modification
Type type; Type type;
ByteCoord coord; ByteCoord coord;
String content; InternedString content;
Modification(Type type, ByteCoord coord, String content) Modification(Type type, ByteCoord coord, InternedString content)
: type(type), coord(coord), content(std::move(content)) {} : type(type), coord(coord), content(std::move(content)) {}
Modification inverse() const Modification inverse() const
@ -240,7 +241,7 @@ void Buffer::check_invariant() const
#endif #endif
} }
ByteCoord Buffer::do_insert(ByteCoord pos, const String& content) ByteCoord Buffer::do_insert(ByteCoord pos, StringView content)
{ {
kak_assert(is_valid(pos)); kak_assert(is_valid(pos));
@ -341,7 +342,7 @@ ByteCoord Buffer::do_erase(ByteCoord begin, ByteCoord end)
void Buffer::apply_modification(const Modification& modification) void Buffer::apply_modification(const Modification& modification)
{ {
const String& content = modification.content; StringView content = modification.content;
ByteCoord coord = modification.coord; ByteCoord coord = modification.coord;
kak_assert(is_valid(coord)); kak_assert(is_valid(coord));

View File

@ -190,7 +190,7 @@ private:
}; };
LineList m_lines; LineList m_lines;
ByteCoord do_insert(ByteCoord pos, const String& content); ByteCoord do_insert(ByteCoord pos, StringView content);
ByteCoord do_erase(ByteCoord begin, ByteCoord end); ByteCoord do_erase(ByteCoord begin, ByteCoord end);
String m_name; String m_name;

50
src/interned_string.cc Normal file
View File

@ -0,0 +1,50 @@
#include "interned_string.hh"
namespace Kakoune
{
// Returns an InternedString referring to the registry's copy of str.
// When the content is already registered its reference count is bumped;
// otherwise the characters are copied into a storage slot (a previously
// released slot is reused when one is available) with a count of one.
InternedString StringRegistry::acquire(StringView str)
{
    const auto found = m_slot_map.find(str);
    const bool known = found != m_slot_map.end();

    size_t index;
    if (known)
    {
        index = found->second;
        m_storage[index].second++;
    }
    else if (m_free_slots.empty())
    {
        index = m_storage.size();
        m_storage.push_back(DataAndRefCount({str.begin(), str.end()}, 1));
    }
    else
    {
        index = m_free_slots.back();
        m_free_slots.pop_back();
        m_storage[index] = DataAndRefCount({str.begin(), str.end()}, 1);
    }

    // The view handed out (and used as map key) points at the registry's
    // own copy of the characters, never at the caller's data.
    StringView storage_view{m_storage[index].first.data(), (int)m_storage[index].first.size()};
    if (not known)
        m_slot_map[storage_view] = index;
    return InternedString{storage_view, InternedString::AlreadyAcquired{}};
}
// Drops one reference on the registered string matching str. When the last
// reference goes away the slot is recorded as free, the map entry is removed
// and the stored characters are cleared.
void StringRegistry::release(StringView str)
{
    auto entry = m_slot_map.find(str);
    kak_assert(entry != m_slot_map.end());

    const size_t index = entry->second;
    DataAndRefCount& stored = m_storage[index];
    if (--stored.second != 0)
        return;

    m_free_slots.push_back(index);
    // The map key is a view into the stored characters, so the entry is
    // erased before those characters are cleared.
    m_slot_map.erase(entry);
    stored.first.clear();
}
}

111
src/interned_string.hh Normal file
View File

@ -0,0 +1,111 @@
#ifndef interned_string_hh_INCLUDED
#define interned_string_hh_INCLUDED
#include "string.hh"
#include "utils.hh"
#include <unordered_map>
namespace Kakoune
{
class InternedString;
// Registry of string contents shared by InternedString instances.
// Only InternedString (a friend) can call acquire/release.
class StringRegistry : public Singleton<StringRegistry>
{
private:
friend class InternedString;
// Get a counted reference on the registered string whose content equals str.
InternedString acquire(StringView str);
// Give back one reference previously obtained through acquire.
void release(StringView str);
// Maps string content to the index of its slot in m_storage.
std::unordered_map<StringView, size_t> m_slot_map;
// Indices of m_storage slots whose string was fully released.
std::vector<size_t> m_free_slots;
// first: the characters of the string, second: its reference count.
using DataAndRefCount = std::pair<std::vector<char>, int>;
std::vector<DataAndRefCount> m_storage;
};
// Reference counted handle on a string stored in the StringRegistry.
//
// An InternedString is a StringView over the registry's copy of the
// characters. A non-empty instance owns exactly one reference, acquired on
// construction/copy and released on destruction/assignment. Empty strings are
// never registered. Equality between two InternedString compares data
// pointer and length rather than content.
class InternedString : public StringView
{
public:
    InternedString() = default;

    InternedString(const InternedString& str) { acquire_ifn(str); }

    // Steals the reference held by str, which is left empty. Only views are
    // copied, so this cannot throw; noexcept lets std::vector move (instead
    // of copy) its elements when it reallocates.
    InternedString(InternedString&& str) noexcept : StringView(str)
    {
        static_cast<StringView&>(str) = StringView{};
    }

    InternedString(const char* str) : StringView() { acquire_ifn(str); }
    InternedString(StringView str) : StringView() { acquire_ifn(str); }
    InternedString(const String& str) : StringView() { acquire_ifn(str); }

    InternedString& operator=(const InternedString& str)
    {
        // Same interned data (this includes self assignment): nothing to do.
        if (str.data() == data() && str.length() == length())
            return *this;

        release_ifn();
        acquire_ifn(str);
        return *this;
    }

    InternedString& operator=(InternedString&& str)
    {
        // A self move must not drop the string we hold.
        if (this == &str)
            return *this;

        // Give back our current reference before taking over the one held by
        // str, otherwise it would never be released (this happens whenever
        // the target is not empty, e.g. when std::vector::erase move-assigns
        // over live elements).
        release_ifn();
        static_cast<StringView&>(*this) = str;
        static_cast<StringView&>(str) = StringView{};
        return *this;
    }

    ~InternedString()
    {
        release_ifn();
    }

    bool operator==(const InternedString& str) const
    { return data() == str.data() && length() == str.length(); }
    bool operator!=(const InternedString& str) const
    { return !(*this == str); }

    using StringView::operator==;
    using StringView::operator!=;

private:
    friend class StringRegistry;

    // Tag selecting the constructor that adopts a reference which has
    // already been counted by the registry.
    struct AlreadyAcquired{};
    InternedString(StringView str, AlreadyAcquired)
        : StringView(str) {}

    // Makes *this refer to the registered copy of str (or to nothing when str
    // is empty). *this must not hold a reference when this is called.
    void acquire_ifn(StringView str)
    {
        if (str.empty())
            static_cast<StringView&>(*this) = StringView{};
        else
            *this = StringRegistry::instance().acquire(str);
    }

    // Releases the reference held by *this, if any, and leaves *this empty so
    // that a second call is a no-op.
    void release_ifn()
    {
        if (!empty())
        {
            StringRegistry::instance().release(*this);
            static_cast<StringView&>(*this) = StringView{};
        }
    }
};
}
namespace std
{
// Hash for InternedString built from the data pointer and the length, the
// same two values InternedString::operator== compares.
template<>
struct hash<Kakoune::InternedString>
{
    size_t operator()(const Kakoune::InternedString& str) const
    {
        const size_t address_hash = hash<const char*>{}(str.data());
        const size_t length_hash = hash<int>{}((int)str.length());
        return address_hash ^ length_hash;
    }
};
}
#endif // interned_string_hh_INCLUDED

View File

@ -20,6 +20,7 @@
#include "remote.hh" #include "remote.hh"
#include "shell_manager.hh" #include "shell_manager.hh"
#include "string.hh" #include "string.hh"
#include "interned_string.hh"
#include "window.hh" #include "window.hh"
#if defined(__APPLE__) #if defined(__APPLE__)
@ -303,6 +304,7 @@ int run_server(StringView session, StringView init_command,
signal(SIGTERM, [](int) { terminate = true; }); signal(SIGTERM, [](int) { terminate = true; });
} }
StringRegistry string_registry;
EventManager event_manager; EventManager event_manager;
GlobalOptions global_options; GlobalOptions global_options;
GlobalHooks global_hooks; GlobalHooks global_hooks;

View File

@ -7,6 +7,14 @@
namespace Kakoune namespace Kakoune
{ {
// Lexicographical byte-wise ordering of two string views; when one is a
// prefix of the other, the shorter one sorts first.
bool operator<(StringView lhs, StringView rhs)
{
    const int common = (int)std::min(lhs.length(), rhs.length());
    // memcmp rather than strncmp: views are not NUL terminated and may
    // contain NUL bytes, at which strncmp would stop comparing. Bytes are
    // still compared as unsigned char, as strncmp did. The comparison is
    // skipped for an empty common part so that the data pointers of empty
    // views are never handed to memcmp.
    const int cmp = common > 0 ? memcmp(lhs.data(), rhs.data(), (size_t)common) : 0;
    if (cmp == 0)
        return lhs.length() < rhs.length();
    return cmp < 0;
}
std::vector<String> split(StringView str, char separator, char escape) std::vector<String> split(StringView str, char separator, char escape)
{ {
std::vector<String> res; std::vector<String> res;
@ -139,4 +147,64 @@ String expand_tabs(StringView line, CharCount tabstop, CharCount col)
return res; return res;
} }
// Rotates the 32 bits of x left by r positions (r is taken modulo 32).
[[gnu::always_inline]]
static inline uint32_t rotl(uint32_t x, int8_t r)
{
    // Both shift counts are masked to [0, 31]: without it r == 0 would shift
    // right by 32, which is undefined for a 32 bit operand.
    const unsigned shift = static_cast<unsigned>(r) & 31u;
    return (x << shift) | (x >> ((32u - shift) & 31u));
}
// Final mixing step applied to the hash state by hash_data: alternating
// xor-shift and multiply rounds over the 32 bit value.
[[gnu::always_inline]]
static inline uint32_t fmix(uint32_t h)
{
    h = (h ^ (h >> 16)) * 0x85ebca6b;
    h = (h ^ (h >> 13)) * 0xc2b2ae35;
    return h ^ (h >> 16);
}
// murmur3 hash, based on https://github.com/PeterScott/murmur3
//
// Hashes the len bytes starting at input with a fixed seed and returns the
// resulting 32 bit value. input is only read when len is not 0.
size_t hash_data(const char* input, size_t len)
{
    const uint8_t* data = reinterpret_cast<const uint8_t*>(input);
    uint32_t hash = 0x1235678;
    constexpr uint32_t c1 = 0xcc9e2d51;
    constexpr uint32_t c2 = 0x1b873593;

    // Body: consume the input four bytes at a time.
    const size_t nblocks = len / 4;
    for (size_t i = 0; i < nblocks; ++i)
    {
        // input has no alignment guarantee and is not a uint32_t object, so
        // the block is copied out instead of being read through a casted
        // pointer (which would be an unaligned, aliasing-violating access).
        uint32_t key;
        memcpy(&key, data + i * 4, sizeof(key));

        key *= c1;
        key = rotl(key, 15);
        key *= c2;

        hash ^= key;
        hash = rotl(hash, 13);
        hash = hash * 5 + 0xe6546b64;
    }

    // Tail: the 0 to 3 bytes left after the last full block.
    const uint8_t* tail = data + nblocks * 4;
    uint32_t key = 0;
    switch (len & 3)
    {
        case 3: key ^= tail[2] << 16; // fall through
        case 2: key ^= tail[1] << 8;  // fall through
        case 1: key ^= tail[0];
                key *= c1;
                key = rotl(key, 15);
                key *= c2;
                hash ^= key;
    }

    // Finalization: only the low 32 bits of the length take part.
    hash ^= static_cast<uint32_t>(len);
    hash = fmix(hash);
    return hash;
}
} }

View File

@ -140,6 +140,8 @@ inline bool StringView::operator!=(StringView other) const
return !this->operator==(other); return !this->operator==(other);
} }
bool operator<(StringView lhs, StringView rhs);
inline bool operator==(const char* lhs, StringView rhs) inline bool operator==(const char* lhs, StringView rhs)
{ {
return StringView{lhs} == rhs; return StringView{lhs} == rhs;
@ -304,6 +306,8 @@ bool subsequence_match(StringView str, StringView subseq);
String expand_tabs(StringView line, CharCount tabstop, CharCount col = 0); String expand_tabs(StringView line, CharCount tabstop, CharCount col = 0);
size_t hash_data(const char* data, size_t len);
} }
namespace std namespace std
@ -316,6 +320,15 @@ namespace std
return hash<std::string>::operator()(str); return hash<std::string>::operator()(str);
} }
}; };
// Content based hash for StringView: feeds the viewed bytes to
// Kakoune::hash_data.
template<>
struct hash<Kakoune::StringView>
{
    size_t operator()(Kakoune::StringView str) const
    {
        const char* bytes = str.data();
        const int count = (int)str.length();
        return Kakoune::hash_data(bytes, count);
    }
};
} }
#endif // string_hh_INCLUDED #endif // string_hh_INCLUDED

View File

@ -82,17 +82,17 @@ void test_word_db()
WordDB word_db(buffer); WordDB word_db(buffer);
auto res = word_db.find_prefix(""); auto res = word_db.find_prefix("");
std::sort(res.begin(), res.end()); std::sort(res.begin(), res.end());
kak_assert(res == std::vector<String>{ "allo" COMMA "kanaky" COMMA "mutch" COMMA "tchaa" COMMA "tchou" }); kak_assert(res == std::vector<InternedString>{ "allo" COMMA "kanaky" COMMA "mutch" COMMA "tchaa" COMMA "tchou" });
kak_assert(word_db.get_word_occurences("tchou") == 3); kak_assert(word_db.get_word_occurences("tchou") == 3);
kak_assert(word_db.get_word_occurences("allo") == 1); kak_assert(word_db.get_word_occurences("allo") == 1);
buffer.erase(buffer.iterator_at({1, 6}), buffer.iterator_at({4, 0})); buffer.erase(buffer.iterator_at({1, 6}), buffer.iterator_at({4, 0}));
res = word_db.find_prefix(""); res = word_db.find_prefix("");
std::sort(res.begin(), res.end()); std::sort(res.begin(), res.end());
kak_assert(res == std::vector<String>{ "allo" COMMA "mutch" COMMA "tchou" }); kak_assert(res == std::vector<InternedString>{ "allo" COMMA "mutch" COMMA "tchou" });
buffer.insert(buffer.iterator_at({1, 0}), "re"); buffer.insert(buffer.iterator_at({1, 0}), "re");
res = word_db.find_prefix(""); res = word_db.find_prefix("");
std::sort(res.begin(), res.end()); std::sort(res.begin(), res.end());
kak_assert(res == std::vector<String>{ "allo" COMMA "mutch" COMMA "retchou" COMMA "tchou" }); kak_assert(res == std::vector<InternedString>{ "allo" COMMA "mutch" COMMA "retchou" COMMA "tchou" });
} }
void test_utf8() void test_utf8()

View File

@ -7,9 +7,9 @@
namespace Kakoune namespace Kakoune
{ {
static std::vector<String> get_words(StringView content) static std::vector<InternedString> get_words(StringView content)
{ {
std::vector<String> res; std::vector<InternedString> res;
using Iterator = utf8::iterator<const char*, utf8::InvalidPolicy::Pass>; using Iterator = utf8::iterator<const char*, utf8::InvalidPolicy::Pass>;
const char* word_start = content.begin(); const char* word_start = content.begin();
bool in_word = false; bool in_word = false;
@ -24,20 +24,20 @@ static std::vector<String> get_words(StringView content)
} }
else if (in_word and not word) else if (in_word and not word)
{ {
res.push_back({word_start, it.base()}); res.push_back(StringView{word_start, it.base()});
in_word = false; in_word = false;
} }
} }
return res; return res;
} }
static void add_words(WordDB::WordList& wl, const std::vector<String>& words) static void add_words(WordDB::WordList& wl, const std::vector<InternedString>& words)
{ {
for (auto& w : words) for (auto& w : words)
++wl[w]; ++wl[w];
} }
static void remove_words(WordDB::WordList& wl, const std::vector<String>& words) static void remove_words(WordDB::WordList& wl, const std::vector<InternedString>& words)
{ {
for (auto& w : words) for (auto& w : words)
{ {
@ -104,11 +104,11 @@ void WordDB::update_db()
m_line_to_words = std::move(new_lines); m_line_to_words = std::move(new_lines);
} }
std::vector<String> WordDB::find_prefix(const String& prefix) std::vector<InternedString> WordDB::find_prefix(StringView prefix)
{ {
update_db(); update_db();
std::vector<String> res; std::vector<InternedString> res;
for (auto it = m_words.lower_bound(prefix); it != m_words.end(); ++it) for (auto it = m_words.lower_bound(prefix); it != m_words.end(); ++it)
{ {
if (not prefix_match(it->first, prefix)) if (not prefix_match(it->first, prefix))
@ -118,11 +118,11 @@ std::vector<String> WordDB::find_prefix(const String& prefix)
return res; return res;
} }
std::vector<String> WordDB::find_subsequence(const String& subsequence) std::vector<InternedString> WordDB::find_subsequence(StringView subsequence)
{ {
update_db(); update_db();
std::vector<String> res; std::vector<InternedString> res;
for (auto it = m_words.begin(); it != m_words.end(); ++it) for (auto it = m_words.begin(); it != m_words.end(); ++it)
{ {
if (subsequence_match(it->first, subsequence)) if (subsequence_match(it->first, subsequence))
@ -131,7 +131,7 @@ std::vector<String> WordDB::find_subsequence(const String& subsequence)
return res; return res;
} }
int WordDB::get_word_occurences(const String& word) const int WordDB::get_word_occurences(StringView word) const
{ {
auto it = m_words.find(word); auto it = m_words.find(word);
if (it != m_words.end()) if (it != m_words.end())

View File

@ -2,27 +2,26 @@
#define word_db_hh_INCLUDED #define word_db_hh_INCLUDED
#include "buffer.hh" #include "buffer.hh"
#include "interned_string.hh"
#include <map> #include <map>
namespace Kakoune namespace Kakoune
{ {
class String;
// maintain a database of words available in a buffer // maintain a database of words available in a buffer
class WordDB class WordDB
{ {
public: public:
WordDB(const Buffer& buffer); WordDB(const Buffer& buffer);
std::vector<String> find_prefix(const String& prefix); std::vector<InternedString> find_prefix(StringView prefix);
std::vector<String> find_subsequence(const String& subsequence); std::vector<InternedString> find_subsequence(StringView subsequence);
int get_word_occurences(const String& word) const; int get_word_occurences(StringView word) const;
using WordList = std::map<String, int>; using WordList = std::map<InternedString, int>;
private: private:
using LineToWords = std::vector<std::vector<String>>; using LineToWords = std::vector<std::vector<InternedString>>;
void update_db(); void update_db();