From d55d041c6aa9941cc74853e4b2bf6620773c84c1 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Wed, 1 Oct 2014 00:20:12 +0100 Subject: [PATCH] Add support for interned strings Use interned strings for Modification contents and word database. Interned strings are guaranteed not to move in memory and are reference counted. --- src/buffer.cc | 11 ++-- src/buffer.hh | 2 +- src/interned_string.cc | 50 +++++++++++++++++++ src/interned_string.hh | 111 +++++++++++++++++++++++++++++++++++++++++ src/main.cc | 2 + src/string.cc | 68 +++++++++++++++++++++++++ src/string.hh | 13 +++++ src/unit_tests.cc | 6 +-- src/word_db.cc | 20 ++++---- src/word_db.hh | 13 +++-- 10 files changed, 270 insertions(+), 26 deletions(-) create mode 100644 src/interned_string.cc create mode 100644 src/interned_string.hh diff --git a/src/buffer.cc b/src/buffer.cc index 7c7a4e63..aa6314d6 100644 --- a/src/buffer.cc +++ b/src/buffer.cc @@ -2,11 +2,12 @@ #include "assert.hh" #include "buffer_manager.hh" +#include "client.hh" #include "context.hh" #include "file.hh" +#include "interned_string.hh" #include "utils.hh" #include "window.hh" -#include "client.hh" #include @@ -170,9 +171,9 @@ struct Buffer::Modification Type type; ByteCoord coord; - String content; + InternedString content; - Modification(Type type, ByteCoord coord, String content) + Modification(Type type, ByteCoord coord, InternedString content) : type(type), coord(coord), content(std::move(content)) {} Modification inverse() const @@ -240,7 +241,7 @@ void Buffer::check_invariant() const #endif } -ByteCoord Buffer::do_insert(ByteCoord pos, const String& content) +ByteCoord Buffer::do_insert(ByteCoord pos, StringView content) { kak_assert(is_valid(pos)); @@ -341,7 +342,7 @@ ByteCoord Buffer::do_erase(ByteCoord begin, ByteCoord end) void Buffer::apply_modification(const Modification& modification) { - const String& content = modification.content; + StringView content = modification.content; ByteCoord coord = modification.coord; kak_assert(is_valid(coord)); diff --git a/src/buffer.hh b/src/buffer.hh index 17e0e4c7..e24bc786 100644 --- a/src/buffer.hh +++ b/src/buffer.hh @@ -190,7 +190,7 @@ private: }; LineList m_lines; - ByteCoord do_insert(ByteCoord pos, const String& content); + ByteCoord do_insert(ByteCoord pos, StringView content); ByteCoord do_erase(ByteCoord begin, ByteCoord end); String m_name; diff --git a/src/interned_string.cc b/src/interned_string.cc new file mode 100644 index 00000000..b8825f66 --- /dev/null +++ b/src/interned_string.cc @@ -0,0 +1,50 @@ +#include "interned_string.hh" + +namespace Kakoune +{ + +InternedString StringRegistry::acquire(StringView str) +{ + auto it = m_slot_map.find(str); + if (it == m_slot_map.end()) + { + size_t slot; + if (not m_free_slots.empty()) + { + slot = m_free_slots.back(); + m_free_slots.pop_back(); + m_storage[slot] = DataAndRefCount({str.begin(), str.end()}, 1); + } + else + { + slot = m_storage.size(); + m_storage.push_back(DataAndRefCount({str.begin(), str.end()}, 1)); + } + // Create a new string view that point to the storage data + StringView storage_view{m_storage[slot].first.data(), (int)m_storage[slot].first.size()}; + m_slot_map[storage_view] = slot; + + return InternedString{storage_view, InternedString::AlreadyAcquired{}}; + } + + size_t slot = it->second; + m_storage[slot].second++; + StringView storage_view{m_storage[slot].first.data(), (int)m_storage[slot].first.size()}; + return InternedString{storage_view, InternedString::AlreadyAcquired{}}; +} + +void StringRegistry::release(StringView str) +{ + auto it = m_slot_map.find(str); + kak_assert(it != m_slot_map.end()); + + size_t slot = it->second; + if (--m_storage[slot].second == 0) + { + m_free_slots.push_back(slot); + m_slot_map.erase(it); + m_storage[slot].first.clear(); + } +} + +} diff --git a/src/interned_string.hh b/src/interned_string.hh new file mode 100644 index 00000000..5d81afd3 --- /dev/null +++ b/src/interned_string.hh @@ -0,0 +1,111 @@ +#ifndef interned_string_hh_INCLUDED +#define interned_string_hh_INCLUDED + +#include "string.hh" +#include "utils.hh" + +#include + +namespace Kakoune +{ + +class InternedString; + +class StringRegistry : public Singleton +{ +private: + friend class InternedString; + + InternedString acquire(StringView str); + void release(StringView str); + + std::unordered_map m_slot_map; + std::vector m_free_slots; + using DataAndRefCount = std::pair, int>; + std::vector m_storage; +}; + +class InternedString : public StringView +{ +public: + InternedString() = default; + + InternedString(const InternedString& str) { acquire_ifn(str); } + + InternedString(InternedString&& str) : StringView(str) + { + static_cast(str) = StringView{}; + } + + InternedString(const char* str) : StringView() { acquire_ifn(str); } + InternedString(StringView str) : StringView() { acquire_ifn(str); } + InternedString(const String& str) : StringView() { acquire_ifn(str); } + + InternedString& operator=(const InternedString& str) + { + if (str.data() == data() && str.length() == length()) + return *this; + release_ifn(); + acquire_ifn(str); + return *this; + } + + InternedString& operator=(InternedString&& str) + { + static_cast(*this) = str; + static_cast(str) = StringView{}; + return *this; + } + + ~InternedString() + { + release_ifn(); + } + + bool operator==(const InternedString& str) const + { return data() == str.data() && length() == str.length(); } + bool operator!=(const InternedString& str) const + { return !(*this == str); } + + using StringView::operator==; + using StringView::operator!=; + +private: + friend class StringRegistry; + + struct AlreadyAcquired{}; + InternedString(StringView str, AlreadyAcquired) + : StringView(str) {} + + void acquire_ifn(StringView str) + { + if (str.empty()) + static_cast(*this) = StringView{}; + else + *this = StringRegistry::instance().acquire(str); + } + + void release_ifn() + { + if (!empty()) + StringRegistry::instance().release(*this); + } +}; + +} + +namespace std +{ + template<> + struct hash + { + size_t operator()(const Kakoune::InternedString& str) const + { + return hash{}(str.data()) ^ + hash{}((int)str.length()); + } + }; +} + +#endif // interned_string_hh_INCLUDED + diff --git a/src/main.cc b/src/main.cc index 32ed0c0c..654d3dc5 100644 --- a/src/main.cc +++ b/src/main.cc @@ -20,6 +20,7 @@ #include "remote.hh" #include "shell_manager.hh" #include "string.hh" +#include "interned_string.hh" #include "window.hh" #if defined(__APPLE__) @@ -303,6 +304,7 @@ int run_server(StringView session, StringView init_command, signal(SIGTERM, [](int) { terminate = true; }); } + StringRegistry string_registry; EventManager event_manager; GlobalOptions global_options; GlobalHooks global_hooks; diff --git a/src/string.cc b/src/string.cc index abc95ab4..172193d4 100644 --- a/src/string.cc +++ b/src/string.cc @@ -7,6 +7,14 @@ namespace Kakoune { +bool operator<(StringView lhs, StringView rhs) +{ + int cmp = strncmp(lhs.data(), rhs.data(), (int)std::min(lhs.length(), rhs.length())); + if (cmp == 0) + return lhs.length() < rhs.length(); + return cmp < 0; +} + std::vector split(StringView str, char separator, char escape) { std::vector res; @@ -139,4 +147,64 @@ String expand_tabs(StringView line, CharCount tabstop, CharCount col) return res; } +[[gnu::always_inline]] +static inline uint32_t rotl(uint32_t x, int8_t r) +{ + return (x << r) | (x >> (32 - r)); +} + +[[gnu::always_inline]] +static inline uint32_t fmix(uint32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +// murmur3 hash, based on https://github.com/PeterScott/murmur3 +size_t hash_data(const char* input, size_t len) +{ + const uint8_t* data = reinterpret_cast(input); + uint32_t hash = 0x1235678; + constexpr uint32_t c1 = 0xcc9e2d51; + constexpr uint32_t c2 = 0x1b873593; + + const int nblocks = len / 4; + const uint32_t* blocks = reinterpret_cast(data + nblocks*4); + + for (int i = -nblocks; i; ++i) + { + uint32_t key = blocks[i]; + key *= c1; + key = rotl(key, 15); + key *= c2; + + hash ^= key; + hash = rotl(hash, 13); + hash = hash * 5 + 0xe6546b64; + } + + const uint8_t* tail = data + nblocks * 4; + uint32_t key = 0; + switch (len & 3) + { + case 3: key ^= tail[2] << 16; + case 2: key ^= tail[1] << 8; + case 1: key ^= tail[0]; + key *= c1; + key = rotl(key,15); + key *= c2; + hash ^= key; + } + + hash ^= len; + hash = fmix(hash); + + return hash; +} + } diff --git a/src/string.hh b/src/string.hh index 8f3df1a3..573853fe 100644 --- a/src/string.hh +++ b/src/string.hh @@ -140,6 +140,8 @@ inline bool StringView::operator!=(StringView other) const return !this->operator==(other); } +bool operator<(StringView lhs, StringView rhs); + inline bool operator==(const char* lhs, StringView rhs) { return StringView{lhs} == rhs; @@ -304,6 +306,8 @@ bool subsequence_match(StringView str, StringView subseq); String expand_tabs(StringView line, CharCount tabstop, CharCount col = 0); +size_t hash_data(const char* data, size_t len); + } namespace std @@ -316,6 +320,15 @@ namespace std return hash::operator()(str); } }; + + template<> + struct hash + { + size_t operator()(Kakoune::StringView str) const + { + return Kakoune::hash_data(str.data(), (int)str.length()); + } + }; } #endif // string_hh_INCLUDED diff --git a/src/unit_tests.cc b/src/unit_tests.cc index 3795b8a5..3c9e5c17 100644 --- a/src/unit_tests.cc +++ b/src/unit_tests.cc @@ -82,17 +82,17 @@ void test_word_db() WordDB word_db(buffer); auto res = word_db.find_prefix(""); std::sort(res.begin(), res.end()); - kak_assert(res == std::vector{ "allo" COMMA "kanaky" COMMA "mutch" COMMA "tchaa" COMMA "tchou" }); + kak_assert(res == std::vector{ "allo" COMMA "kanaky" COMMA "mutch" COMMA "tchaa" COMMA "tchou" }); kak_assert(word_db.get_word_occurences("tchou") == 3); kak_assert(word_db.get_word_occurences("allo") == 1); buffer.erase(buffer.iterator_at({1, 6}), buffer.iterator_at({4, 0})); res = word_db.find_prefix(""); std::sort(res.begin(), res.end()); - kak_assert(res == std::vector{ "allo" COMMA "mutch" COMMA "tchou" }); + kak_assert(res == std::vector{ "allo" COMMA "mutch" COMMA "tchou" }); buffer.insert(buffer.iterator_at({1, 0}), "re"); res = word_db.find_prefix(""); std::sort(res.begin(), res.end()); - kak_assert(res == std::vector{ "allo" COMMA "mutch" COMMA "retchou" COMMA "tchou" }); + kak_assert(res == std::vector{ "allo" COMMA "mutch" COMMA "retchou" COMMA "tchou" }); } void test_utf8() diff --git a/src/word_db.cc b/src/word_db.cc index e3be4245..8749b451 100644 --- a/src/word_db.cc +++ b/src/word_db.cc @@ -7,9 +7,9 @@ namespace Kakoune { -static std::vector get_words(StringView content) +static std::vector get_words(StringView content) { - std::vector res; + std::vector res; using Iterator = utf8::iterator; const char* word_start = content.begin(); bool in_word = false; @@ -24,20 +24,20 @@ static std::vector get_words(StringView content) } else if (in_word and not word) { - res.push_back({word_start, it.base()}); + res.push_back(StringView{word_start, it.base()}); in_word = false; } } return res; } -static void add_words(WordDB::WordList& wl, const std::vector& words) +static void add_words(WordDB::WordList& wl, const std::vector& words) { for (auto& w : words) ++wl[w]; } -static void remove_words(WordDB::WordList& wl, const std::vector& words) +static void remove_words(WordDB::WordList& wl, const std::vector& words) { for (auto& w : words) { @@ -104,11 +104,11 @@ void WordDB::update_db() m_line_to_words = std::move(new_lines); } -std::vector WordDB::find_prefix(const String& prefix) +std::vector WordDB::find_prefix(StringView prefix) { update_db(); - std::vector res; + std::vector res; for (auto it = m_words.lower_bound(prefix); it != m_words.end(); ++it) { if (not prefix_match(it->first, prefix)) @@ -118,11 +118,11 @@ std::vector WordDB::find_prefix(const String& prefix) return res; } -std::vector WordDB::find_subsequence(const String& subsequence) +std::vector WordDB::find_subsequence(StringView subsequence) { update_db(); - std::vector res; + std::vector res; for (auto it = m_words.begin(); it != m_words.end(); ++it) { if (subsequence_match(it->first, subsequence)) @@ -131,7 +131,7 @@ std::vector WordDB::find_subsequence(const String& subsequence) return res; } -int WordDB::get_word_occurences(const String& word) const +int WordDB::get_word_occurences(StringView word) const { auto it = m_words.find(word); if (it != m_words.end()) diff --git a/src/word_db.hh b/src/word_db.hh index 043f9960..ee4e6ab5 100644 --- a/src/word_db.hh +++ b/src/word_db.hh @@ -2,27 +2,26 @@ #define word_db_hh_INCLUDED #include "buffer.hh" +#include "interned_string.hh" #include namespace Kakoune { -class String; - // maintain a database of words available in a buffer class WordDB { public: WordDB(const Buffer& buffer); - std::vector find_prefix(const String& prefix); - std::vector find_subsequence(const String& subsequence); - int get_word_occurences(const String& word) const; + std::vector find_prefix(StringView prefix); + std::vector find_subsequence(StringView subsequence); + int get_word_occurences(StringView word) const; - using WordList = std::map; + using WordList = std::map; private: - using LineToWords = std::vector>; + using LineToWords = std::vector>; void update_db();