From e4a23a64faf56b5822fee13d976e038174408238 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Fri, 28 May 2021 17:03:06 +1000 Subject: [PATCH] Support opening files bigger than 2 GiB The real technical limit is with lines bigger than 2 GiB and buffers with more than 2 Gi lines, refactor buffer loading to make it possible to load those files. Fix an overflow with the hash_data function at the same time --- src/buffer.cc | 120 ++++++++------------------------------- src/buffer.hh | 8 ++- src/buffer_manager.cc | 10 ++-- src/buffer_manager.hh | 6 +- src/buffer_utils.cc | 90 +++++++++++++++++++++++------ src/buffer_utils.hh | 1 + src/commands.cc | 2 +- src/file.cc | 4 +- src/hash.cc | 4 +- src/line_modification.cc | 12 ++-- src/main.cc | 4 +- src/word_db.cc | 8 +-- 12 files changed, 126 insertions(+), 143 deletions(-) diff --git a/src/buffer.cc b/src/buffer.cc index ddbad72c..2ed27bb8 100644 --- a/src/buffer.cc +++ b/src/buffer.cc @@ -20,57 +20,13 @@ namespace Kakoune { -struct ParsedLines -{ - BufferLines lines; - ByteOrderMark bom = ByteOrderMark::None; - EolFormat eolformat = EolFormat::Lf; -}; - -static ParsedLines parse_lines(StringView data) -{ - ParsedLines res; - const char* pos = data.begin(); - if (data.substr(0, 3_byte) == "\xEF\xBB\xBF") - { - res.bom = ByteOrderMark::Utf8; - pos = data.begin() + 3; - } - - bool has_crlf = false, has_lf = false; - for (auto it = pos; it != data.end(); ++it) - { - if (*it == '\n') - ((it != pos and *(it-1) == '\r') ? has_crlf : has_lf) = true; - } - const bool crlf = has_crlf and not has_lf; - res.eolformat = crlf ? EolFormat::Crlf : EolFormat::Lf; - - while (pos < data.end()) - { - const char* eol = std::find(pos, data.end(), '\n'); - res.lines.emplace_back(StringData::create({{pos, eol - (crlf and eol != data.end() ? 
1 : 0)}, "\n"})); - pos = eol + 1; - } - - if (res.lines.empty()) - res.lines.emplace_back(StringData::create({"\n"})); - - return res; -} - -static void apply_options(OptionManager& options, const ParsedLines& parsed_lines) -{ - options.get_local_option("eolformat").set(parsed_lines.eolformat); - options.get_local_option("BOM").set(parsed_lines.bom); -} - Buffer::HistoryNode::HistoryNode(HistoryId parent) : parent{parent}, committed{Clock::now()} {} -Buffer::Buffer(String name, Flags flags, StringView data, - timespec fs_timestamp) +Buffer::Buffer(String name, Flags flags, BufferLines lines, + ByteOrderMark bom, EolFormat eolformat, + FsStatus fs_status) : Scope{GlobalScope::instance()}, m_name{(flags & Flags::File) ? real_path(parse_filename(name)) : std::move(name)}, m_display_name{(flags & Flags::File) ? compact_path(m_name) : m_name}, @@ -78,20 +34,19 @@ Buffer::Buffer(String name, Flags flags, StringView data, m_history{{HistoryId::Invalid}}, m_history_id{HistoryId::First}, m_last_save_history_id{HistoryId::First}, - m_fs_status{fs_timestamp, data.length(), hash_value(data)} + m_fs_status{fs_status} { - ParsedLines parsed_lines = parse_lines(data); - #ifdef KAK_DEBUG - for (auto& line : parsed_lines.lines) + for (auto& line : lines) kak_assert(not (line->length == 0) and line->data()[line->length-1] == '\n'); #endif - static_cast<BufferLines&>(m_lines) = std::move(parsed_lines.lines); + static_cast<BufferLines&>(m_lines) = std::move(lines); m_changes.push_back({ Change::Insert, {0,0}, line_count() }); - apply_options(options(), parsed_lines); + options().get_local_option("eolformat").set(eolformat); + options().get_local_option("BOM").set(bom); // now we may begin to record undo data if (not (flags & Flags::NoUndo)) @@ -236,10 +191,8 @@ Buffer::Modification Buffer::Modification::inverse() const return {type == Insert ? 
Erase : Insert, coord, content}; } -void Buffer::reload(StringView data, timespec fs_timestamp) +void Buffer::reload(BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status) { - ParsedLines parsed_lines = parse_lines(data); - const bool record_undo = not (m_flags & Flags::NoUndo); commit_undo_group(); @@ -252,21 +205,21 @@ void Buffer::reload(StringView data, timespec fs_timestamp) m_history = {HistoryNode{HistoryId::Invalid}}; m_changes.push_back({ Change::Erase, {0,0}, line_count() }); - static_cast<BufferLines&>(m_lines) = std::move(parsed_lines.lines); + static_cast<BufferLines&>(m_lines) = std::move(lines); m_changes.push_back({ Change::Insert, {0,0}, line_count() }); } else { Vector<Diff> diff; for_each_diff(m_lines.begin(), m_lines.size(), - parsed_lines.lines.begin(), parsed_lines.lines.size(), + lines.begin(), lines.size(), [&diff](DiffOp op, int len) { diff.push_back({op, len}); }, [](const StringDataPtr& lhs, const StringDataPtr& rhs) { return lhs->strview() == rhs->strview(); }); auto it = m_lines.begin(); - auto new_it = parsed_lines.lines.begin(); + auto new_it = lines.begin(); for (auto& d : diff) { if (d.op == DiffOp::Keep) @@ -303,10 +256,12 @@ void Buffer::reload(StringView data, timespec fs_timestamp) commit_undo_group(); - apply_options(options(), parsed_lines); + options().get_local_option("eolformat").set(eolformat); + options().get_local_option("BOM").set(bom); + m_last_save_history_id = m_history_id; - m_fs_status = {fs_timestamp, data.length(), hash_value(data)}; + m_fs_status = fs_status; } void Buffer::commit_undo_group() @@ -729,44 +684,13 @@ String Buffer::debug_description() const content_size, additional_size); } -UnitTest test_parse_line{[] -{ - { - auto lines = parse_lines("foo\nbar\nbaz\n"); - kak_assert(lines.eolformat == EolFormat::Lf); - kak_assert(lines.bom == ByteOrderMark::None); - kak_assert(lines.lines.size() == 3); - kak_assert(lines.lines[0]->strview() == "foo\n"); - kak_assert(lines.lines[1]->strview() == "bar\n"); - 
kak_assert(lines.lines[2]->strview() == "baz\n"); - } - - { - auto lines = parse_lines("\xEF\xBB\xBF" "foo\nbar\r\nbaz"); - kak_assert(lines.eolformat == EolFormat::Lf); - kak_assert(lines.bom == ByteOrderMark::Utf8); - kak_assert(lines.lines.size() == 3); - kak_assert(lines.lines[0]->strview() == "foo\n"); - kak_assert(lines.lines[1]->strview() == "bar\r\n"); - kak_assert(lines.lines[2]->strview() == "baz\n"); - } - - { - auto lines = parse_lines("foo\r\nbar\r\nbaz\r\n"); - kak_assert(lines.eolformat == EolFormat::Crlf); - kak_assert(lines.bom == ByteOrderMark::None); - kak_assert(lines.lines.size() == 3); - kak_assert(lines.lines[0]->strview() == "foo\n"); - kak_assert(lines.lines[1]->strview() == "bar\n"); - kak_assert(lines.lines[2]->strview() == "baz\n"); - } -}}; - UnitTest test_buffer{[]() { - Buffer empty_buffer("empty", Buffer::Flags::None, {}); + auto make_lines = [](auto&&... lines) { return BufferLines{StringData::create({lines})...}; }; - Buffer buffer("test", Buffer::Flags::None, "allo ?\nmais que fais la police\n hein ?\n youpi\n"); + Buffer empty_buffer("empty", Buffer::Flags::None, make_lines("\n")); + + Buffer buffer("test", Buffer::Flags::None, make_lines("allo ?\n", "mais que fais la police\n", " hein ?\n", " youpi\n")); kak_assert(buffer.line_count() == 4); BufferIterator pos = buffer.begin(); @@ -809,7 +733,9 @@ UnitTest test_buffer{[]() UnitTest test_undo{[]() { - Buffer buffer("test", Buffer::Flags::None, "allo ?\nmais que fais la police\n hein ?\n youpi\n"); + auto make_lines = [](auto&&... 
lines) { return BufferLines{StringData::create({lines})...}; }; + + Buffer buffer("test", Buffer::Flags::None, make_lines("allo ?\n", "mais que fais la police\n", " hein ?\n", " youpi\n")); auto pos = buffer.end_coord(); buffer.insert(pos, "kanaky\n"); // change 1 buffer.commit_undo_group(); diff --git a/src/buffer.hh b/src/buffer.hh index 73e5b370..4bd1409e 100644 --- a/src/buffer.hh +++ b/src/buffer.hh @@ -129,8 +129,10 @@ public: enum class HistoryId : size_t { First = 0, Invalid = (size_t)-1 }; - Buffer(String name, Flags flags, StringView data = {}, - timespec fs_timestamp = InvalidTime); + Buffer(String name, Flags flags, BufferLines lines, + ByteOrderMark bom = ByteOrderMark::None, + EolFormat eolformat = EolFormat::Lf, + FsStatus fs_status = {InvalidTime, {}, {}}); Buffer(const Buffer&) = delete; Buffer& operator= (const Buffer&) = delete; ~Buffer(); @@ -210,7 +212,7 @@ public: void run_hook_in_own_context(Hook hook, StringView param, String client_name = {}); - void reload(StringView data, timespec fs_timestamp = InvalidTime); + void reload(BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus status); void check_invariant() const; diff --git a/src/buffer_manager.cc b/src/buffer_manager.cc index 8d17cc76..32d03fc1 100644 --- a/src/buffer_manager.cc +++ b/src/buffer_manager.cc @@ -24,8 +24,7 @@ BufferManager::~BufferManager() ClientManager::instance().clear(true); } -Buffer* BufferManager::create_buffer(String name, Buffer::Flags flags, - StringView data, timespec fs_timestamp) +Buffer* BufferManager::create_buffer(String name, Buffer::Flags flags, BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status) { auto path = real_path(parse_filename(name)); for (auto& buf : m_buffers) @@ -35,7 +34,7 @@ Buffer* BufferManager::create_buffer(String name, Buffer::Flags flags, throw runtime_error{"buffer name is already in use"}; } - m_buffers.push_back(std::make_unique(std::move(name), flags, data, fs_timestamp)); + 
m_buffers.push_back(std::make_unique<Buffer>(std::move(name), flags, std::move(lines), bom, eolformat, fs_status)); auto* buffer = m_buffers.back().get(); buffer->on_registered(); @@ -84,8 +83,9 @@ Buffer& BufferManager::get_first_buffer() { if (all_of(m_buffers, [](auto& b) { return (b->flags() & Buffer::Flags::Debug); })) create_buffer("*scratch*", Buffer::Flags::None, - "*** this is a *scratch* buffer which won't be automatically saved ***\n" - "*** use it for notes or open a file buffer with the :edit command ***\n"); + {StringData::create({"*** this is a *scratch* buffer which won't be automatically saved ***\n"}), + StringData::create({"*** use it for notes or open a file buffer with the :edit command ***\n"})}, + ByteOrderMark::None, EolFormat::Lf, {InvalidTime, {}, {}}); return *m_buffers.back(); } diff --git a/src/buffer_manager.hh b/src/buffer_manager.hh index e89fa183..7a380959 100644 --- a/src/buffer_manager.hh +++ b/src/buffer_manager.hh @@ -9,6 +9,8 @@ namespace Kakoune { +class MappedFile; + class BufferManager : public Singleton<BufferManager> { public: @@ -17,9 +19,7 @@ public: ~BufferManager(); - Buffer* create_buffer(String name, Buffer::Flags flags, - StringView data = {}, - timespec fs_timestamp = InvalidTime); + Buffer* create_buffer(String name, Buffer::Flags flags, BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status); void delete_buffer(Buffer& buffer); diff --git a/src/buffer_utils.cc b/src/buffer_utils.cc index 5dc53f65..e9f159d6 100644 --- a/src/buffer_utils.cc +++ b/src/buffer_utils.cc @@ -85,33 +85,86 @@ ByteCount get_byte_to_column(const Buffer& buffer, ColumnCount tabstop, DisplayC return (int)(it - line.begin()); } +static BufferLines parse_lines(const char* pos, const char* end, EolFormat eolformat) +{ + BufferLines lines; + while (pos < end) + { + if (lines.size() >= std::numeric_limits<int>::max()) + throw runtime_error("too many lines"); + + const char* eol = std::find(pos, end, '\n'); + if ((eol - pos) >= std::numeric_limits<int>::max()) + throw 
runtime_error("line is too long"); + + lines.emplace_back(StringData::create({{pos, eol - (eolformat == EolFormat::Crlf and eol != end ? 1 : 0)}, "\n"})); + pos = eol + 1; + } + + if (lines.empty()) + lines.emplace_back(StringData::create({"\n"})); + + return lines; +} + +Buffer* create_buffer_from_string(String name, Buffer::Flags flags, StringView data) +{ + return BufferManager::instance().create_buffer( + std::move(name), flags, + parse_lines(data.begin(), data.end(), EolFormat::Lf), + ByteOrderMark::None, EolFormat::Lf, + FsStatus{InvalidTime, {}, {}}); +} + +template<typename Func> +decltype(auto) parse_file(StringView filename, Func&& func) +{ + MappedFile file{parse_filename(filename)}; + + const char* pos = file.data; + const char* end = pos + file.st.st_size; + + auto bom = ByteOrderMark::None; + if (file.st.st_size >= 3 && StringView{pos, 3_byte} == "\xEF\xBB\xBF") + { + bom = ByteOrderMark::Utf8; + pos += 3; + } + + bool has_crlf = false, has_lf = false; + for (auto it = pos; it != end; ++it) + { + if (*it == '\n') + ((it != pos and *(it-1) == '\r') ? has_crlf : has_lf) = true; + } + const bool crlf = has_crlf and not has_lf; + auto eolformat = crlf ? 
EolFormat::Crlf : EolFormat::Lf; + + FsStatus fs_status{file.st.st_mtim, file.st.st_size, hash_data(file.data, file.st.st_size)}; + return func(parse_lines(pos, end, eolformat), bom, eolformat, fs_status); +} + Buffer* open_file_buffer(StringView filename, Buffer::Flags flags) { - MappedFile file_data{parse_filename(filename)}; - return BufferManager::instance().create_buffer( - filename.str(), Buffer::Flags::File | flags, file_data, file_data.st.st_mtim); + return parse_file(filename, [&](BufferLines&& lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status) { + return BufferManager::instance().create_buffer(filename.str(), Buffer::Flags::File | flags, std::move(lines), bom, eolformat, fs_status); + }); } Buffer* open_or_create_file_buffer(StringView filename, Buffer::Flags flags) { - auto& buffer_manager = BufferManager::instance(); auto path = parse_filename(filename); if (file_exists(path)) - { - MappedFile file_data{path}; - return buffer_manager.create_buffer(filename.str(), Buffer::Flags::File | flags, - file_data, file_data.st.st_mtim); - } - return buffer_manager.create_buffer( - filename.str(), Buffer::Flags::File | Buffer::Flags::New, - {}, InvalidTime); + return open_file_buffer(filename.str(), Buffer::Flags::File | flags); + return create_buffer_from_string(filename.str(), Buffer::Flags::File | Buffer::Flags::New, StringView{}); } void reload_file_buffer(Buffer& buffer) { kak_assert(buffer.flags() & Buffer::Flags::File); - MappedFile file_data{buffer.name()}; - buffer.reload(file_data, file_data.st.st_mtim); + parse_file(buffer.name(), [&](auto&&... 
params) { + buffer.reload(std::forward<decltype(params)>(params)...); + }); buffer.flags() &= ~Buffer::Flags::New; } @@ -124,11 +177,12 @@ Buffer* create_fifo_buffer(String name, int fd, Buffer::Flags flags, bool scroll if (buffer) { buffer->flags() |= Buffer::Flags::NoUndo | flags; - buffer->reload({}, InvalidTime); + buffer->reload({StringData::create({"\n"})}, ByteOrderMark::None, EolFormat::Lf, {InvalidTime, {}, {}}); } else buffer = buffer_manager.create_buffer( - std::move(name), flags | Buffer::Flags::Fifo | Buffer::Flags::NoUndo); + std::move(name), flags | Buffer::Flags::Fifo | Buffer::Flags::NoUndo, + {StringData::create({"\n"})}, ByteOrderMark::None, EolFormat::Lf, {InvalidTime, {}, {}}); struct FifoWatcher : FDWatcher { @@ -239,9 +293,9 @@ void write_to_debug_buffer(StringView str) else { String line = str + (eol_back ? "\n" : "\n\n"); - BufferManager::instance().create_buffer( + create_buffer_from_string( debug_buffer_name.str(), Buffer::Flags::NoUndo | Buffer::Flags::Debug | Buffer::Flags::ReadOnly, - line, InvalidTime); + line); } } diff --git a/src/buffer_utils.hh b/src/buffer_utils.hh index dd5919a9..a3f18fe1 100644 --- a/src/buffer_utils.hh +++ b/src/buffer_utils.hh @@ -78,6 +78,7 @@ ByteCount get_byte_to_column(const Buffer& buffer, ColumnCount tabstop, DisplayCoord coord); Buffer* create_fifo_buffer(String name, int fd, Buffer::Flags flags, bool scroll = false); +Buffer* create_buffer_from_string(String name, Buffer::Flags flags, StringView data); Buffer* open_file_buffer(StringView filename, Buffer::Flags flags = Buffer::Flags::None); Buffer* open_or_create_file_buffer(StringView filename, diff --git a/src/commands.cc b/src/commands.cc index 30b0795a..3a805756 100644 --- a/src/commands.cc +++ b/src/commands.cc @@ -369,7 +369,7 @@ void edit(const ParametersParser& parser, Context& context, const ShellContext&) { if (buffer != nullptr and force_reload) buffer_manager.delete_buffer(*buffer); - buffer = buffer_manager.create_buffer(std::move(name), flags); + buffer 
= create_buffer_from_string(std::move(name), flags, {}); } else if (buffer->flags() & Buffer::Flags::File) throw runtime_error(format("buffer '{}' exists but is not a scratch buffer", name)); diff --git a/src/file.cc b/src/file.cc index a92c6783..89dee9a3 100644 --- a/src/file.cc +++ b/src/file.cc @@ -218,8 +218,6 @@ MappedFile::MappedFile(StringView filename) if (st.st_size == 0) return; - else if (st.st_size > std::numeric_limits<int>::max()) - throw runtime_error("file is too big"); data = (const char*)mmap(nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (data == MAP_FAILED) @@ -238,6 +236,8 @@ MappedFile::~MappedFile() MappedFile::operator StringView() const { + if (st.st_size > std::numeric_limits<int>::max()) + throw runtime_error("file is too big"); return { data, (int)st.st_size }; } diff --git a/src/hash.cc b/src/hash.cc index ee3108c7..ecfd6e46 100644 --- a/src/hash.cc +++ b/src/hash.cc @@ -35,10 +35,10 @@ size_t hash_data(const char* input, size_t len) constexpr uint32_t c1 = 0xcc9e2d51; constexpr uint32_t c2 = 0x1b873593; - const int nblocks = len / 4; + const ptrdiff_t nblocks = len / 4; const uint8_t* blocks = data + nblocks*4; - for (int i = -nblocks; i; ++i) + for (ptrdiff_t i = -nblocks; i; ++i) { uint32_t key; memcpy(&key, blocks + 4*i, 4); diff --git a/src/line_modification.cc b/src/line_modification.cc index 3e84fe50..8551e394 100644 --- a/src/line_modification.cc +++ b/src/line_modification.cc @@ -201,8 +201,10 @@ void LineRangeSet::remove_range(LineRange range) UnitTest test_line_modifications{[]() { + auto make_lines = [](auto&&... 
lines) { return BufferLines{StringData::create({lines})...}; }; + { - Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\n"); + Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n")); auto ts = buffer.timestamp(); buffer.erase({1, 0}, {2, 0}); @@ -211,7 +213,7 @@ UnitTest test_line_modifications{[]() } { - Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\n"); + Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n")); auto ts = buffer.timestamp(); buffer.insert({2, 0}, "line 3"); @@ -220,7 +222,7 @@ UnitTest test_line_modifications{[]() } { - Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\nline 3\n"); + Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n", "line 3\n")); auto ts = buffer.timestamp(); buffer.insert({1, 4}, "hoho\nhehe"); @@ -231,7 +233,7 @@ UnitTest test_line_modifications{[]() } { - Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\nline 3\nline 4\n"); + Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n", "line 3\n", "line 4\n")); auto ts = buffer.timestamp(); buffer.erase({0,0}, {3,0}); @@ -250,7 +252,7 @@ UnitTest test_line_modifications{[]() } { - Buffer buffer("test", Buffer::Flags::None, "line 1\n"); + Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n")); auto ts = buffer.timestamp(); buffer.insert({0,0}, "n"); buffer.insert({0,1}, "e"); diff --git a/src/main.cc b/src/main.cc index e4aba375..10b45403 100644 --- a/src/main.cc +++ b/src/main.cc @@ -966,8 +966,8 @@ int run_filter(StringView keystr, ConstArrayView files, bool quiet, } if (not isatty(0)) { - Buffer& buffer = *buffer_manager.create_buffer( - "*stdin*", Buffer::Flags::NoHooks, read_fd(0), InvalidTime); + Buffer& buffer = *create_buffer_from_string( + "*stdin*", Buffer::Flags::NoHooks, read_fd(0)); apply_to_buffer(buffer); write_buffer_to_fd(buffer, 1); buffer_manager.delete_buffer(buffer); diff --git a/src/word_db.cc b/src/word_db.cc 
index ed0c7d99..56b7fedf 100644 --- a/src/word_db.cc +++ b/src/word_db.cc @@ -231,12 +231,10 @@ UnitTest test_word_db{[]() }); }; + auto make_lines = [](auto&&... lines) { return BufferLines{StringData::create({lines})...}; }; + Buffer buffer("test", Buffer::Flags::None, - "tchou mutch\n" - "tchou kanaky tchou\n" - "\n" - "tchaa tchaa\n" - "allo\n"); + make_lines("tchou mutch\n", "tchou kanaky tchou\n", "\n", "tchaa tchaa\n", "allo\n")); WordDB word_db(buffer); auto res = word_db.find_matching(""); std::sort(res.begin(), res.end(), cmp_words);