Support opening files bigger than 2 GiB

The real technical limits are on lines bigger than 2 GiB and on buffers
with more than 2 Gi lines. Refactor buffer loading so that files bigger
than 2 GiB which stay within those limits can be loaded.

Fix an overflow in the hash_data function at the same time.
This commit is contained in:
Maxime Coste 2021-05-28 17:03:06 +10:00
parent 38f85706ff
commit e4a23a64fa
12 changed files with 126 additions and 143 deletions

View File

@ -20,57 +20,13 @@
namespace Kakoune
{
// Bundles the lines of a buffer with the byte order mark and end-of-line
// format that go with them.
struct ParsedLines
{
BufferLines lines;
// ByteOrderMark::None unless set otherwise.
ByteOrderMark bom = ByteOrderMark::None;
// EolFormat::Lf unless set otherwise.
EolFormat eolformat = EolFormat::Lf;
};
// Splits `data` into newline-terminated lines and reports the byte order mark
// and end-of-line format found in it.
//
// - A leading UTF-8 BOM ("\xEF\xBB\xBF") is skipped and recorded in `bom`.
// - The end-of-line format is Crlf only when at least one "\r\n" is present
//   and no '\n' appears without a preceding '\r'; otherwise it is Lf.
// - Every stored line ends with "\n"; in Crlf mode the '\r' before each '\n'
//   is dropped. A final line without a trailing '\n' still gets one appended.
// - Empty input produces a single "\n" line.
static ParsedLines parse_lines(StringView data)
{
ParsedLines res;
const char* pos = data.begin();
if (data.substr(0, 3_byte) == "\xEF\xBB\xBF")
{
res.bom = ByteOrderMark::Utf8;
pos = data.begin() + 3;
}
// First pass: classify every '\n' as part of a "\r\n" pair or a lone "\n".
bool has_crlf = false, has_lf = false;
for (auto it = pos; it != data.end(); ++it)
{
if (*it == '\n')
((it != pos and *(it-1) == '\r') ? has_crlf : has_lf) = true;
}
// Mixed line endings are treated as Lf, so '\r' characters are kept as content.
const bool crlf = has_crlf and not has_lf;
res.eolformat = crlf ? EolFormat::Crlf : EolFormat::Lf;
// Second pass: cut the data at each '\n' and store one line per segment.
while (pos < data.end())
{
const char* eol = std::find(pos, data.end(), '\n');
// When eol == data.end() the last line had no '\n', hence no '\r' to strip.
res.lines.emplace_back(StringData::create({{pos, eol - (crlf and eol != data.end() ? 1 : 0)}, "\n"}));
pos = eol + 1;
}
if (res.lines.empty())
res.lines.emplace_back(StringData::create({"\n"}));
return res;
}
// Stores the end-of-line format and byte order mark from `parsed_lines` into
// the local "eolformat" and "BOM" options of `options`.
static void apply_options(OptionManager& options, const ParsedLines& parsed_lines)
{
options.get_local_option("eolformat").set(parsed_lines.eolformat);
options.get_local_option("BOM").set(parsed_lines.bom);
}
// Creates a history node attached to `parent`; its `committed` time is set to
// the clock value at construction.
Buffer::HistoryNode::HistoryNode(HistoryId parent)
: parent{parent}, committed{Clock::now()}
{}
Buffer::Buffer(String name, Flags flags, StringView data,
timespec fs_timestamp)
Buffer::Buffer(String name, Flags flags, BufferLines lines,
ByteOrderMark bom, EolFormat eolformat,
FsStatus fs_status)
: Scope{GlobalScope::instance()},
m_name{(flags & Flags::File) ? real_path(parse_filename(name)) : std::move(name)},
m_display_name{(flags & Flags::File) ? compact_path(m_name) : m_name},
@ -78,20 +34,19 @@ Buffer::Buffer(String name, Flags flags, StringView data,
m_history{{HistoryId::Invalid}},
m_history_id{HistoryId::First},
m_last_save_history_id{HistoryId::First},
m_fs_status{fs_timestamp, data.length(), hash_value(data)}
m_fs_status{fs_status}
{
ParsedLines parsed_lines = parse_lines(data);
#ifdef KAK_DEBUG
for (auto& line : parsed_lines.lines)
for (auto& line : lines)
kak_assert(not (line->length == 0) and
line->data()[line->length-1] == '\n');
#endif
static_cast<BufferLines&>(m_lines) = std::move(parsed_lines.lines);
static_cast<BufferLines&>(m_lines) = std::move(lines);
m_changes.push_back({ Change::Insert, {0,0}, line_count() });
apply_options(options(), parsed_lines);
options().get_local_option("eolformat").set(eolformat);
options().get_local_option("BOM").set(bom);
// now we may begin to record undo data
if (not (flags & Flags::NoUndo))
@ -236,10 +191,8 @@ Buffer::Modification Buffer::Modification::inverse() const
return {type == Insert ? Erase : Insert, coord, content};
}
void Buffer::reload(StringView data, timespec fs_timestamp)
void Buffer::reload(BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status)
{
ParsedLines parsed_lines = parse_lines(data);
const bool record_undo = not (m_flags & Flags::NoUndo);
commit_undo_group();
@ -252,21 +205,21 @@ void Buffer::reload(StringView data, timespec fs_timestamp)
m_history = {HistoryNode{HistoryId::Invalid}};
m_changes.push_back({ Change::Erase, {0,0}, line_count() });
static_cast<BufferLines&>(m_lines) = std::move(parsed_lines.lines);
static_cast<BufferLines&>(m_lines) = std::move(lines);
m_changes.push_back({ Change::Insert, {0,0}, line_count() });
}
else
{
Vector<Diff> diff;
for_each_diff(m_lines.begin(), m_lines.size(),
parsed_lines.lines.begin(), parsed_lines.lines.size(),
lines.begin(), lines.size(),
[&diff](DiffOp op, int len)
{ diff.push_back({op, len}); },
[](const StringDataPtr& lhs, const StringDataPtr& rhs)
{ return lhs->strview() == rhs->strview(); });
auto it = m_lines.begin();
auto new_it = parsed_lines.lines.begin();
auto new_it = lines.begin();
for (auto& d : diff)
{
if (d.op == DiffOp::Keep)
@ -303,10 +256,12 @@ void Buffer::reload(StringView data, timespec fs_timestamp)
commit_undo_group();
apply_options(options(), parsed_lines);
options().get_local_option("eolformat").set(eolformat);
options().get_local_option("BOM").set(bom);
m_last_save_history_id = m_history_id;
m_fs_status = {fs_timestamp, data.length(), hash_value(data)};
m_fs_status = fs_status;
}
void Buffer::commit_undo_group()
@ -729,44 +684,13 @@ String Buffer::debug_description() const
content_size, additional_size);
}
// Unit test for parse_lines(StringView): checks BOM detection, end-of-line
// format detection and the resulting line contents.
UnitTest test_parse_line{[]
{
// Plain LF input without a BOM.
{
auto lines = parse_lines("foo\nbar\nbaz\n");
kak_assert(lines.eolformat == EolFormat::Lf);
kak_assert(lines.bom == ByteOrderMark::None);
kak_assert(lines.lines.size() == 3);
kak_assert(lines.lines[0]->strview() == "foo\n");
kak_assert(lines.lines[1]->strview() == "bar\n");
kak_assert(lines.lines[2]->strview() == "baz\n");
}
// UTF-8 BOM with mixed line endings and no final newline: the format is Lf,
// the '\r' is kept as content and the last line gets a "\n" appended.
{
auto lines = parse_lines("\xEF\xBB\xBF" "foo\nbar\r\nbaz");
kak_assert(lines.eolformat == EolFormat::Lf);
kak_assert(lines.bom == ByteOrderMark::Utf8);
kak_assert(lines.lines.size() == 3);
kak_assert(lines.lines[0]->strview() == "foo\n");
kak_assert(lines.lines[1]->strview() == "bar\r\n");
kak_assert(lines.lines[2]->strview() == "baz\n");
}
// Pure CRLF input: the format is Crlf and the '\r' is stripped from each line.
{
auto lines = parse_lines("foo\r\nbar\r\nbaz\r\n");
kak_assert(lines.eolformat == EolFormat::Crlf);
kak_assert(lines.bom == ByteOrderMark::None);
kak_assert(lines.lines.size() == 3);
kak_assert(lines.lines[0]->strview() == "foo\n");
kak_assert(lines.lines[1]->strview() == "bar\n");
kak_assert(lines.lines[2]->strview() == "baz\n");
}
}};
UnitTest test_buffer{[]()
{
Buffer empty_buffer("empty", Buffer::Flags::None, {});
auto make_lines = [](auto&&... lines) { return BufferLines{StringData::create({lines})...}; };
Buffer buffer("test", Buffer::Flags::None, "allo ?\nmais que fais la police\n hein ?\n youpi\n");
Buffer empty_buffer("empty", Buffer::Flags::None, make_lines("\n"));
Buffer buffer("test", Buffer::Flags::None, make_lines("allo ?\n", "mais que fais la police\n", " hein ?\n", " youpi\n"));
kak_assert(buffer.line_count() == 4);
BufferIterator pos = buffer.begin();
@ -809,7 +733,9 @@ UnitTest test_buffer{[]()
UnitTest test_undo{[]()
{
Buffer buffer("test", Buffer::Flags::None, "allo ?\nmais que fais la police\n hein ?\n youpi\n");
auto make_lines = [](auto&&... lines) { return BufferLines{StringData::create({lines})...}; };
Buffer buffer("test", Buffer::Flags::None, make_lines("allo ?\n", "mais que fais la police\n", " hein ?\n", " youpi\n"));
auto pos = buffer.end_coord();
buffer.insert(pos, "kanaky\n"); // change 1
buffer.commit_undo_group();

View File

@ -129,8 +129,10 @@ public:
enum class HistoryId : size_t { First = 0, Invalid = (size_t)-1 };
Buffer(String name, Flags flags, StringView data = {},
timespec fs_timestamp = InvalidTime);
Buffer(String name, Flags flags, BufferLines lines,
ByteOrderMark bom = ByteOrderMark::None,
EolFormat eolformat = EolFormat::Lf,
FsStatus fs_status = {InvalidTime, {}, {}});
Buffer(const Buffer&) = delete;
Buffer& operator= (const Buffer&) = delete;
~Buffer();
@ -210,7 +212,7 @@ public:
void run_hook_in_own_context(Hook hook, StringView param,
String client_name = {});
void reload(StringView data, timespec fs_timestamp = InvalidTime);
void reload(BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus status);
void check_invariant() const;

View File

@ -24,8 +24,7 @@ BufferManager::~BufferManager()
ClientManager::instance().clear(true);
}
Buffer* BufferManager::create_buffer(String name, Buffer::Flags flags,
StringView data, timespec fs_timestamp)
Buffer* BufferManager::create_buffer(String name, Buffer::Flags flags, BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status)
{
auto path = real_path(parse_filename(name));
for (auto& buf : m_buffers)
@ -35,7 +34,7 @@ Buffer* BufferManager::create_buffer(String name, Buffer::Flags flags,
throw runtime_error{"buffer name is already in use"};
}
m_buffers.push_back(std::make_unique<Buffer>(std::move(name), flags, data, fs_timestamp));
m_buffers.push_back(std::make_unique<Buffer>(std::move(name), flags, lines, bom, eolformat, fs_status));
auto* buffer = m_buffers.back().get();
buffer->on_registered();
@ -84,8 +83,9 @@ Buffer& BufferManager::get_first_buffer()
{
if (all_of(m_buffers, [](auto& b) { return (b->flags() & Buffer::Flags::Debug); }))
create_buffer("*scratch*", Buffer::Flags::None,
"*** this is a *scratch* buffer which won't be automatically saved ***\n"
"*** use it for notes or open a file buffer with the :edit command ***\n");
{StringData::create({"*** this is a *scratch* buffer which won't be automatically saved ***\n"}),
StringData::create({"*** use it for notes or open a file buffer with the :edit command ***\n"})},
ByteOrderMark::None, EolFormat::Lf, {InvalidTime, {}, {}});
return *m_buffers.back();
}

View File

@ -9,6 +9,8 @@
namespace Kakoune
{
class MappedFile;
class BufferManager : public Singleton<BufferManager>
{
public:
@ -17,9 +19,7 @@ public:
~BufferManager();
Buffer* create_buffer(String name, Buffer::Flags flags,
StringView data = {},
timespec fs_timestamp = InvalidTime);
Buffer* create_buffer(String name, Buffer::Flags flags, BufferLines lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status);
void delete_buffer(Buffer& buffer);

View File

@ -85,33 +85,86 @@ ByteCount get_byte_to_column(const Buffer& buffer, ColumnCount tabstop, DisplayC
return (int)(it - line.begin());
}
// Splits the byte range [pos, end) into newline-terminated buffer lines.
//
// Each stored line is the text up to (not including) the next '\n', followed
// by a single "\n". When `eolformat` is EolFormat::Crlf, the character right
// before each '\n' is dropped as well; a final segment with no '\n' is kept
// whole. An empty range produces one "\n" line.
//
// Throws runtime_error when the number of lines or the length of a line
// reaches std::numeric_limits<int>::max().
static BufferLines parse_lines(const char* pos, const char* end, EolFormat eolformat)
{
    constexpr int max_int = std::numeric_limits<int>::max();
    const bool strip_cr = eolformat == EolFormat::Crlf;

    BufferLines lines;
    for (const char* eol = nullptr; pos < end; pos = eol + 1)
    {
        if (lines.size() >= max_int)
            throw runtime_error("too many lines");

        eol = std::find(pos, end, '\n');
        if ((eol - pos) >= max_int)
            throw runtime_error("line is too long");

        // No terminator was found when eol == end, so there is nothing to strip.
        const char* content_end = (strip_cr and eol != end) ? eol - 1 : eol;
        lines.emplace_back(StringData::create({{pos, content_end}, "\n"}));
    }

    if (lines.empty())
        lines.emplace_back(StringData::create({"\n"}));
    return lines;
}
// Registers a new buffer named `name` whose content is `data` split into
// lines. The data is parsed as Lf-terminated, and the buffer is created with
// no byte order mark, the Lf end-of-line format and an invalid filesystem
// timestamp.
Buffer* create_buffer_from_string(String name, Buffer::Flags flags, StringView data)
{
    auto& manager = BufferManager::instance();
    return manager.create_buffer(std::move(name), flags,
                                 parse_lines(data.begin(), data.end(), EolFormat::Lf),
                                 ByteOrderMark::None, EolFormat::Lf,
                                 FsStatus{InvalidTime, {}, {}});
}
// Maps the file `filename`, detects its byte order mark and end-of-line
// format, splits its content into lines, and hands the result to `func` as
// func(lines, bom, eolformat, fs_status). Returns whatever `func` returns.
//
// - A leading UTF-8 BOM is skipped and reported as ByteOrderMark::Utf8.
// - The format is Crlf only when at least one "\r\n" is present and no '\n'
//   appears without a preceding '\r'; otherwise it is Lf.
// - fs_status carries the file's mtime, its size and a hash of the whole
//   mapped content (BOM included).
template<typename Func>
decltype(auto) parse_file(StringView filename, Func&& func)
{
    MappedFile file{parse_filename(filename)};

    const char* content = file.data;
    const char* content_end = content + file.st.st_size;

    auto bom = ByteOrderMark::None;
    if (file.st.st_size >= 3 and StringView{content, 3_byte} == "\xEF\xBB\xBF")
    {
        bom = ByteOrderMark::Utf8;
        content += 3;
    }

    // Classify every '\n' as either the tail of a "\r\n" pair or a lone "\n".
    bool seen_crlf = false, seen_lf = false;
    for (auto it = content; it != content_end; ++it)
    {
        if (*it != '\n')
            continue;
        if (it != content and *(it-1) == '\r')
            seen_crlf = true;
        else
            seen_lf = true;
    }
    auto eolformat = (seen_crlf and not seen_lf) ? EolFormat::Crlf : EolFormat::Lf;

    FsStatus fs_status{file.st.st_mtim, file.st.st_size, hash_data(file.data, file.st.st_size)};
    return func(parse_lines(content, content_end, eolformat), bom, eolformat, fs_status);
}
// Opens the file `filename` and registers a buffer holding its content.
// The buffer is always created with Buffer::Flags::File in addition to the
// caller-supplied `flags`, and carries the byte order mark, end-of-line
// format and filesystem status reported by parse_file.
Buffer* open_file_buffer(StringView filename, Buffer::Flags flags)
{
MappedFile file_data{parse_filename(filename)};
return BufferManager::instance().create_buffer(
filename.str(), Buffer::Flags::File | flags, file_data, file_data.st.st_mtim);
return parse_file(filename, [&](BufferLines&& lines, ByteOrderMark bom, EolFormat eolformat, FsStatus fs_status) {
// Keep OR-ing in Flags::File as the previous implementation did: `flags`
// defaults to Flags::None, so callers may not supply it themselves.
return BufferManager::instance().create_buffer(filename.str(), Buffer::Flags::File | flags, std::move(lines), bom, eolformat, fs_status);
});
}
// Returns a file buffer for `filename`.
// When the parsed path exists, the file is opened with Buffer::Flags::File
// added to `flags`. Otherwise an empty buffer flagged File | New is created;
// `flags` is not applied in that case.
Buffer* open_or_create_file_buffer(StringView filename, Buffer::Flags flags)
{
auto& buffer_manager = BufferManager::instance();
auto path = parse_filename(filename);
if (file_exists(path))
{
MappedFile file_data{path};
return buffer_manager.create_buffer(filename.str(), Buffer::Flags::File | flags,
file_data, file_data.st.st_mtim);
}
return buffer_manager.create_buffer(
filename.str(), Buffer::Flags::File | Buffer::Flags::New,
{}, InvalidTime);
return open_file_buffer(filename.str(), Buffer::Flags::File | flags);
// The file does not exist: start from empty content.
return create_buffer_from_string(filename.str(), Buffer::Flags::File | Buffer::Flags::New, StringView{});
}
// Reloads `buffer` from the file named by buffer.name(), then clears the
// buffer's New flag. `buffer` must have Buffer::Flags::File set (asserted).
void reload_file_buffer(Buffer& buffer)
{
kak_assert(buffer.flags() & Buffer::Flags::File);
MappedFile file_data{buffer.name()};
buffer.reload(file_data, file_data.st.st_mtim);
// Forward everything parse_file produces straight to Buffer::reload.
parse_file(buffer.name(), [&](auto&&... params) {
buffer.reload(std::forward<decltype(params)>(params)...);
});
buffer.flags() &= ~Buffer::Flags::New;
}
@ -124,11 +177,12 @@ Buffer* create_fifo_buffer(String name, int fd, Buffer::Flags flags, bool scroll
if (buffer)
{
buffer->flags() |= Buffer::Flags::NoUndo | flags;
buffer->reload({}, InvalidTime);
buffer->reload({StringData::create({"\n"})}, ByteOrderMark::None, EolFormat::Lf, {InvalidTime, {}, {}});
}
else
buffer = buffer_manager.create_buffer(
std::move(name), flags | Buffer::Flags::Fifo | Buffer::Flags::NoUndo);
std::move(name), flags | Buffer::Flags::Fifo | Buffer::Flags::NoUndo,
{StringData::create({"\n"})}, ByteOrderMark::None, EolFormat::Lf, {InvalidTime, {}, {}});
struct FifoWatcher : FDWatcher
{
@ -239,9 +293,9 @@ void write_to_debug_buffer(StringView str)
else
{
String line = str + (eol_back ? "\n" : "\n\n");
BufferManager::instance().create_buffer(
create_buffer_from_string(
debug_buffer_name.str(), Buffer::Flags::NoUndo | Buffer::Flags::Debug | Buffer::Flags::ReadOnly,
line, InvalidTime);
line);
}
}

View File

@ -78,6 +78,7 @@ ByteCount get_byte_to_column(const Buffer& buffer, ColumnCount tabstop,
DisplayCoord coord);
Buffer* create_fifo_buffer(String name, int fd, Buffer::Flags flags, bool scroll = false);
Buffer* create_buffer_from_string(String name, Buffer::Flags flags, StringView data);
Buffer* open_file_buffer(StringView filename,
Buffer::Flags flags = Buffer::Flags::None);
Buffer* open_or_create_file_buffer(StringView filename,

View File

@ -369,7 +369,7 @@ void edit(const ParametersParser& parser, Context& context, const ShellContext&)
{
if (buffer != nullptr and force_reload)
buffer_manager.delete_buffer(*buffer);
buffer = buffer_manager.create_buffer(std::move(name), flags);
buffer = create_buffer_from_string(std::move(name), flags, {});
}
else if (buffer->flags() & Buffer::Flags::File)
throw runtime_error(format("buffer '{}' exists but is not a scratch buffer", name));

View File

@ -218,8 +218,6 @@ MappedFile::MappedFile(StringView filename)
if (st.st_size == 0)
return;
else if (st.st_size > std::numeric_limits<int>::max())
throw runtime_error("file is too big");
data = (const char*)mmap(nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED)
@ -238,6 +236,8 @@ MappedFile::~MappedFile()
MappedFile::operator StringView() const
{
if (st.st_size > std::numeric_limits<int>::max())
throw runtime_error("file is too big");
return { data, (int)st.st_size };
}

View File

@ -35,10 +35,10 @@ size_t hash_data(const char* input, size_t len)
constexpr uint32_t c1 = 0xcc9e2d51;
constexpr uint32_t c2 = 0x1b873593;
const int nblocks = len / 4;
const ptrdiff_t nblocks = len / 4;
const uint8_t* blocks = data + nblocks*4;
for (int i = -nblocks; i; ++i)
for (ptrdiff_t i = -nblocks; i; ++i)
{
uint32_t key;
memcpy(&key, blocks + 4*i, 4);

View File

@ -201,8 +201,10 @@ void LineRangeSet::remove_range(LineRange range)
UnitTest test_line_modifications{[]()
{
auto make_lines = [](auto&&... lines) { return BufferLines{StringData::create({lines})...}; };
{
Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\n");
Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n"));
auto ts = buffer.timestamp();
buffer.erase({1, 0}, {2, 0});
@ -211,7 +213,7 @@ UnitTest test_line_modifications{[]()
}
{
Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\n");
Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n"));
auto ts = buffer.timestamp();
buffer.insert({2, 0}, "line 3");
@ -220,7 +222,7 @@ UnitTest test_line_modifications{[]()
}
{
Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\nline 3\n");
Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n", "line 3\n"));
auto ts = buffer.timestamp();
buffer.insert({1, 4}, "hoho\nhehe");
@ -231,7 +233,7 @@ UnitTest test_line_modifications{[]()
}
{
Buffer buffer("test", Buffer::Flags::None, "line 1\nline 2\nline 3\nline 4\n");
Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n", "line 2\n", "line 3\n", "line 4\n"));
auto ts = buffer.timestamp();
buffer.erase({0,0}, {3,0});
@ -250,7 +252,7 @@ UnitTest test_line_modifications{[]()
}
{
Buffer buffer("test", Buffer::Flags::None, "line 1\n");
Buffer buffer("test", Buffer::Flags::None, make_lines("line 1\n"));
auto ts = buffer.timestamp();
buffer.insert({0,0}, "n");
buffer.insert({0,1}, "e");

View File

@ -966,8 +966,8 @@ int run_filter(StringView keystr, ConstArrayView<StringView> files, bool quiet,
}
if (not isatty(0))
{
Buffer& buffer = *buffer_manager.create_buffer(
"*stdin*", Buffer::Flags::NoHooks, read_fd(0), InvalidTime);
Buffer& buffer = *create_buffer_from_string(
"*stdin*", Buffer::Flags::NoHooks, read_fd(0));
apply_to_buffer(buffer);
write_buffer_to_fd(buffer, 1);
buffer_manager.delete_buffer(buffer);

View File

@ -231,12 +231,10 @@ UnitTest test_word_db{[]()
});
};
auto make_lines = [](auto&&... lines) { return BufferLines{StringData::create({lines})...}; };
Buffer buffer("test", Buffer::Flags::None,
"tchou mutch\n"
"tchou kanaky tchou\n"
"\n"
"tchaa tchaa\n"
"allo\n");
make_lines("tchou mutch\n", "tchou kanaky tchou\n", "\n", "tchaa tchaa\n", "allo\n"));
WordDB word_db(buffer);
auto res = word_db.find_matching("");
std::sort(res.begin(), res.end(), cmp_words);