From ed68d1ff287d43c5293abb4d41e908aa8e50afec Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Wed, 2 Jul 2014 21:14:01 +0100 Subject: [PATCH] utf8: use end of sequence iterators for more security --- src/buffer.cc | 4 +-- src/buffer_utils.cc | 2 +- src/buffer_utils.hh | 4 +-- src/highlighters.cc | 4 +-- src/insert_completer.cc | 2 +- src/keys.cc | 4 +-- src/ncurses.cc | 5 +-- src/normal.cc | 2 +- src/selection.cc | 2 +- src/selectors.cc | 8 ++--- src/selectors.hh | 6 ++-- src/unit_tests.cc | 2 +- src/utf8.hh | 77 ++++++++++++++++++++++------------------- src/utf8_iterator.hh | 10 +++--- src/word_db.cc | 2 +- 15 files changed, 71 insertions(+), 63 deletions(-) diff --git a/src/buffer.cc b/src/buffer.cc index c9293a98..c41d7e8a 100644 --- a/src/buffer.cc +++ b/src/buffer.cc @@ -452,7 +452,7 @@ ByteCoord Buffer::char_next(ByteCoord coord) const if (coord.column < m_lines[coord.line].length() - 1) { auto& line = m_lines[coord.line]; - coord.column += utf8::codepoint_size(line.begin() + (int)coord.column); + coord.column += utf8::codepoint_size(line[(int)coord.column]); // Handle invalid utf-8 if (coord.column >= line.length()) { @@ -483,7 +483,7 @@ ByteCoord Buffer::char_prev(ByteCoord coord) const else { auto& line = m_lines[coord.line]; - coord.column = (int)(utf8::character_start(line.begin() + (int)coord.column - 1) - line.begin()); + coord.column = (int)(utf8::character_start(line.begin() + (int)coord.column - 1, line.begin()) - line.begin()); } return coord; } diff --git a/src/buffer_utils.cc b/src/buffer_utils.cc index c59157a5..5eeb188f 100644 --- a/src/buffer_utils.cc +++ b/src/buffer_utils.cc @@ -14,7 +14,7 @@ CharCount get_column(const Buffer& buffer, auto col = 0_char; for (auto it = line.begin(); it != line.end() and coord.column > (int)(it - line.begin()); - it = utf8::next(it)) + it = utf8::next(it, line.end())) { if (*it == '\t') col = (col / tabstop + 1) * tabstop; diff --git a/src/buffer_utils.hh b/src/buffer_utils.hh index f47c2c26..c16b359f 100644 --- a/src/buffer_utils.hh +++ b/src/buffer_utils.hh @@ -15,13 +15,13 @@ inline String content(const Buffer& buffer, const Selection& range) inline BufferIterator erase(Buffer& buffer, const Selection& range) { return buffer.erase(buffer.iterator_at(range.min()), - utf8::next(buffer.iterator_at(range.max()))); + utf8::next(buffer.iterator_at(range.max()), buffer.end())); } inline CharCount char_length(const Buffer& buffer, const Selection& range) { return utf8::distance(buffer.iterator_at(range.min()), - utf8::next(buffer.iterator_at(range.max()))); + utf8::next(buffer.iterator_at(range.max()), buffer.end())); } CharCount get_column(const Buffer& buffer, diff --git a/src/highlighters.cc b/src/highlighters.cc index 51b8111f..49f5199a 100644 --- a/src/highlighters.cc +++ b/src/highlighters.cc @@ -602,8 +602,8 @@ void expand_unprintable(const Context& context, HighlightFlags flags, DisplayBuf for (auto it = buffer.iterator_at(atom_it->begin()), end = buffer.iterator_at(atom_it->end()); it < end;) { - Codepoint cp = utf8::codepoint(it); - auto next = utf8::next(it); + Codepoint cp = utf8::codepoint(it, end); + auto next = utf8::next(it, end); if (cp != '\n' and not iswprint(cp)) { std::ostringstream oss; diff --git a/src/insert_completer.cc b/src/insert_completer.cc index aab807f1..2acc5447 100644 --- a/src/insert_completer.cc +++ b/src/insert_completer.cc @@ -32,7 +32,7 @@ template InsertCompletion complete_word(const Buffer& buffer, ByteCoord cursor_pos) { auto pos = buffer.iterator_at(cursor_pos); - if (pos == buffer.begin() or not is_word(*utf8::previous(pos))) + if (pos == buffer.begin() or not is_word(*utf8::previous(pos, buffer.begin()))) return {}; auto end = buffer.iterator_at(cursor_pos); diff --git a/src/keys.cc b/src/keys.cc index 5dc5a1b9..4d6d5bc3 100644 --- a/src/keys.cc +++ b/src/keys.cc @@ -41,7 +41,7 @@ static const KeyAndName keynamemap[] = { KeyList parse_keys(StringView str) { KeyList result; - using PassPolicy = utf8::InvalidBytePolicy::Pass; + using PassPolicy = utf8::InvalidPolicy::Pass; using Utf8It = utf8::iterator; for (Utf8It it = str.begin(), str_end = str.end(); it < str_end; ++it) { @@ -71,7 +71,7 @@ KeyList parse_keys(StringView str) } if (keyname.char_length() == 1) { - result.push_back(Key{ modifier, utf8::codepoint(keyname.begin()) }); + result.push_back(Key{ modifier, utf8::codepoint(keyname.begin(),keyname.end()) }); it = end_it; continue; } diff --git a/src/ncurses.cc b/src/ncurses.cc index 0b571fd3..cd5bfde4 100644 --- a/src/ncurses.cc +++ b/src/ncurses.cc @@ -207,7 +207,7 @@ void NCursesUI::refresh() m_dirty = false; } -using Utf8Policy = utf8::InvalidBytePolicy::Pass; +using Utf8Policy = utf8::InvalidPolicy::Pass; using Utf8Iterator = utf8::iterator; void addutf8str(WINDOW* win, Utf8Iterator begin, Utf8Iterator end) { @@ -408,8 +408,9 @@ Key NCursesUI::get_key() int operator*() { return getch(); } getch_iterator& operator++() { return *this; } getch_iterator& operator++(int) { return *this; } + bool operator== (const getch_iterator&) const { return false; } }; - return utf8::codepoint(getch_iterator{}); + return utf8::codepoint(getch_iterator{}, getch_iterator{}); } return Key::Invalid; } diff --git a/src/normal.cc b/src/normal.cc index 6a5468b4..5c073220 100644 --- a/src/normal.cc +++ b/src/normal.cc @@ -719,7 +719,7 @@ void keep(Context& context, int) for (auto& sel : context.selections()) { if (boost::regex_search(buffer.iterator_at(sel.min()), - utf8::next(buffer.iterator_at(sel.max())), ex) == matching) + utf8::next(buffer.iterator_at(sel.max()), buffer.end()), ex) == matching) keep.push_back(sel); } if (keep.empty()) diff --git a/src/selection.cc b/src/selection.cc index d8808054..bd18bf85 100644 --- a/src/selection.cc +++ b/src/selection.cc @@ -446,7 +446,7 @@ BufferIterator prepare_insert(Buffer& buffer, const Selection& sel, InsertMode m { // special case for end of lines, append to current line instead auto pos = buffer.iterator_at(sel.max()); - return *pos == '\n' ? pos : utf8::next(pos); + return *pos == '\n' ? pos : utf8::next(pos, buffer.end()); } case InsertMode::InsertAtLineBegin: return buffer.iterator_at(sel.min().line); diff --git a/src/selectors.cc b/src/selectors.cc index b2b08073..7c05bc36 100644 --- a/src/selectors.cc +++ b/src/selectors.cc @@ -494,7 +494,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex) auto& buffer = selections.buffer(); for (auto& sel : selections) { - auto sel_end = utf8::next(buffer.iterator_at(sel.max())); + auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end()); RegexIterator re_it(buffer.iterator_at(sel.min()), sel_end, regex); RegexIterator re_end; @@ -511,7 +511,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex) captures.emplace_back(match.first, match.second); result.push_back({ begin.coord(), - (begin == end ? end : utf8::previous(end)).coord(), + (begin == end ? end : utf8::previous(end, begin)).coord(), std::move(captures) }); } } @@ -527,7 +527,7 @@ void split_selections(SelectionList& selections, const Regex& regex) for (auto& sel : selections) { auto begin = buffer.iterator_at(sel.min()); - auto sel_end = utf8::next(buffer.iterator_at(sel.max())); + auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end()); RegexIterator re_it(begin, sel_end, regex, boost::regex_constants::match_nosubs); RegexIterator re_end; @@ -536,7 +536,7 @@ void split_selections(SelectionList& selections, const Regex& regex) { BufferIterator end = (*re_it)[0].first; - result.push_back({ begin.coord(), (begin == end) ? end.coord() : utf8::previous(end).coord() }); + result.push_back({ begin.coord(), (begin == end) ? end.coord() : utf8::previous(end, begin).coord() }); begin = (*re_it)[0].second; } if (begin.coord() <= sel.max()) diff --git a/src/selectors.hh b/src/selectors.hh index 5471a5c9..191a15dc 100644 --- a/src/selectors.hh +++ b/src/selectors.hh @@ -50,7 +50,7 @@ inline void remove_selection(SelectionList& selections, int index) selections.check_invariant(); } -using Utf8Iterator = utf8::iterator; +using Utf8Iterator = utf8::iterator; inline Selection utf8_range(const Utf8Iterator& first, const Utf8Iterator& last) { @@ -265,7 +265,7 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege CaptureList captures; MatchResults matches; bool found = false; - if ((found = find_match_in_buffer(buffer, utf8::next(begin), matches, regex))) + if ((found = find_match_in_buffer(buffer, utf8::next(begin, buffer.end()), matches, regex))) { begin = matches[0].first; end = matches[0].second; @@ -275,7 +275,7 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege if (not found or begin == buffer.end()) throw runtime_error("'" + regex.str() + "': no matches found"); - end = (begin == end) ? end : utf8::previous(end); + end = (begin == end) ? end : utf8::previous(end, begin); if (direction == Backward) std::swap(begin, end); diff --git a/src/unit_tests.cc b/src/unit_tests.cc index 85038d06..3795b8a5 100644 --- a/src/unit_tests.cc +++ b/src/unit_tests.cc @@ -99,7 +99,7 @@ void test_utf8() { String str = "maïs mélange bientôt"; kak_assert(utf8::distance(str.begin(), str.end()) == 20); - kak_assert(utf8::codepoint(str.begin() + 2) == 0x00EF); + kak_assert(utf8::codepoint(str.begin() + 2, str.end()) == 0x00EF); } void test_string() diff --git a/src/utf8.hh b/src/utf8.hh index 60460a7a..116c225e 100644 --- a/src/utf8.hh +++ b/src/utf8.hh @@ -15,10 +15,10 @@ namespace utf8 // returns an iterator to next character first byte template -Iterator next(Iterator it) +Iterator next(Iterator it, Iterator end) { - if (*it++ & 0x80) - while ((*(it) & 0xC0) == 0x80) + if (it != end and *it++ & 0x80) + while (it != end and (*(it) & 0xC0) == 0x80) ++it; return it; } @@ -26,18 +26,18 @@ Iterator next(Iterator it) // returns it's parameter if it points to a character first byte, // or else returns next character first byte template -Iterator finish(Iterator it) +Iterator finish(Iterator it, Iterator end) { - while ((*(it) & 0xC0) == 0x80) + while (it != end and (*(it) & 0xC0) == 0x80) ++it; return it; } // returns an iterator to the previous character first byte template -Iterator previous(Iterator it) +Iterator previous(Iterator it, Iterator begin) { - while ((*(--it) & 0xC0) == 0x80) + while (it != begin and (*(--it) & 0xC0) == 0x80) ; return it; } @@ -51,12 +51,12 @@ Iterator advance(Iterator it, Iterator end, CharCount d) if (d < 0) { while (it != end and d++) - it = utf8::previous(it); + it = utf8::previous(it, end); } else { while (it != end and d--) - it = utf8::next(it); + it = utf8::next(it, end); } return it; } @@ -83,65 +83,72 @@ inline bool is_character_start(char c) // returns an iterator to the first byte of the character it is into template -Iterator character_start(Iterator it) +Iterator character_start(Iterator it, Iterator begin) { - while (not is_character_start(*it)) + while (it != begin and not is_character_start(*it)) --it; return it; } -namespace InvalidBytePolicy +namespace InvalidPolicy { struct Assert { - Codepoint operator()(unsigned char byte) const { kak_assert(false); return byte; } + Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; } }; struct Pass { - Codepoint operator()(unsigned char byte) const { return byte; } + Codepoint operator()(Codepoint cp) const { return cp; } }; } // returns the codepoint of the character whose first byte // is pointed by it -template -Codepoint codepoint(Iterator it) +Codepoint codepoint(Iterator it, Iterator end) { + if (it == end) + return InvalidPolicy{}(-1); // According to rfc3629, UTF-8 allows only up to 4 bytes. // (21 bits codepoint) - Codepoint cp; unsigned char byte = *it++; if (not (byte & 0x80)) // 0xxxxxxx - cp = byte; - else if ((byte & 0xE0) == 0xC0) // 110xxxxx + return byte; + + if (it == end) + return InvalidPolicy{}(byte); + + if ((byte & 0xE0) == 0xC0) // 110xxxxx + return ((byte & 0x1F) << 6) | (*it & 0x3F); + + if ((byte & 0xF0) == 0xE0) // 1110xxxx { - cp = ((byte & 0x1F) << 6) | (*it & 0x3F); + Codepoint cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6); + if (it == end) + return InvalidPolicy{}(cp); + return cp | (*it & 0x3F); } - else if ((byte & 0xF0) == 0xE0) // 1110xxxx + + if ((byte & 0xF8) == 0xF0) // 11110xxx { - cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6); - cp |= (*it & 0x3F); - } - else if ((byte & 0xF8) == 0xF0) // 11110xxx - { - cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12); + Codepoint cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12); + if (it == end) + return InvalidPolicy{}(cp); cp |= (*it++ & 0x3F) << 6; - cp |= (*it & 0x3F); + if (it == end) + return InvalidPolicy{}(cp); + return cp | (*it & 0x3F); } - else - cp = InvalidPolicy{}(byte); - return cp; + return InvalidPolicy{}(byte); } -template -ByteCount codepoint_size(Iterator it) +template +ByteCount codepoint_size(char byte) { - unsigned char byte = *it; if (not (byte & 0x80)) // 0xxxxxxx return 1; else if ((byte & 0xE0) == 0xC0) // 110xxxxx diff --git a/src/utf8_iterator.hh b/src/utf8_iterator.hh index c7ea102f..d2d8fa6a 100644 --- a/src/utf8_iterator.hh +++ b/src/utf8_iterator.hh @@ -12,7 +12,7 @@ namespace utf8 // adapter for an iterator on bytes which permits to iterate // on unicode codepoints instead. template + typename InvalidPolicy = utf8::InvalidPolicy::Assert> class iterator { public: @@ -21,7 +21,7 @@ public: iterator& operator++() { - m_it = utf8::next(m_it); + m_it = utf8::next(m_it, Iterator{}); invalidate_value(); return *this; } @@ -41,7 +41,7 @@ public: iterator& operator--() { - m_it = utf8::previous(m_it); + m_it = utf8::previous(m_it, Iterator{}); invalidate_value(); return *this; } @@ -132,7 +132,7 @@ private: Codepoint get_value() const { if (m_value == -1) - m_value = utf8::codepoint(m_it); + m_value = utf8::codepoint(m_it, Iterator{}); return m_value; } @@ -140,7 +140,7 @@ private: mutable Codepoint m_value = -1; }; -template +template iterator make_iterator(Iterator it) { return iterator{std::move(it)}; diff --git a/src/word_db.cc b/src/word_db.cc index b120cb66..dc08f102 100644 --- a/src/word_db.cc +++ b/src/word_db.cc @@ -10,7 +10,7 @@ namespace Kakoune static std::vector get_words(StringView content) { std::vector res; - using Iterator = utf8::iterator; + using Iterator = utf8::iterator; const char* word_start = content.begin(); bool in_word = false; for (Iterator it{word_start}, end{content.end()}; it != end; ++it)