From ed68d1ff287d43c5293abb4d41e908aa8e50afec Mon Sep 17 00:00:00 2001
From: Maxime Coste <frrrwww@gmail.com>
Date: Wed, 2 Jul 2014 21:14:01 +0100
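Subject: [PATCH] utf8: use end of sequence iterators for more security

utf8::next, finish and codepoint now take the end of the byte sequence,
and utf8::previous and character_start take its beginning, so they stop
at that bound instead of stepping past it. Callers are updated to pass
the bound they have at hand (line, buffer, match start, string end).

utf8::codepoint hands an empty or truncated sequence to the invalid
policy, which now receives a Codepoint rather than a byte;
InvalidBytePolicy is renamed to InvalidPolicy accordingly.
utf8::codepoint_size now takes the leading byte instead of an iterator.

utf8::iterator does not store its range yet and passes a
default-constructed Iterator as the bound. The getch_iterator used by
NCursesUI::get_key gains an operator== that always returns false, as
its input has no end.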
---
 src/buffer.cc           |  4 +--
 src/buffer_utils.cc     |  2 +-
 src/buffer_utils.hh     |  4 +--
 src/highlighters.cc     |  4 +--
 src/insert_completer.cc |  2 +-
 src/keys.cc             |  4 +--
 src/ncurses.cc          |  5 +--
 src/normal.cc           |  2 +-
 src/selection.cc        |  2 +-
 src/selectors.cc        |  8 ++---
 src/selectors.hh        |  6 ++--
 src/unit_tests.cc       |  2 +-
 src/utf8.hh             | 77 ++++++++++++++++++++++-------------------
 src/utf8_iterator.hh    | 10 +++---
 src/word_db.cc          |  2 +-
 15 files changed, 71 insertions(+), 63 deletions(-)

diff --git a/src/buffer.cc b/src/buffer.cc
index c9293a98..c41d7e8a 100644
--- a/src/buffer.cc
+++ b/src/buffer.cc
@@ -452,7 +452,7 @@ ByteCoord Buffer::char_next(ByteCoord coord) const
     if (coord.column < m_lines[coord.line].length() - 1)
     {
         auto& line = m_lines[coord.line];
-        coord.column += utf8::codepoint_size(line.begin() + (int)coord.column);
+        coord.column += utf8::codepoint_size(line[(int)coord.column]);
         // Handle invalid utf-8
         if (coord.column >= line.length())
         {
@@ -483,7 +483,7 @@ ByteCoord Buffer::char_prev(ByteCoord coord) const
     else
     {
         auto& line = m_lines[coord.line];
-        coord.column = (int)(utf8::character_start(line.begin() + (int)coord.column - 1) - line.begin());
+        coord.column = (int)(utf8::character_start(line.begin() + (int)coord.column - 1, line.begin()) - line.begin());
     }
     return coord;
 }
diff --git a/src/buffer_utils.cc b/src/buffer_utils.cc
index c59157a5..5eeb188f 100644
--- a/src/buffer_utils.cc
+++ b/src/buffer_utils.cc
@@ -14,7 +14,7 @@ CharCount get_column(const Buffer& buffer,
     auto col = 0_char;
     for (auto it = line.begin();
          it != line.end() and coord.column > (int)(it - line.begin());
-         it = utf8::next(it))
+         it = utf8::next(it, line.end()))
     {
         if (*it == '\t')
             col = (col / tabstop + 1) * tabstop;
diff --git a/src/buffer_utils.hh b/src/buffer_utils.hh
index f47c2c26..c16b359f 100644
--- a/src/buffer_utils.hh
+++ b/src/buffer_utils.hh
@@ -15,13 +15,13 @@ inline String content(const Buffer& buffer, const Selection& range)
 inline BufferIterator erase(Buffer& buffer, const Selection& range)
 {
     return buffer.erase(buffer.iterator_at(range.min()),
-                        utf8::next(buffer.iterator_at(range.max())));
+                        utf8::next(buffer.iterator_at(range.max()), buffer.end()));
 }
 
 inline CharCount char_length(const Buffer& buffer, const Selection& range)
 {
     return utf8::distance(buffer.iterator_at(range.min()),
-                          utf8::next(buffer.iterator_at(range.max())));
+                          utf8::next(buffer.iterator_at(range.max()), buffer.end()));
 }
 
 CharCount get_column(const Buffer& buffer,
diff --git a/src/highlighters.cc b/src/highlighters.cc
index 51b8111f..49f5199a 100644
--- a/src/highlighters.cc
+++ b/src/highlighters.cc
@@ -602,8 +602,8 @@ void expand_unprintable(const Context& context, HighlightFlags flags, DisplayBuf
                 for (auto it  = buffer.iterator_at(atom_it->begin()),
                           end = buffer.iterator_at(atom_it->end()); it < end;)
                 {
-                    Codepoint cp = utf8::codepoint<utf8::InvalidBytePolicy::Pass>(it);
-                    auto next = utf8::next(it);
+                    Codepoint cp = utf8::codepoint<utf8::InvalidPolicy::Pass>(it, end);
+                    auto next = utf8::next(it, end);
                     if (cp != '\n' and not iswprint(cp))
                     {
                         std::ostringstream oss;
diff --git a/src/insert_completer.cc b/src/insert_completer.cc
index aab807f1..2acc5447 100644
--- a/src/insert_completer.cc
+++ b/src/insert_completer.cc
@@ -32,7 +32,7 @@ template<bool other_buffers>
 InsertCompletion complete_word(const Buffer& buffer, ByteCoord cursor_pos)
 {
    auto pos = buffer.iterator_at(cursor_pos);
-   if (pos == buffer.begin() or not is_word(*utf8::previous(pos)))
+   if (pos == buffer.begin() or not is_word(*utf8::previous(pos, buffer.begin())))
        return {};
 
     auto end = buffer.iterator_at(cursor_pos);
diff --git a/src/keys.cc b/src/keys.cc
index 5dc5a1b9..4d6d5bc3 100644
--- a/src/keys.cc
+++ b/src/keys.cc
@@ -41,7 +41,7 @@ static const KeyAndName keynamemap[] = {
 KeyList parse_keys(StringView str)
 {
     KeyList result;
-    using PassPolicy = utf8::InvalidBytePolicy::Pass;
+    using PassPolicy = utf8::InvalidPolicy::Pass;
     using Utf8It = utf8::iterator<const char*, PassPolicy>;
     for (Utf8It it = str.begin(), str_end = str.end(); it < str_end; ++it)
     {
@@ -71,7 +71,7 @@ KeyList parse_keys(StringView str)
                 }
                 if (keyname.char_length() == 1)
                 {
-                    result.push_back(Key{ modifier, utf8::codepoint<PassPolicy>(keyname.begin()) });
+                    result.push_back(Key{ modifier, utf8::codepoint<PassPolicy>(keyname.begin(),keyname.end()) });
                     it = end_it;
                     continue;
                 }
diff --git a/src/ncurses.cc b/src/ncurses.cc
index 0b571fd3..cd5bfde4 100644
--- a/src/ncurses.cc
+++ b/src/ncurses.cc
@@ -207,7 +207,7 @@ void NCursesUI::refresh()
     m_dirty = false;
 }
 
-using Utf8Policy = utf8::InvalidBytePolicy::Pass;
+using Utf8Policy = utf8::InvalidPolicy::Pass;
 using Utf8Iterator = utf8::iterator<const char*, Utf8Policy>;
 void addutf8str(WINDOW* win, Utf8Iterator begin, Utf8Iterator end)
 {
@@ -408,8 +408,9 @@ Key NCursesUI::get_key()
             int operator*() { return getch(); }
             getch_iterator& operator++() { return *this; }
             getch_iterator& operator++(int) { return *this; }
+            bool operator== (const getch_iterator&) const { return false; }
        };
-       return utf8::codepoint(getch_iterator{});
+       return utf8::codepoint(getch_iterator{}, getch_iterator{});
     }
     return Key::Invalid;
 }
diff --git a/src/normal.cc b/src/normal.cc
index 6a5468b4..5c073220 100644
--- a/src/normal.cc
+++ b/src/normal.cc
@@ -719,7 +719,7 @@ void keep(Context& context, int)
         for (auto& sel : context.selections())
         {
             if (boost::regex_search(buffer.iterator_at(sel.min()),
-                                    utf8::next(buffer.iterator_at(sel.max())), ex) == matching)
+                                    utf8::next(buffer.iterator_at(sel.max()), buffer.end()), ex) == matching)
                 keep.push_back(sel);
         }
         if (keep.empty())
diff --git a/src/selection.cc b/src/selection.cc
index d8808054..bd18bf85 100644
--- a/src/selection.cc
+++ b/src/selection.cc
@@ -446,7 +446,7 @@ BufferIterator prepare_insert(Buffer& buffer, const Selection& sel, InsertMode m
     {
         // special case for end of lines, append to current line instead
         auto pos = buffer.iterator_at(sel.max());
-        return *pos == '\n' ? pos : utf8::next(pos);
+        return *pos == '\n' ? pos : utf8::next(pos, buffer.end());
     }
     case InsertMode::InsertAtLineBegin:
         return buffer.iterator_at(sel.min().line);
diff --git a/src/selectors.cc b/src/selectors.cc
index b2b08073..7c05bc36 100644
--- a/src/selectors.cc
+++ b/src/selectors.cc
@@ -494,7 +494,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex)
     auto& buffer = selections.buffer();
     for (auto& sel : selections)
     {
-        auto sel_end = utf8::next(buffer.iterator_at(sel.max()));
+        auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
         RegexIterator re_it(buffer.iterator_at(sel.min()), sel_end, regex);
         RegexIterator re_end;
 
@@ -511,7 +511,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex)
                 captures.emplace_back(match.first, match.second);
 
             result.push_back({ begin.coord(),
-                               (begin == end ? end : utf8::previous(end)).coord(),
+                               (begin == end ? end : utf8::previous(end, begin)).coord(),
                                std::move(captures) });
         }
     }
@@ -527,7 +527,7 @@ void split_selections(SelectionList& selections, const Regex& regex)
     for (auto& sel : selections)
     {
         auto begin = buffer.iterator_at(sel.min());
-        auto sel_end = utf8::next(buffer.iterator_at(sel.max()));
+        auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
         RegexIterator re_it(begin, sel_end, regex,
                             boost::regex_constants::match_nosubs);
         RegexIterator re_end;
@@ -536,7 +536,7 @@ void split_selections(SelectionList& selections, const Regex& regex)
         {
             BufferIterator end = (*re_it)[0].first;
 
-            result.push_back({ begin.coord(), (begin == end) ? end.coord() : utf8::previous(end).coord() });
+            result.push_back({ begin.coord(), (begin == end) ? end.coord() : utf8::previous(end, begin).coord() });
             begin = (*re_it)[0].second;
         }
         if (begin.coord() <= sel.max())
diff --git a/src/selectors.hh b/src/selectors.hh
index 5471a5c9..191a15dc 100644
--- a/src/selectors.hh
+++ b/src/selectors.hh
@@ -50,7 +50,7 @@ inline void remove_selection(SelectionList& selections, int index)
     selections.check_invariant();
 }
 
-using Utf8Iterator = utf8::iterator<BufferIterator, utf8::InvalidBytePolicy::Pass>;
+using Utf8Iterator = utf8::iterator<BufferIterator, utf8::InvalidPolicy::Pass>;
 
 inline Selection utf8_range(const Utf8Iterator& first, const Utf8Iterator& last)
 {
@@ -265,7 +265,7 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege
     CaptureList captures;
     MatchResults matches;
     bool found = false;
-    if ((found = find_match_in_buffer<direction>(buffer, utf8::next(begin), matches, regex)))
+    if ((found = find_match_in_buffer<direction>(buffer, utf8::next(begin, buffer.end()), matches, regex)))
     {
         begin = matches[0].first;
         end   = matches[0].second;
@@ -275,7 +275,7 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege
     if (not found or begin == buffer.end())
         throw runtime_error("'" + regex.str() + "': no matches found");
 
-    end = (begin == end) ? end : utf8::previous(end);
+    end = (begin == end) ? end : utf8::previous(end, begin);
     if (direction == Backward)
         std::swap(begin, end);
 
diff --git a/src/unit_tests.cc b/src/unit_tests.cc
index 85038d06..3795b8a5 100644
--- a/src/unit_tests.cc
+++ b/src/unit_tests.cc
@@ -99,7 +99,7 @@ void test_utf8()
 {
     String str = "maïs mélange bientôt";
     kak_assert(utf8::distance(str.begin(), str.end()) == 20);
-    kak_assert(utf8::codepoint(str.begin() + 2) == 0x00EF);
+    kak_assert(utf8::codepoint(str.begin() + 2, str.end()) == 0x00EF);
 }
 
 void test_string()
diff --git a/src/utf8.hh b/src/utf8.hh
index 60460a7a..116c225e 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -15,10 +15,10 @@ namespace utf8
 
 // returns an iterator to next character first byte
 template<typename Iterator>
-Iterator next(Iterator it)
+Iterator next(Iterator it, Iterator end)
 {
-    if (*it++ & 0x80)
-        while ((*(it) & 0xC0) == 0x80)
+    if (it != end and *it++ & 0x80)
+        while (it != end and (*(it) & 0xC0) == 0x80)
             ++it;
     return it;
 }
@@ -26,18 +26,18 @@ Iterator next(Iterator it)
 // returns it's parameter if it points to a character first byte,
 // or else returns next character first byte
 template<typename Iterator>
-Iterator finish(Iterator it)
+Iterator finish(Iterator it, Iterator end)
 {
-    while ((*(it) & 0xC0) == 0x80)
+    while (it != end and (*(it) & 0xC0) == 0x80)
         ++it;
     return it;
 }
 
 // returns an iterator to the previous character first byte
 template<typename Iterator>
-Iterator previous(Iterator it)
+Iterator previous(Iterator it, Iterator begin)
 {
-    while ((*(--it) & 0xC0) == 0x80)
+    while (it != begin and (*(--it) & 0xC0) == 0x80)
            ;
     return it;
 }
@@ -51,12 +51,12 @@ Iterator advance(Iterator it, Iterator end, CharCount d)
     if (d < 0)
     {
        while (it != end and d++)
-           it = utf8::previous(it);
+           it = utf8::previous(it, end);
     }
     else
     {
         while (it != end and d--)
-           it = utf8::next(it);
+           it = utf8::next(it, end);
     }
     return it;
 }
@@ -83,65 +83,72 @@ inline bool is_character_start(char c)
 
 // returns an iterator to the first byte of the character it is into
 template<typename Iterator>
-Iterator character_start(Iterator it)
+Iterator character_start(Iterator it, Iterator begin)
 {
-    while (not is_character_start(*it))
+    while (it != begin and not is_character_start(*it))
         --it;
     return it;
 }
 
-namespace InvalidBytePolicy
+namespace InvalidPolicy
 {
 
 struct Assert
 {
-    Codepoint operator()(unsigned char byte) const { kak_assert(false); return byte; }
+    Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; }
 };
 
 struct Pass
 {
-    Codepoint operator()(unsigned char byte) const { return byte; }
+    Codepoint operator()(Codepoint cp) const { return cp; }
 };
 
 }
 
 // returns the codepoint of the character whose first byte
 // is pointed by it
-template<typename InvalidPolicy = InvalidBytePolicy::Assert,
+template<typename InvalidPolicy = utf8::InvalidPolicy::Assert,
          typename Iterator>
-Codepoint codepoint(Iterator it)
+Codepoint codepoint(Iterator it, Iterator end)
 {
+    if (it == end)
+        return InvalidPolicy{}(-1);
     // According to rfc3629, UTF-8 allows only up to 4 bytes.
     // (21 bits codepoint)
-    Codepoint cp;
     unsigned char byte = *it++;
     if (not (byte & 0x80)) // 0xxxxxxx
-        cp = byte;
-    else if ((byte & 0xE0) == 0xC0) // 110xxxxx
+        return byte;
+
+    if (it == end)
+        return InvalidPolicy{}(byte);
+
+    if ((byte & 0xE0) == 0xC0) // 110xxxxx
+        return ((byte & 0x1F) << 6) | (*it & 0x3F);
+
+    if ((byte & 0xF0) == 0xE0) // 1110xxxx
     {
-        cp = ((byte & 0x1F) << 6) | (*it & 0x3F);
+        Codepoint cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6);
+        if (it == end)
+            return InvalidPolicy{}(cp);
+        return cp | (*it & 0x3F);
     }
-    else if ((byte & 0xF0) == 0xE0) // 1110xxxx
+
+    if ((byte & 0xF8) == 0xF0) // 11110xxx
     {
-        cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6);
-        cp |= (*it & 0x3F);
-    }
-    else if ((byte & 0xF8) == 0xF0) // 11110xxx
-    {
-        cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
+        Codepoint cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
+        if (it == end)
+            return InvalidPolicy{}(cp);
         cp |= (*it++ & 0x3F) << 6;
-        cp |= (*it & 0x3F);
+        if (it == end)
+            return InvalidPolicy{}(cp);
+        return cp | (*it & 0x3F);
     }
-    else
-        cp = InvalidPolicy{}(byte);
-    return cp;
+    return InvalidPolicy{}(byte);
 }
 
-template<typename InvalidPolicy = InvalidBytePolicy::Assert,
-         typename Iterator>
-ByteCount codepoint_size(Iterator it)
+template<typename InvalidPolicy = utf8::InvalidPolicy::Assert>
+ByteCount codepoint_size(char byte)
 {
-    unsigned char byte = *it;
     if (not (byte & 0x80)) // 0xxxxxxx
         return 1;
     else if ((byte & 0xE0) == 0xC0) // 110xxxxx
diff --git a/src/utf8_iterator.hh b/src/utf8_iterator.hh
index c7ea102f..d2d8fa6a 100644
--- a/src/utf8_iterator.hh
+++ b/src/utf8_iterator.hh
@@ -12,7 +12,7 @@ namespace utf8
 // adapter for an iterator on bytes which permits to iterate
 // on unicode codepoints instead.
 template<typename Iterator,
-         typename InvalidPolicy = InvalidBytePolicy::Assert>
+         typename InvalidPolicy = utf8::InvalidPolicy::Assert>
 class iterator
 {
 public:
@@ -21,7 +21,7 @@ public:
 
     iterator& operator++()
     {
-        m_it = utf8::next(m_it);
+        m_it = utf8::next(m_it, Iterator{});
         invalidate_value();
         return *this;
     }
@@ -41,7 +41,7 @@ public:
 
     iterator& operator--()
     {
-        m_it = utf8::previous(m_it);
+        m_it = utf8::previous(m_it, Iterator{});
         invalidate_value();
         return *this;
     }
@@ -132,7 +132,7 @@ private:
     Codepoint get_value() const
     {
         if (m_value == -1)
-            m_value = utf8::codepoint<InvalidPolicy>(m_it);
+            m_value = utf8::codepoint<InvalidPolicy>(m_it, Iterator{});
         return m_value;
     }
 
@@ -140,7 +140,7 @@ private:
     mutable Codepoint m_value = -1;
 };
 
-template<typename InvalidPolicy = InvalidBytePolicy::Assert, typename Iterator>
+template<typename InvalidPolicy = utf8::InvalidPolicy::Assert, typename Iterator>
 iterator<Iterator, InvalidPolicy> make_iterator(Iterator it)
 {
     return iterator<Iterator, InvalidPolicy>{std::move(it)};
diff --git a/src/word_db.cc b/src/word_db.cc
index b120cb66..dc08f102 100644
--- a/src/word_db.cc
+++ b/src/word_db.cc
@@ -10,7 +10,7 @@ namespace Kakoune
 static std::vector<String> get_words(StringView content)
 {
     std::vector<String> res;
-    using Iterator = utf8::iterator<const char*, utf8::InvalidBytePolicy::Pass>;
+    using Iterator = utf8::iterator<const char*, utf8::InvalidPolicy::Pass>;
     const char* word_start = content.begin();
     bool in_word = false;
     for (Iterator it{word_start}, end{content.end()}; it != end; ++it)