utf8: use end of sequence iterators for more security

This commit is contained in:
Maxime Coste 2014-07-02 21:14:01 +01:00
parent 3f70d91f8c
commit ed68d1ff28
15 changed files with 71 additions and 63 deletions

View File

@ -452,7 +452,7 @@ ByteCoord Buffer::char_next(ByteCoord coord) const
if (coord.column < m_lines[coord.line].length() - 1) if (coord.column < m_lines[coord.line].length() - 1)
{ {
auto& line = m_lines[coord.line]; auto& line = m_lines[coord.line];
coord.column += utf8::codepoint_size(line.begin() + (int)coord.column); coord.column += utf8::codepoint_size(line[(int)coord.column]);
// Handle invalid utf-8 // Handle invalid utf-8
if (coord.column >= line.length()) if (coord.column >= line.length())
{ {
@ -483,7 +483,7 @@ ByteCoord Buffer::char_prev(ByteCoord coord) const
else else
{ {
auto& line = m_lines[coord.line]; auto& line = m_lines[coord.line];
coord.column = (int)(utf8::character_start(line.begin() + (int)coord.column - 1) - line.begin()); coord.column = (int)(utf8::character_start(line.begin() + (int)coord.column - 1, line.begin()) - line.begin());
} }
return coord; return coord;
} }

View File

@ -14,7 +14,7 @@ CharCount get_column(const Buffer& buffer,
auto col = 0_char; auto col = 0_char;
for (auto it = line.begin(); for (auto it = line.begin();
it != line.end() and coord.column > (int)(it - line.begin()); it != line.end() and coord.column > (int)(it - line.begin());
it = utf8::next(it)) it = utf8::next(it, line.end()))
{ {
if (*it == '\t') if (*it == '\t')
col = (col / tabstop + 1) * tabstop; col = (col / tabstop + 1) * tabstop;

View File

@ -15,13 +15,13 @@ inline String content(const Buffer& buffer, const Selection& range)
inline BufferIterator erase(Buffer& buffer, const Selection& range) inline BufferIterator erase(Buffer& buffer, const Selection& range)
{ {
return buffer.erase(buffer.iterator_at(range.min()), return buffer.erase(buffer.iterator_at(range.min()),
utf8::next(buffer.iterator_at(range.max()))); utf8::next(buffer.iterator_at(range.max()), buffer.end()));
} }
inline CharCount char_length(const Buffer& buffer, const Selection& range) inline CharCount char_length(const Buffer& buffer, const Selection& range)
{ {
return utf8::distance(buffer.iterator_at(range.min()), return utf8::distance(buffer.iterator_at(range.min()),
utf8::next(buffer.iterator_at(range.max()))); utf8::next(buffer.iterator_at(range.max()), buffer.end()));
} }
CharCount get_column(const Buffer& buffer, CharCount get_column(const Buffer& buffer,

View File

@ -602,8 +602,8 @@ void expand_unprintable(const Context& context, HighlightFlags flags, DisplayBuf
for (auto it = buffer.iterator_at(atom_it->begin()), for (auto it = buffer.iterator_at(atom_it->begin()),
end = buffer.iterator_at(atom_it->end()); it < end;) end = buffer.iterator_at(atom_it->end()); it < end;)
{ {
Codepoint cp = utf8::codepoint<utf8::InvalidBytePolicy::Pass>(it); Codepoint cp = utf8::codepoint<utf8::InvalidPolicy::Pass>(it, end);
auto next = utf8::next(it); auto next = utf8::next(it, end);
if (cp != '\n' and not iswprint(cp)) if (cp != '\n' and not iswprint(cp))
{ {
std::ostringstream oss; std::ostringstream oss;

View File

@ -32,7 +32,7 @@ template<bool other_buffers>
InsertCompletion complete_word(const Buffer& buffer, ByteCoord cursor_pos) InsertCompletion complete_word(const Buffer& buffer, ByteCoord cursor_pos)
{ {
auto pos = buffer.iterator_at(cursor_pos); auto pos = buffer.iterator_at(cursor_pos);
if (pos == buffer.begin() or not is_word(*utf8::previous(pos))) if (pos == buffer.begin() or not is_word(*utf8::previous(pos, buffer.begin())))
return {}; return {};
auto end = buffer.iterator_at(cursor_pos); auto end = buffer.iterator_at(cursor_pos);

View File

@ -41,7 +41,7 @@ static const KeyAndName keynamemap[] = {
KeyList parse_keys(StringView str) KeyList parse_keys(StringView str)
{ {
KeyList result; KeyList result;
using PassPolicy = utf8::InvalidBytePolicy::Pass; using PassPolicy = utf8::InvalidPolicy::Pass;
using Utf8It = utf8::iterator<const char*, PassPolicy>; using Utf8It = utf8::iterator<const char*, PassPolicy>;
for (Utf8It it = str.begin(), str_end = str.end(); it < str_end; ++it) for (Utf8It it = str.begin(), str_end = str.end(); it < str_end; ++it)
{ {
@ -71,7 +71,7 @@ KeyList parse_keys(StringView str)
} }
if (keyname.char_length() == 1) if (keyname.char_length() == 1)
{ {
result.push_back(Key{ modifier, utf8::codepoint<PassPolicy>(keyname.begin()) }); result.push_back(Key{ modifier, utf8::codepoint<PassPolicy>(keyname.begin(),keyname.end()) });
it = end_it; it = end_it;
continue; continue;
} }

View File

@ -207,7 +207,7 @@ void NCursesUI::refresh()
m_dirty = false; m_dirty = false;
} }
using Utf8Policy = utf8::InvalidBytePolicy::Pass; using Utf8Policy = utf8::InvalidPolicy::Pass;
using Utf8Iterator = utf8::iterator<const char*, Utf8Policy>; using Utf8Iterator = utf8::iterator<const char*, Utf8Policy>;
void addutf8str(WINDOW* win, Utf8Iterator begin, Utf8Iterator end) void addutf8str(WINDOW* win, Utf8Iterator begin, Utf8Iterator end)
{ {
@ -408,8 +408,9 @@ Key NCursesUI::get_key()
int operator*() { return getch(); } int operator*() { return getch(); }
getch_iterator& operator++() { return *this; } getch_iterator& operator++() { return *this; }
getch_iterator& operator++(int) { return *this; } getch_iterator& operator++(int) { return *this; }
bool operator== (const getch_iterator&) const { return false; }
}; };
return utf8::codepoint(getch_iterator{}); return utf8::codepoint(getch_iterator{}, getch_iterator{});
} }
return Key::Invalid; return Key::Invalid;
} }

View File

@ -719,7 +719,7 @@ void keep(Context& context, int)
for (auto& sel : context.selections()) for (auto& sel : context.selections())
{ {
if (boost::regex_search(buffer.iterator_at(sel.min()), if (boost::regex_search(buffer.iterator_at(sel.min()),
utf8::next(buffer.iterator_at(sel.max())), ex) == matching) utf8::next(buffer.iterator_at(sel.max()), buffer.end()), ex) == matching)
keep.push_back(sel); keep.push_back(sel);
} }
if (keep.empty()) if (keep.empty())

View File

@ -446,7 +446,7 @@ BufferIterator prepare_insert(Buffer& buffer, const Selection& sel, InsertMode m
{ {
// special case for end of lines, append to current line instead // special case for end of lines, append to current line instead
auto pos = buffer.iterator_at(sel.max()); auto pos = buffer.iterator_at(sel.max());
return *pos == '\n' ? pos : utf8::next(pos); return *pos == '\n' ? pos : utf8::next(pos, buffer.end());
} }
case InsertMode::InsertAtLineBegin: case InsertMode::InsertAtLineBegin:
return buffer.iterator_at(sel.min().line); return buffer.iterator_at(sel.min().line);

View File

@ -494,7 +494,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex)
auto& buffer = selections.buffer(); auto& buffer = selections.buffer();
for (auto& sel : selections) for (auto& sel : selections)
{ {
auto sel_end = utf8::next(buffer.iterator_at(sel.max())); auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
RegexIterator re_it(buffer.iterator_at(sel.min()), sel_end, regex); RegexIterator re_it(buffer.iterator_at(sel.min()), sel_end, regex);
RegexIterator re_end; RegexIterator re_end;
@ -511,7 +511,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex)
captures.emplace_back(match.first, match.second); captures.emplace_back(match.first, match.second);
result.push_back({ begin.coord(), result.push_back({ begin.coord(),
(begin == end ? end : utf8::previous(end)).coord(), (begin == end ? end : utf8::previous(end, begin)).coord(),
std::move(captures) }); std::move(captures) });
} }
} }
@ -527,7 +527,7 @@ void split_selections(SelectionList& selections, const Regex& regex)
for (auto& sel : selections) for (auto& sel : selections)
{ {
auto begin = buffer.iterator_at(sel.min()); auto begin = buffer.iterator_at(sel.min());
auto sel_end = utf8::next(buffer.iterator_at(sel.max())); auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
RegexIterator re_it(begin, sel_end, regex, RegexIterator re_it(begin, sel_end, regex,
boost::regex_constants::match_nosubs); boost::regex_constants::match_nosubs);
RegexIterator re_end; RegexIterator re_end;
@ -536,7 +536,7 @@ void split_selections(SelectionList& selections, const Regex& regex)
{ {
BufferIterator end = (*re_it)[0].first; BufferIterator end = (*re_it)[0].first;
result.push_back({ begin.coord(), (begin == end) ? end.coord() : utf8::previous(end).coord() }); result.push_back({ begin.coord(), (begin == end) ? end.coord() : utf8::previous(end, begin).coord() });
begin = (*re_it)[0].second; begin = (*re_it)[0].second;
} }
if (begin.coord() <= sel.max()) if (begin.coord() <= sel.max())

View File

@ -50,7 +50,7 @@ inline void remove_selection(SelectionList& selections, int index)
selections.check_invariant(); selections.check_invariant();
} }
using Utf8Iterator = utf8::iterator<BufferIterator, utf8::InvalidBytePolicy::Pass>; using Utf8Iterator = utf8::iterator<BufferIterator, utf8::InvalidPolicy::Pass>;
inline Selection utf8_range(const Utf8Iterator& first, const Utf8Iterator& last) inline Selection utf8_range(const Utf8Iterator& first, const Utf8Iterator& last)
{ {
@ -265,7 +265,7 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege
CaptureList captures; CaptureList captures;
MatchResults matches; MatchResults matches;
bool found = false; bool found = false;
if ((found = find_match_in_buffer<direction>(buffer, utf8::next(begin), matches, regex))) if ((found = find_match_in_buffer<direction>(buffer, utf8::next(begin, buffer.end()), matches, regex)))
{ {
begin = matches[0].first; begin = matches[0].first;
end = matches[0].second; end = matches[0].second;
@ -275,7 +275,7 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege
if (not found or begin == buffer.end()) if (not found or begin == buffer.end())
throw runtime_error("'" + regex.str() + "': no matches found"); throw runtime_error("'" + regex.str() + "': no matches found");
end = (begin == end) ? end : utf8::previous(end); end = (begin == end) ? end : utf8::previous(end, begin);
if (direction == Backward) if (direction == Backward)
std::swap(begin, end); std::swap(begin, end);

View File

@ -99,7 +99,7 @@ void test_utf8()
{ {
String str = "maïs mélange bientôt"; String str = "maïs mélange bientôt";
kak_assert(utf8::distance(str.begin(), str.end()) == 20); kak_assert(utf8::distance(str.begin(), str.end()) == 20);
kak_assert(utf8::codepoint(str.begin() + 2) == 0x00EF); kak_assert(utf8::codepoint(str.begin() + 2, str.end()) == 0x00EF);
} }
void test_string() void test_string()

View File

@ -15,10 +15,10 @@ namespace utf8
// returns an iterator to next character first byte // returns an iterator to next character first byte
template<typename Iterator> template<typename Iterator>
Iterator next(Iterator it) Iterator next(Iterator it, Iterator end)
{ {
if (*it++ & 0x80) if (it != end and *it++ & 0x80)
while ((*(it) & 0xC0) == 0x80) while (it != end and (*(it) & 0xC0) == 0x80)
++it; ++it;
return it; return it;
} }
@ -26,18 +26,18 @@ Iterator next(Iterator it)
// returns its parameter if it points to a character first byte, // returns its parameter if it points to a character first byte,
// or else returns next character first byte // or else returns next character first byte
template<typename Iterator> template<typename Iterator>
Iterator finish(Iterator it) Iterator finish(Iterator it, Iterator end)
{ {
while ((*(it) & 0xC0) == 0x80) while (it != end and (*(it) & 0xC0) == 0x80)
++it; ++it;
return it; return it;
} }
// returns an iterator to the previous character first byte // returns an iterator to the previous character first byte
template<typename Iterator> template<typename Iterator>
Iterator previous(Iterator it) Iterator previous(Iterator it, Iterator begin)
{ {
while ((*(--it) & 0xC0) == 0x80) while (it != begin and (*(--it) & 0xC0) == 0x80)
; ;
return it; return it;
} }
@ -51,12 +51,12 @@ Iterator advance(Iterator it, Iterator end, CharCount d)
if (d < 0) if (d < 0)
{ {
while (it != end and d++) while (it != end and d++)
it = utf8::previous(it); it = utf8::previous(it, end);
} }
else else
{ {
while (it != end and d--) while (it != end and d--)
it = utf8::next(it); it = utf8::next(it, end);
} }
return it; return it;
} }
@ -83,65 +83,72 @@ inline bool is_character_start(char c)
// returns an iterator to the first byte of the character it points into // returns an iterator to the first byte of the character it points into
template<typename Iterator> template<typename Iterator>
Iterator character_start(Iterator it) Iterator character_start(Iterator it, Iterator begin)
{ {
while (not is_character_start(*it)) while (it != begin and not is_character_start(*it))
--it; --it;
return it; return it;
} }
namespace InvalidBytePolicy namespace InvalidPolicy
{ {
struct Assert struct Assert
{ {
Codepoint operator()(unsigned char byte) const { kak_assert(false); return byte; } Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; }
}; };
struct Pass struct Pass
{ {
Codepoint operator()(unsigned char byte) const { return byte; } Codepoint operator()(Codepoint cp) const { return cp; }
}; };
} }
// returns the codepoint of the character whose first byte // returns the codepoint of the character whose first byte
// is pointed to by it // is pointed to by it
template<typename InvalidPolicy = InvalidBytePolicy::Assert, template<typename InvalidPolicy = utf8::InvalidPolicy::Assert,
typename Iterator> typename Iterator>
Codepoint codepoint(Iterator it) Codepoint codepoint(Iterator it, Iterator end)
{ {
if (it == end)
return InvalidPolicy{}(-1);
// According to rfc3629, UTF-8 allows only up to 4 bytes. // According to rfc3629, UTF-8 allows only up to 4 bytes.
// (21 bits codepoint) // (21 bits codepoint)
Codepoint cp;
unsigned char byte = *it++; unsigned char byte = *it++;
if (not (byte & 0x80)) // 0xxxxxxx if (not (byte & 0x80)) // 0xxxxxxx
cp = byte; return byte;
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
if (it == end)
return InvalidPolicy{}(byte);
if ((byte & 0xE0) == 0xC0) // 110xxxxx
return ((byte & 0x1F) << 6) | (*it & 0x3F);
if ((byte & 0xF0) == 0xE0) // 1110xxxx
{ {
cp = ((byte & 0x1F) << 6) | (*it & 0x3F); Codepoint cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6);
if (it == end)
return InvalidPolicy{}(cp);
return cp | (*it & 0x3F);
} }
else if ((byte & 0xF0) == 0xE0) // 1110xxxx
if ((byte & 0xF8) == 0xF0) // 11110xxx
{ {
cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6); Codepoint cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
cp |= (*it & 0x3F); if (it == end)
} return InvalidPolicy{}(cp);
else if ((byte & 0xF8) == 0xF0) // 11110xxx
{
cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
cp |= (*it++ & 0x3F) << 6; cp |= (*it++ & 0x3F) << 6;
cp |= (*it & 0x3F); if (it == end)
return InvalidPolicy{}(cp);
return cp | (*it & 0x3F);
} }
else return InvalidPolicy{}(byte);
cp = InvalidPolicy{}(byte);
return cp;
} }
template<typename InvalidPolicy = InvalidBytePolicy::Assert, template<typename InvalidPolicy = utf8::InvalidPolicy::Assert>
typename Iterator> ByteCount codepoint_size(char byte)
ByteCount codepoint_size(Iterator it)
{ {
unsigned char byte = *it;
if (not (byte & 0x80)) // 0xxxxxxx if (not (byte & 0x80)) // 0xxxxxxx
return 1; return 1;
else if ((byte & 0xE0) == 0xC0) // 110xxxxx else if ((byte & 0xE0) == 0xC0) // 110xxxxx

View File

@ -12,7 +12,7 @@ namespace utf8
// adapter for an iterator on bytes which allows iterating // adapter for an iterator on bytes which allows iterating
// on unicode codepoints instead. // on unicode codepoints instead.
template<typename Iterator, template<typename Iterator,
typename InvalidPolicy = InvalidBytePolicy::Assert> typename InvalidPolicy = utf8::InvalidPolicy::Assert>
class iterator class iterator
{ {
public: public:
@ -21,7 +21,7 @@ public:
iterator& operator++() iterator& operator++()
{ {
m_it = utf8::next(m_it); m_it = utf8::next(m_it, Iterator{});
invalidate_value(); invalidate_value();
return *this; return *this;
} }
@ -41,7 +41,7 @@ public:
iterator& operator--() iterator& operator--()
{ {
m_it = utf8::previous(m_it); m_it = utf8::previous(m_it, Iterator{});
invalidate_value(); invalidate_value();
return *this; return *this;
} }
@ -132,7 +132,7 @@ private:
Codepoint get_value() const Codepoint get_value() const
{ {
if (m_value == -1) if (m_value == -1)
m_value = utf8::codepoint<InvalidPolicy>(m_it); m_value = utf8::codepoint<InvalidPolicy>(m_it, Iterator{});
return m_value; return m_value;
} }
@ -140,7 +140,7 @@ private:
mutable Codepoint m_value = -1; mutable Codepoint m_value = -1;
}; };
template<typename InvalidPolicy = InvalidBytePolicy::Assert, typename Iterator> template<typename InvalidPolicy = utf8::InvalidPolicy::Assert, typename Iterator>
iterator<Iterator, InvalidPolicy> make_iterator(Iterator it) iterator<Iterator, InvalidPolicy> make_iterator(Iterator it)
{ {
return iterator<Iterator, InvalidPolicy>{std::move(it)}; return iterator<Iterator, InvalidPolicy>{std::move(it)};

View File

@ -10,7 +10,7 @@ namespace Kakoune
static std::vector<String> get_words(StringView content) static std::vector<String> get_words(StringView content)
{ {
std::vector<String> res; std::vector<String> res;
using Iterator = utf8::iterator<const char*, utf8::InvalidBytePolicy::Pass>; using Iterator = utf8::iterator<const char*, utf8::InvalidPolicy::Pass>;
const char* word_start = content.begin(); const char* word_start = content.begin();
bool in_word = false; bool in_word = false;
for (Iterator it{word_start}, end{content.end()}; it != end; ++it) for (Iterator it{word_start}, end{content.end()}; it != end; ++it)