utf8 awareness in selectors

This commit is contained in:
Maxime Coste 2012-10-08 19:12:09 +02:00
parent ed592a3005
commit 0a686573ab
2 changed files with 99 additions and 75 deletions

View File

@ -2,37 +2,42 @@
#include "string.hh" #include "string.hh"
#include "utf8_iterator.hh"
#include <algorithm> #include <algorithm>
namespace Kakoune namespace Kakoune
{ {
using utf8::Codepoint;
using Utf8Iterator = utf8::utf8_iterator<BufferIterator>;
namespace namespace
{ {
bool is_eol(char c) bool is_eol(Codepoint c)
{ {
return c == '\n'; return c == '\n';
} }
bool is_blank(char c) bool is_blank(Codepoint c)
{ {
return c == ' ' or c == '\t'; return c == ' ' or c == '\t';
} }
template<bool punctuation_is_word = false> template<bool punctuation_is_word = false>
bool is_word(char c) bool is_word(Codepoint c)
{ {
return Kakoune::is_word(c); return Kakoune::is_word(c);
} }
template<> template<>
bool is_word<true>(char c) bool is_word<true>(Codepoint c)
{ {
return !is_blank(c) and !is_eol(c); return !is_blank(c) and !is_eol(c);
} }
static bool is_punctuation(char c) static bool is_punctuation(Codepoint c)
{ {
return not (is_word(c) or is_blank(c) or is_eol(c)); return not (is_word(c) or is_blank(c) or is_eol(c));
} }
@ -46,7 +51,7 @@ enum class CharCategories
}; };
template<bool punctuation_is_word = false> template<bool punctuation_is_word = false>
CharCategories categorize(char c) CharCategories categorize(Codepoint c)
{ {
if (is_word(c)) if (is_word(c))
return CharCategories::Word; return CharCategories::Word;
@ -58,20 +63,31 @@ CharCategories categorize(char c)
: CharCategories::Punctuation; : CharCategories::Punctuation;
} }
template<typename T> bool is_begin(const BufferIterator& it) { return it.is_begin(); }
bool skip_while(BufferIterator& it, T condition) bool is_end(const BufferIterator& it) { return it.is_end(); }
bool is_begin(const Utf8Iterator& it) { return it.underlying_iterator().is_begin(); }
bool is_end(const Utf8Iterator& it) { return it.underlying_iterator().is_end(); }
template<typename Iterator, typename T>
bool skip_while(Iterator& it, T condition)
{ {
while (not it.is_end() and condition(*it)) while (not is_end(it) and condition(*it))
++it; ++it;
return not it.is_end() and condition(*it); return not is_end(it) and condition(*it);
} }
template<typename T> template<typename Iterator, typename T>
bool skip_while_reverse(BufferIterator& it, T condition) bool skip_while_reverse(Iterator& it, T condition)
{ {
while (not it.is_begin() and condition(*it)) while (not is_begin(it) and condition(*it))
--it; --it;
return not it.is_end() and condition(*it); return not is_end(it) and condition(*it);
}
Selection utf8_selection(const Utf8Iterator& first, const Utf8Iterator& last)
{
return Selection(first.underlying_iterator(), last.underlying_iterator());
} }
} }
@ -81,13 +97,13 @@ typedef boost::regex_iterator<BufferIterator> RegexIterator;
template<bool punctuation_is_word> template<bool punctuation_is_word>
SelectionAndCaptures select_to_next_word(const Selection& selection) SelectionAndCaptures select_to_next_word(const Selection& selection)
{ {
BufferIterator begin = selection.last(); Utf8Iterator begin = selection.last();
if (categorize<punctuation_is_word>(*begin) != if (categorize<punctuation_is_word>(*begin) !=
categorize<punctuation_is_word>(*(begin+1))) categorize<punctuation_is_word>(*(begin+1)))
++begin; ++begin;
skip_while(begin, is_eol); skip_while(begin, is_eol);
BufferIterator end = begin+1; Utf8Iterator end = begin+1;
if (not punctuation_is_word and is_punctuation(*begin)) if (not punctuation_is_word and is_punctuation(*begin))
skip_while(end, is_punctuation); skip_while(end, is_punctuation);
@ -96,7 +112,7 @@ SelectionAndCaptures select_to_next_word(const Selection& selection)
bool with_end = skip_while(end, is_blank); bool with_end = skip_while(end, is_blank);
return Selection(begin, with_end ? end : end-1); return utf8_selection(begin, with_end ? end : end-1);
} }
template SelectionAndCaptures select_to_next_word<false>(const Selection&); template SelectionAndCaptures select_to_next_word<false>(const Selection&);
template SelectionAndCaptures select_to_next_word<true>(const Selection&); template SelectionAndCaptures select_to_next_word<true>(const Selection&);
@ -104,13 +120,13 @@ template SelectionAndCaptures select_to_next_word<true>(const Selection&);
template<bool punctuation_is_word> template<bool punctuation_is_word>
SelectionAndCaptures select_to_next_word_end(const Selection& selection) SelectionAndCaptures select_to_next_word_end(const Selection& selection)
{ {
BufferIterator begin = selection.last(); Utf8Iterator begin = selection.last();
if (categorize<punctuation_is_word>(*begin) != if (categorize<punctuation_is_word>(*begin) !=
categorize<punctuation_is_word>(*(begin+1))) categorize<punctuation_is_word>(*(begin+1)))
++begin; ++begin;
skip_while(begin, is_eol); skip_while(begin, is_eol);
BufferIterator end = begin; Utf8Iterator end = begin;
skip_while(end, is_blank); skip_while(end, is_blank);
bool with_end = false; bool with_end = false;
@ -119,7 +135,7 @@ SelectionAndCaptures select_to_next_word_end(const Selection& selection)
else if (is_word<punctuation_is_word>(*end)) else if (is_word<punctuation_is_word>(*end))
with_end = skip_while(end, is_word<punctuation_is_word>); with_end = skip_while(end, is_word<punctuation_is_word>);
return Selection(begin, with_end ? end : end-1); return utf8_selection(begin, with_end ? end : end-1);
} }
template SelectionAndCaptures select_to_next_word_end<false>(const Selection&); template SelectionAndCaptures select_to_next_word_end<false>(const Selection&);
template SelectionAndCaptures select_to_next_word_end<true>(const Selection&); template SelectionAndCaptures select_to_next_word_end<true>(const Selection&);
@ -127,14 +143,14 @@ template SelectionAndCaptures select_to_next_word_end<true>(const Selection&);
template<bool punctuation_is_word> template<bool punctuation_is_word>
SelectionAndCaptures select_to_previous_word(const Selection& selection) SelectionAndCaptures select_to_previous_word(const Selection& selection)
{ {
BufferIterator begin = selection.last(); Utf8Iterator begin = selection.last();
if (categorize<punctuation_is_word>(*begin) != if (categorize<punctuation_is_word>(*begin) !=
categorize<punctuation_is_word>(*(begin-1))) categorize<punctuation_is_word>(*(begin-1)))
--begin; --begin;
skip_while_reverse(begin, is_eol); skip_while_reverse(begin, is_eol);
BufferIterator end = begin; Utf8Iterator end = begin;
skip_while_reverse(end, is_blank); skip_while_reverse(end, is_blank);
bool with_end = false; bool with_end = false;
@ -143,30 +159,30 @@ SelectionAndCaptures select_to_previous_word(const Selection& selection)
else if (is_word<punctuation_is_word>(*end)) else if (is_word<punctuation_is_word>(*end))
with_end = skip_while_reverse(end, is_word<punctuation_is_word>); with_end = skip_while_reverse(end, is_word<punctuation_is_word>);
return Selection(begin, with_end ? end : end+1); return utf8_selection(begin, with_end ? end : end+1);
} }
template SelectionAndCaptures select_to_previous_word<false>(const Selection&); template SelectionAndCaptures select_to_previous_word<false>(const Selection&);
template SelectionAndCaptures select_to_previous_word<true>(const Selection&); template SelectionAndCaptures select_to_previous_word<true>(const Selection&);
SelectionAndCaptures select_line(const Selection& selection) SelectionAndCaptures select_line(const Selection& selection)
{ {
BufferIterator first = selection.last(); Utf8Iterator first = selection.last();
if (*first == '\n' and not (first + 1).is_end()) if (*first == '\n' and not is_end(first + 1))
++first; ++first;
while (not first.is_begin() and *(first - 1) != '\n') while (not is_begin(first) and *(first - 1) != '\n')
--first; --first;
BufferIterator last = first; Utf8Iterator last = first;
while (not (last + 1).is_end() and *last != '\n') while (not is_end(last + 1) and *last != '\n')
++last; ++last;
return Selection(first, last); return utf8_selection(first, last);
} }
SelectionAndCaptures select_matching(const Selection& selection) SelectionAndCaptures select_matching(const Selection& selection)
{ {
std::vector<char> matching_pairs = { '(', ')', '{', '}', '[', ']', '<', '>' }; std::vector<char> matching_pairs = { '(', ')', '{', '}', '[', ']', '<', '>' };
BufferIterator it = selection.last(); Utf8Iterator it = selection.last();
std::vector<char>::iterator match = matching_pairs.end(); std::vector<char>::iterator match = matching_pairs.end();
while (not is_eol(*it)) while (not is_eol(*it))
{ {
@ -178,19 +194,19 @@ SelectionAndCaptures select_matching(const Selection& selection)
if (match == matching_pairs.end()) if (match == matching_pairs.end())
return selection; return selection;
BufferIterator begin = it; Utf8Iterator begin = it;
if (((match - matching_pairs.begin()) % 2) == 0) if (((match - matching_pairs.begin()) % 2) == 0)
{ {
int level = 0; int level = 0;
const char opening = *match; const char opening = *match;
const char closing = *(match+1); const char closing = *(match+1);
while (not it.is_end()) while (not is_end(it))
{ {
if (*it == opening) if (*it == opening)
++level; ++level;
else if (*it == closing and --level == 0) else if (*it == closing and --level == 0)
return Selection(begin, it); return utf8_selection(begin, it);
++it; ++it;
} }
@ -200,12 +216,12 @@ SelectionAndCaptures select_matching(const Selection& selection)
int level = 0; int level = 0;
const char opening = *(match-1); const char opening = *(match-1);
const char closing = *match; const char closing = *match;
while (not it.is_begin()) while (not is_begin(it))
{ {
if (*it == closing) if (*it == closing)
++level; ++level;
else if (*it == opening and --level == 0) else if (*it == opening and --level == 0)
return Selection(begin, it); return utf8_selection(begin, it);
--it; --it;
} }
} }
@ -213,12 +229,12 @@ SelectionAndCaptures select_matching(const Selection& selection)
} }
SelectionAndCaptures select_surrounding(const Selection& selection, SelectionAndCaptures select_surrounding(const Selection& selection,
const std::pair<char, char>& matching, const CodepointPair& matching,
bool inside) bool inside)
{ {
int level = 0; int level = 0;
BufferIterator first = selection.last(); Utf8Iterator first = selection.last();
while (not first.is_begin()) while (not is_begin(first))
{ {
if (first != selection.last() and *first == matching.second) if (first != selection.last() and *first == matching.second)
++level; ++level;
@ -235,8 +251,8 @@ SelectionAndCaptures select_surrounding(const Selection& selection,
return selection; return selection;
level = 0; level = 0;
BufferIterator last = first + 1; Utf8Iterator last = first + 1;
while (not last.is_end()) while (not is_end(last))
{ {
if (*last == matching.first) if (*last == matching.first)
++level; ++level;
@ -258,64 +274,64 @@ SelectionAndCaptures select_surrounding(const Selection& selection,
if (first != last) if (first != last)
--last; --last;
} }
return Selection(first, last); return utf8_selection(first, last);
} }
SelectionAndCaptures select_to(const Selection& selection, SelectionAndCaptures select_to(const Selection& selection,
char c, int count, bool inclusive) Codepoint c, int count, bool inclusive)
{ {
BufferIterator begin = selection.last(); Utf8Iterator begin = selection.last();
BufferIterator end = begin; Utf8Iterator end = begin;
do do
{ {
++end; ++end;
skip_while(end, [c](char cur) { return not is_eol(cur) and cur != c; }); skip_while(end, [c](Codepoint cur) { return not is_eol(cur) and cur != c; });
if (end.is_end() or is_eol(*end)) if (is_end(end) or is_eol(*end))
return selection; return selection;
} }
while (--count > 0); while (--count > 0);
return Selection(begin, inclusive ? end : end-1); return utf8_selection(begin, inclusive ? end : end-1);
} }
SelectionAndCaptures select_to_reverse(const Selection& selection, SelectionAndCaptures select_to_reverse(const Selection& selection,
char c, int count, bool inclusive) Codepoint c, int count, bool inclusive)
{ {
BufferIterator begin = selection.last(); Utf8Iterator begin = selection.last();
BufferIterator end = begin; Utf8Iterator end = begin;
do do
{ {
--end; --end;
skip_while_reverse(end, [c](char cur) { return not is_eol(cur) and cur != c; }); skip_while_reverse(end, [c](Codepoint cur) { return not is_eol(cur) and cur != c; });
if (end.is_begin() or is_eol(*end)) if (is_begin(end) or is_eol(*end))
return selection; return selection;
} }
while (--count > 0); while (--count > 0);
return Selection(begin, inclusive ? end : end+1); return utf8_selection(begin, inclusive ? end : end+1);
} }
SelectionAndCaptures select_to_eol(const Selection& selection) SelectionAndCaptures select_to_eol(const Selection& selection)
{ {
BufferIterator begin = selection.last(); Utf8Iterator begin = selection.last();
BufferIterator end = begin + 1; Utf8Iterator end = begin + 1;
skip_while(end, [](char cur) { return not is_eol(cur); }); skip_while(end, [](Codepoint cur) { return not is_eol(cur); });
return Selection(begin, end-1); return utf8_selection(begin, end-1);
} }
SelectionAndCaptures select_to_eol_reverse(const Selection& selection) SelectionAndCaptures select_to_eol_reverse(const Selection& selection)
{ {
BufferIterator begin = selection.last(); Utf8Iterator begin = selection.last();
BufferIterator end = begin - 1; Utf8Iterator end = begin - 1;
skip_while_reverse(end, [](char cur) { return not is_eol(cur); }); skip_while_reverse(end, [](Codepoint cur) { return not is_eol(cur); });
return Selection(begin, end.is_begin() ? end : end+1); return utf8_selection(begin, is_begin(end) ? end : end+1);
} }
template<bool punctuation_is_word> template<bool punctuation_is_word>
SelectionAndCaptures select_whole_word(const Selection& selection, bool inner) SelectionAndCaptures select_whole_word(const Selection& selection, bool inner)
{ {
BufferIterator first = selection.last(); Utf8Iterator first = selection.last();
BufferIterator last = first; Utf8Iterator last = first;
if (is_word(*first)) if (is_word(*first))
{ {
if (not skip_while_reverse(first, is_word<punctuation_is_word>)) if (not skip_while_reverse(first, is_word<punctuation_is_word>))
@ -334,13 +350,14 @@ SelectionAndCaptures select_whole_word(const Selection& selection, bool inner)
skip_while(last, is_word<punctuation_is_word>); skip_while(last, is_word<punctuation_is_word>);
} }
--last; --last;
return Selection(first, last); return utf8_selection(first, last);
} }
template SelectionAndCaptures select_whole_word<false>(const Selection&, bool); template SelectionAndCaptures select_whole_word<false>(const Selection&, bool);
template SelectionAndCaptures select_whole_word<true>(const Selection&, bool); template SelectionAndCaptures select_whole_word<true>(const Selection&, bool);
SelectionAndCaptures select_whole_lines(const Selection& selection) SelectionAndCaptures select_whole_lines(const Selection& selection)
{ {
// no need to be utf8 aware for is_eol as we only use \n as line seperator
BufferIterator first = selection.first(); BufferIterator first = selection.first();
BufferIterator last = selection.last(); BufferIterator last = selection.last();
BufferIterator& to_line_start = first <= last ? first : last; BufferIterator& to_line_start = first <= last ? first : last;
@ -352,14 +369,13 @@ SelectionAndCaptures select_whole_lines(const Selection& selection)
skip_while(to_line_end, [](char cur) { return not is_eol(cur); }); skip_while(to_line_end, [](char cur) { return not is_eol(cur); });
return Selection(first, last); return Selection(first, last);
} }
SelectionAndCaptures select_whole_buffer(const Selection& selection) SelectionAndCaptures select_whole_buffer(const Selection& selection)
{ {
const Buffer& buffer = selection.first().buffer(); const Buffer& buffer = selection.first().buffer();
return Selection(buffer.begin(), buffer.end()-1); return Selection(buffer.begin(), utf8::previous(buffer.end()));
} }
SelectionAndCaptures select_next_match(const Selection& selection, SelectionAndCaptures select_next_match(const Selection& selection,
@ -367,6 +383,8 @@ SelectionAndCaptures select_next_match(const Selection& selection,
{ {
try try
{ {
// regex matching do not use Utf8Iterator as boost::regex handle utf8
// decoding itself
BufferIterator begin = selection.last(); BufferIterator begin = selection.last();
BufferIterator end = begin; BufferIterator end = begin;
CaptureList captures; CaptureList captures;
@ -374,7 +392,7 @@ SelectionAndCaptures select_next_match(const Selection& selection,
Regex ex(regex.begin(), regex.end()); Regex ex(regex.begin(), regex.end());
boost::match_results<BufferIterator> matches; boost::match_results<BufferIterator> matches;
if (boost::regex_search(begin+1, begin.buffer().end(), if (boost::regex_search(utf8::next(begin), begin.buffer().end(),
matches, ex)) matches, ex))
{ {
begin = matches[0].first; begin = matches[0].first;
@ -382,7 +400,7 @@ SelectionAndCaptures select_next_match(const Selection& selection,
for (auto& match : matches) for (auto& match : matches)
captures.push_back(String(match.first, match.second)); captures.push_back(String(match.first, match.second));
} }
else if (boost::regex_search(begin.buffer().begin(), begin+1, else if (boost::regex_search(begin.buffer().begin(), utf8::next(begin),
matches, ex)) matches, ex))
{ {
begin = matches[0].first; begin = matches[0].first;
@ -396,7 +414,7 @@ SelectionAndCaptures select_next_match(const Selection& selection,
if (begin == end) if (begin == end)
++end; ++end;
return SelectionAndCaptures(begin, end - 1, std::move(captures)); return SelectionAndCaptures(begin, utf8::previous(end), std::move(captures));
} }
catch (boost::regex_error& err) catch (boost::regex_error& err)
{ {
@ -427,7 +445,7 @@ SelectionAndCapturesList select_all_matches(const Selection& selection,
captures.push_back(String(match.first, match.second)); captures.push_back(String(match.first, match.second));
result.push_back(SelectionAndCaptures(begin, result.push_back(SelectionAndCaptures(begin,
begin == end ? end : end-1, begin == end ? end : utf8::previous(end),
std::move(captures))); std::move(captures)));
} }
return result; return result;
@ -454,7 +472,7 @@ SelectionAndCapturesList split_selection(const Selection& selection,
{ {
BufferIterator end = (*re_it)[0].first; BufferIterator end = (*re_it)[0].first;
result.push_back(Selection(begin, (begin == end) ? end : end-1)); result.push_back(Selection(begin, (begin == end) ? end : utf8::previous(end)));
begin = (*re_it)[0].second; begin = (*re_it)[0].second;
} }
result.push_back(Selection(begin, selection.last())); result.push_back(Selection(begin, selection.last()));

View File

@ -2,6 +2,7 @@
#define selectors_hh_INCLUDED #define selectors_hh_INCLUDED
#include "selection.hh" #include "selection.hh"
#include "utf8.hh"
namespace Kakoune namespace Kakoune
{ {
@ -15,14 +16,19 @@ SelectionAndCaptures select_to_previous_word(const Selection& selection);
SelectionAndCaptures select_line(const Selection& selection); SelectionAndCaptures select_line(const Selection& selection);
SelectionAndCaptures select_matching(const Selection& selection); SelectionAndCaptures select_matching(const Selection& selection);
using CodepointPair = std::pair<utf8::Codepoint, utf8::Codepoint>;
SelectionAndCaptures select_surrounding(const Selection& selection, SelectionAndCaptures select_surrounding(const Selection& selection,
const std::pair<char, char>& matching, const CodepointPair& matching,
bool inside); bool inside);
SelectionAndCaptures select_to(const Selection& selection, SelectionAndCaptures select_to(const Selection& selection,
char c, int count, bool inclusive); utf8::Codepoint c,
int count, bool inclusive);
SelectionAndCaptures select_to_reverse(const Selection& selection, SelectionAndCaptures select_to_reverse(const Selection& selection,
char c, int count, bool inclusive); utf8::Codepoint c,
int count, bool inclusive);
SelectionAndCaptures select_to_eol(const Selection& selection); SelectionAndCaptures select_to_eol(const Selection& selection);
SelectionAndCaptures select_to_eol_reverse(const Selection& selection); SelectionAndCaptures select_to_eol_reverse(const Selection& selection);