utf8 awareness in selectors

This commit is contained in:
Maxime Coste 2012-10-08 19:12:09 +02:00
parent ed592a3005
commit 0a686573ab
2 changed files with 99 additions and 75 deletions

View File

@ -2,37 +2,42 @@
#include "string.hh"
#include "utf8_iterator.hh"
#include <algorithm>
namespace Kakoune
{
using utf8::Codepoint;
using Utf8Iterator = utf8::utf8_iterator<BufferIterator>;
namespace
{
bool is_eol(char c)
bool is_eol(Codepoint c)
{
return c == '\n';
}
bool is_blank(char c)
bool is_blank(Codepoint c)
{
return c == ' ' or c == '\t';
}
template<bool punctuation_is_word = false>
bool is_word(char c)
bool is_word(Codepoint c)
{
return Kakoune::is_word(c);
}
template<>
bool is_word<true>(char c)
bool is_word<true>(Codepoint c)
{
return !is_blank(c) and !is_eol(c);
}
static bool is_punctuation(char c)
static bool is_punctuation(Codepoint c)
{
return not (is_word(c) or is_blank(c) or is_eol(c));
}
@ -46,7 +51,7 @@ enum class CharCategories
};
template<bool punctuation_is_word = false>
CharCategories categorize(char c)
CharCategories categorize(Codepoint c)
{
if (is_word(c))
return CharCategories::Word;
@ -58,20 +63,31 @@ CharCategories categorize(char c)
: CharCategories::Punctuation;
}
template<typename T>
bool skip_while(BufferIterator& it, T condition)
bool is_begin(const BufferIterator& it) { return it.is_begin(); }
bool is_end(const BufferIterator& it) { return it.is_end(); }
bool is_begin(const Utf8Iterator& it) { return it.underlying_iterator().is_begin(); }
bool is_end(const Utf8Iterator& it) { return it.underlying_iterator().is_end(); }
template<typename Iterator, typename T>
bool skip_while(Iterator& it, T condition)
{
while (not it.is_end() and condition(*it))
while (not is_end(it) and condition(*it))
++it;
return not it.is_end() and condition(*it);
return not is_end(it) and condition(*it);
}
template<typename T>
bool skip_while_reverse(BufferIterator& it, T condition)
template<typename Iterator, typename T>
bool skip_while_reverse(Iterator& it, T condition)
{
while (not it.is_begin() and condition(*it))
while (not is_begin(it) and condition(*it))
--it;
return not it.is_end() and condition(*it);
return not is_end(it) and condition(*it);
}
Selection utf8_selection(const Utf8Iterator& first, const Utf8Iterator& last)
{
return Selection(first.underlying_iterator(), last.underlying_iterator());
}
}
@ -81,13 +97,13 @@ typedef boost::regex_iterator<BufferIterator> RegexIterator;
template<bool punctuation_is_word>
SelectionAndCaptures select_to_next_word(const Selection& selection)
{
BufferIterator begin = selection.last();
Utf8Iterator begin = selection.last();
if (categorize<punctuation_is_word>(*begin) !=
categorize<punctuation_is_word>(*(begin+1)))
++begin;
skip_while(begin, is_eol);
BufferIterator end = begin+1;
Utf8Iterator end = begin+1;
if (not punctuation_is_word and is_punctuation(*begin))
skip_while(end, is_punctuation);
@ -96,7 +112,7 @@ SelectionAndCaptures select_to_next_word(const Selection& selection)
bool with_end = skip_while(end, is_blank);
return Selection(begin, with_end ? end : end-1);
return utf8_selection(begin, with_end ? end : end-1);
}
template SelectionAndCaptures select_to_next_word<false>(const Selection&);
template SelectionAndCaptures select_to_next_word<true>(const Selection&);
@ -104,13 +120,13 @@ template SelectionAndCaptures select_to_next_word<true>(const Selection&);
template<bool punctuation_is_word>
SelectionAndCaptures select_to_next_word_end(const Selection& selection)
{
BufferIterator begin = selection.last();
Utf8Iterator begin = selection.last();
if (categorize<punctuation_is_word>(*begin) !=
categorize<punctuation_is_word>(*(begin+1)))
++begin;
skip_while(begin, is_eol);
BufferIterator end = begin;
Utf8Iterator end = begin;
skip_while(end, is_blank);
bool with_end = false;
@ -119,7 +135,7 @@ SelectionAndCaptures select_to_next_word_end(const Selection& selection)
else if (is_word<punctuation_is_word>(*end))
with_end = skip_while(end, is_word<punctuation_is_word>);
return Selection(begin, with_end ? end : end-1);
return utf8_selection(begin, with_end ? end : end-1);
}
template SelectionAndCaptures select_to_next_word_end<false>(const Selection&);
template SelectionAndCaptures select_to_next_word_end<true>(const Selection&);
@ -127,14 +143,14 @@ template SelectionAndCaptures select_to_next_word_end<true>(const Selection&);
template<bool punctuation_is_word>
SelectionAndCaptures select_to_previous_word(const Selection& selection)
{
BufferIterator begin = selection.last();
Utf8Iterator begin = selection.last();
if (categorize<punctuation_is_word>(*begin) !=
categorize<punctuation_is_word>(*(begin-1)))
--begin;
skip_while_reverse(begin, is_eol);
BufferIterator end = begin;
Utf8Iterator end = begin;
skip_while_reverse(end, is_blank);
bool with_end = false;
@ -143,30 +159,30 @@ SelectionAndCaptures select_to_previous_word(const Selection& selection)
else if (is_word<punctuation_is_word>(*end))
with_end = skip_while_reverse(end, is_word<punctuation_is_word>);
return Selection(begin, with_end ? end : end+1);
return utf8_selection(begin, with_end ? end : end+1);
}
template SelectionAndCaptures select_to_previous_word<false>(const Selection&);
template SelectionAndCaptures select_to_previous_word<true>(const Selection&);
SelectionAndCaptures select_line(const Selection& selection)
{
BufferIterator first = selection.last();
if (*first == '\n' and not (first + 1).is_end())
Utf8Iterator first = selection.last();
if (*first == '\n' and not is_end(first + 1))
++first;
while (not first.is_begin() and *(first - 1) != '\n')
while (not is_begin(first) and *(first - 1) != '\n')
--first;
BufferIterator last = first;
while (not (last + 1).is_end() and *last != '\n')
Utf8Iterator last = first;
while (not is_end(last + 1) and *last != '\n')
++last;
return Selection(first, last);
return utf8_selection(first, last);
}
SelectionAndCaptures select_matching(const Selection& selection)
{
std::vector<char> matching_pairs = { '(', ')', '{', '}', '[', ']', '<', '>' };
BufferIterator it = selection.last();
Utf8Iterator it = selection.last();
std::vector<char>::iterator match = matching_pairs.end();
while (not is_eol(*it))
{
@ -178,19 +194,19 @@ SelectionAndCaptures select_matching(const Selection& selection)
if (match == matching_pairs.end())
return selection;
BufferIterator begin = it;
Utf8Iterator begin = it;
if (((match - matching_pairs.begin()) % 2) == 0)
{
int level = 0;
const char opening = *match;
const char closing = *(match+1);
while (not it.is_end())
while (not is_end(it))
{
if (*it == opening)
++level;
else if (*it == closing and --level == 0)
return Selection(begin, it);
return utf8_selection(begin, it);
++it;
}
@ -200,12 +216,12 @@ SelectionAndCaptures select_matching(const Selection& selection)
int level = 0;
const char opening = *(match-1);
const char closing = *match;
while (not it.is_begin())
while (not is_begin(it))
{
if (*it == closing)
++level;
else if (*it == opening and --level == 0)
return Selection(begin, it);
return utf8_selection(begin, it);
--it;
}
}
@ -213,12 +229,12 @@ SelectionAndCaptures select_matching(const Selection& selection)
}
SelectionAndCaptures select_surrounding(const Selection& selection,
const std::pair<char, char>& matching,
bool inside)
const CodepointPair& matching,
bool inside)
{
int level = 0;
BufferIterator first = selection.last();
while (not first.is_begin())
Utf8Iterator first = selection.last();
while (not is_begin(first))
{
if (first != selection.last() and *first == matching.second)
++level;
@ -235,8 +251,8 @@ SelectionAndCaptures select_surrounding(const Selection& selection,
return selection;
level = 0;
BufferIterator last = first + 1;
while (not last.is_end())
Utf8Iterator last = first + 1;
while (not is_end(last))
{
if (*last == matching.first)
++level;
@ -258,64 +274,64 @@ SelectionAndCaptures select_surrounding(const Selection& selection,
if (first != last)
--last;
}
return Selection(first, last);
return utf8_selection(first, last);
}
SelectionAndCaptures select_to(const Selection& selection,
char c, int count, bool inclusive)
Codepoint c, int count, bool inclusive)
{
BufferIterator begin = selection.last();
BufferIterator end = begin;
Utf8Iterator begin = selection.last();
Utf8Iterator end = begin;
do
{
++end;
skip_while(end, [c](char cur) { return not is_eol(cur) and cur != c; });
if (end.is_end() or is_eol(*end))
skip_while(end, [c](Codepoint cur) { return not is_eol(cur) and cur != c; });
if (is_end(end) or is_eol(*end))
return selection;
}
while (--count > 0);
return Selection(begin, inclusive ? end : end-1);
return utf8_selection(begin, inclusive ? end : end-1);
}
SelectionAndCaptures select_to_reverse(const Selection& selection,
char c, int count, bool inclusive)
Codepoint c, int count, bool inclusive)
{
BufferIterator begin = selection.last();
BufferIterator end = begin;
Utf8Iterator begin = selection.last();
Utf8Iterator end = begin;
do
{
--end;
skip_while_reverse(end, [c](char cur) { return not is_eol(cur) and cur != c; });
if (end.is_begin() or is_eol(*end))
skip_while_reverse(end, [c](Codepoint cur) { return not is_eol(cur) and cur != c; });
if (is_begin(end) or is_eol(*end))
return selection;
}
while (--count > 0);
return Selection(begin, inclusive ? end : end+1);
return utf8_selection(begin, inclusive ? end : end+1);
}
SelectionAndCaptures select_to_eol(const Selection& selection)
{
BufferIterator begin = selection.last();
BufferIterator end = begin + 1;
skip_while(end, [](char cur) { return not is_eol(cur); });
return Selection(begin, end-1);
Utf8Iterator begin = selection.last();
Utf8Iterator end = begin + 1;
skip_while(end, [](Codepoint cur) { return not is_eol(cur); });
return utf8_selection(begin, end-1);
}
SelectionAndCaptures select_to_eol_reverse(const Selection& selection)
{
BufferIterator begin = selection.last();
BufferIterator end = begin - 1;
skip_while_reverse(end, [](char cur) { return not is_eol(cur); });
return Selection(begin, end.is_begin() ? end : end+1);
Utf8Iterator begin = selection.last();
Utf8Iterator end = begin - 1;
skip_while_reverse(end, [](Codepoint cur) { return not is_eol(cur); });
return utf8_selection(begin, is_begin(end) ? end : end+1);
}
template<bool punctuation_is_word>
SelectionAndCaptures select_whole_word(const Selection& selection, bool inner)
{
BufferIterator first = selection.last();
BufferIterator last = first;
Utf8Iterator first = selection.last();
Utf8Iterator last = first;
if (is_word(*first))
{
if (not skip_while_reverse(first, is_word<punctuation_is_word>))
@ -334,13 +350,14 @@ SelectionAndCaptures select_whole_word(const Selection& selection, bool inner)
skip_while(last, is_word<punctuation_is_word>);
}
--last;
return Selection(first, last);
return utf8_selection(first, last);
}
template SelectionAndCaptures select_whole_word<false>(const Selection&, bool);
template SelectionAndCaptures select_whole_word<true>(const Selection&, bool);
SelectionAndCaptures select_whole_lines(const Selection& selection)
{
// no need to be utf8 aware for is_eol as we only use \n as line seperator
BufferIterator first = selection.first();
BufferIterator last = selection.last();
BufferIterator& to_line_start = first <= last ? first : last;
@ -352,14 +369,13 @@ SelectionAndCaptures select_whole_lines(const Selection& selection)
skip_while(to_line_end, [](char cur) { return not is_eol(cur); });
return Selection(first, last);
}
SelectionAndCaptures select_whole_buffer(const Selection& selection)
{
const Buffer& buffer = selection.first().buffer();
return Selection(buffer.begin(), buffer.end()-1);
return Selection(buffer.begin(), utf8::previous(buffer.end()));
}
SelectionAndCaptures select_next_match(const Selection& selection,
@ -367,6 +383,8 @@ SelectionAndCaptures select_next_match(const Selection& selection,
{
try
{
// regex matching do not use Utf8Iterator as boost::regex handle utf8
// decoding itself
BufferIterator begin = selection.last();
BufferIterator end = begin;
CaptureList captures;
@ -374,7 +392,7 @@ SelectionAndCaptures select_next_match(const Selection& selection,
Regex ex(regex.begin(), regex.end());
boost::match_results<BufferIterator> matches;
if (boost::regex_search(begin+1, begin.buffer().end(),
if (boost::regex_search(utf8::next(begin), begin.buffer().end(),
matches, ex))
{
begin = matches[0].first;
@ -382,7 +400,7 @@ SelectionAndCaptures select_next_match(const Selection& selection,
for (auto& match : matches)
captures.push_back(String(match.first, match.second));
}
else if (boost::regex_search(begin.buffer().begin(), begin+1,
else if (boost::regex_search(begin.buffer().begin(), utf8::next(begin),
matches, ex))
{
begin = matches[0].first;
@ -396,7 +414,7 @@ SelectionAndCaptures select_next_match(const Selection& selection,
if (begin == end)
++end;
return SelectionAndCaptures(begin, end - 1, std::move(captures));
return SelectionAndCaptures(begin, utf8::previous(end), std::move(captures));
}
catch (boost::regex_error& err)
{
@ -427,7 +445,7 @@ SelectionAndCapturesList select_all_matches(const Selection& selection,
captures.push_back(String(match.first, match.second));
result.push_back(SelectionAndCaptures(begin,
begin == end ? end : end-1,
begin == end ? end : utf8::previous(end),
std::move(captures)));
}
return result;
@ -454,7 +472,7 @@ SelectionAndCapturesList split_selection(const Selection& selection,
{
BufferIterator end = (*re_it)[0].first;
result.push_back(Selection(begin, (begin == end) ? end : end-1));
result.push_back(Selection(begin, (begin == end) ? end : utf8::previous(end)));
begin = (*re_it)[0].second;
}
result.push_back(Selection(begin, selection.last()));

View File

@ -2,6 +2,7 @@
#define selectors_hh_INCLUDED
#include "selection.hh"
#include "utf8.hh"
namespace Kakoune
{
@ -15,14 +16,19 @@ SelectionAndCaptures select_to_previous_word(const Selection& selection);
SelectionAndCaptures select_line(const Selection& selection);
SelectionAndCaptures select_matching(const Selection& selection);
using CodepointPair = std::pair<utf8::Codepoint, utf8::Codepoint>;
SelectionAndCaptures select_surrounding(const Selection& selection,
const std::pair<char, char>& matching,
const CodepointPair& matching,
bool inside);
SelectionAndCaptures select_to(const Selection& selection,
char c, int count, bool inclusive);
utf8::Codepoint c,
int count, bool inclusive);
SelectionAndCaptures select_to_reverse(const Selection& selection,
char c, int count, bool inclusive);
utf8::Codepoint c,
int count, bool inclusive);
SelectionAndCaptures select_to_eol(const Selection& selection);
SelectionAndCaptures select_to_eol_reverse(const Selection& selection);