From 0a686573abe217b7bd38ae252f5673c300ab4adc Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Mon, 8 Oct 2012 19:12:09 +0200 Subject: [PATCH] utf8 awareness in selectors --- src/selectors.cc | 162 ++++++++++++++++++++++++++--------------------- src/selectors.hh | 12 +++- 2 files changed, 99 insertions(+), 75 deletions(-) diff --git a/src/selectors.cc b/src/selectors.cc index 2ac1168c..bed7d23e 100644 --- a/src/selectors.cc +++ b/src/selectors.cc @@ -2,37 +2,42 @@ #include "string.hh" +#include "utf8_iterator.hh" + #include namespace Kakoune { +using utf8::Codepoint; +using Utf8Iterator = utf8::utf8_iterator; + namespace { -bool is_eol(char c) +bool is_eol(Codepoint c) { return c == '\n'; } -bool is_blank(char c) +bool is_blank(Codepoint c) { return c == ' ' or c == '\t'; } template -bool is_word(char c) +bool is_word(Codepoint c) { return Kakoune::is_word(c); } template<> -bool is_word(char c) +bool is_word(Codepoint c) { return !is_blank(c) and !is_eol(c); } -static bool is_punctuation(char c) +static bool is_punctuation(Codepoint c) { return not (is_word(c) or is_blank(c) or is_eol(c)); } @@ -46,7 +51,7 @@ enum class CharCategories }; template -CharCategories categorize(char c) +CharCategories categorize(Codepoint c) { if (is_word(c)) return CharCategories::Word; @@ -58,20 +63,31 @@ CharCategories categorize(char c) : CharCategories::Punctuation; } -template -bool skip_while(BufferIterator& it, T condition) +bool is_begin(const BufferIterator& it) { return it.is_begin(); } +bool is_end(const BufferIterator& it) { return it.is_end(); } + +bool is_begin(const Utf8Iterator& it) { return it.underlying_iterator().is_begin(); } +bool is_end(const Utf8Iterator& it) { return it.underlying_iterator().is_end(); } + +template +bool skip_while(Iterator& it, T condition) { - while (not it.is_end() and condition(*it)) + while (not is_end(it) and condition(*it)) ++it; - return not it.is_end() and condition(*it); + return not is_end(it) and condition(*it); } -template -bool skip_while_reverse(BufferIterator& it, T condition) +template +bool skip_while_reverse(Iterator& it, T condition) { - while (not it.is_begin() and condition(*it)) + while (not is_begin(it) and condition(*it)) --it; - return not it.is_end() and condition(*it); + return not is_end(it) and condition(*it); +} + +Selection utf8_selection(const Utf8Iterator& first, const Utf8Iterator& last) +{ + return Selection(first.underlying_iterator(), last.underlying_iterator()); } } @@ -81,13 +97,13 @@ typedef boost::regex_iterator RegexIterator; template SelectionAndCaptures select_to_next_word(const Selection& selection) { - BufferIterator begin = selection.last(); + Utf8Iterator begin = selection.last(); if (categorize(*begin) != categorize(*(begin+1))) ++begin; skip_while(begin, is_eol); - BufferIterator end = begin+1; + Utf8Iterator end = begin+1; if (not punctuation_is_word and is_punctuation(*begin)) skip_while(end, is_punctuation); @@ -96,7 +112,7 @@ SelectionAndCaptures select_to_next_word(const Selection& selection) bool with_end = skip_while(end, is_blank); - return Selection(begin, with_end ? end : end-1); + return utf8_selection(begin, with_end ? end : end-1); } template SelectionAndCaptures select_to_next_word(const Selection&); template SelectionAndCaptures select_to_next_word(const Selection&); @@ -104,13 +120,13 @@ template SelectionAndCaptures select_to_next_word(const Selection&); template SelectionAndCaptures select_to_next_word_end(const Selection& selection) { - BufferIterator begin = selection.last(); + Utf8Iterator begin = selection.last(); if (categorize(*begin) != categorize(*(begin+1))) ++begin; skip_while(begin, is_eol); - BufferIterator end = begin; + Utf8Iterator end = begin; skip_while(end, is_blank); bool with_end = false; @@ -119,7 +135,7 @@ SelectionAndCaptures select_to_next_word_end(const Selection& selection) else if (is_word(*end)) with_end = skip_while(end, is_word); - return Selection(begin, with_end ? end : end-1); + return utf8_selection(begin, with_end ? end : end-1); } template SelectionAndCaptures select_to_next_word_end(const Selection&); template SelectionAndCaptures select_to_next_word_end(const Selection&); @@ -127,14 +143,14 @@ template SelectionAndCaptures select_to_next_word_end(const Selection&); template SelectionAndCaptures select_to_previous_word(const Selection& selection) { - BufferIterator begin = selection.last(); + Utf8Iterator begin = selection.last(); if (categorize(*begin) != categorize(*(begin-1))) --begin; skip_while_reverse(begin, is_eol); - BufferIterator end = begin; + Utf8Iterator end = begin; skip_while_reverse(end, is_blank); bool with_end = false; @@ -143,30 +159,30 @@ SelectionAndCaptures select_to_previous_word(const Selection& selection) else if (is_word(*end)) with_end = skip_while_reverse(end, is_word); - return Selection(begin, with_end ? end : end+1); + return utf8_selection(begin, with_end ? end : end+1); } template SelectionAndCaptures select_to_previous_word(const Selection&); template SelectionAndCaptures select_to_previous_word(const Selection&); SelectionAndCaptures select_line(const Selection& selection) { - BufferIterator first = selection.last(); - if (*first == '\n' and not (first + 1).is_end()) + Utf8Iterator first = selection.last(); + if (*first == '\n' and not is_end(first + 1)) ++first; - while (not first.is_begin() and *(first - 1) != '\n') + while (not is_begin(first) and *(first - 1) != '\n') --first; - BufferIterator last = first; - while (not (last + 1).is_end() and *last != '\n') + Utf8Iterator last = first; + while (not is_end(last + 1) and *last != '\n') ++last; - return Selection(first, last); + return utf8_selection(first, last); } SelectionAndCaptures select_matching(const Selection& selection) { std::vector matching_pairs = { '(', ')', '{', '}', '[', ']', '<', '>' }; - BufferIterator it = selection.last(); + Utf8Iterator it = selection.last(); std::vector::iterator match = matching_pairs.end(); while (not is_eol(*it)) { @@ -178,19 +194,19 @@ SelectionAndCaptures select_matching(const Selection& selection) if (match == matching_pairs.end()) return selection; - BufferIterator begin = it; + Utf8Iterator begin = it; if (((match - matching_pairs.begin()) % 2) == 0) { int level = 0; const char opening = *match; const char closing = *(match+1); - while (not it.is_end()) + while (not is_end(it)) { if (*it == opening) ++level; else if (*it == closing and --level == 0) - return Selection(begin, it); + return utf8_selection(begin, it); ++it; } @@ -200,12 +216,12 @@ SelectionAndCaptures select_matching(const Selection& selection) int level = 0; const char opening = *(match-1); const char closing = *match; - while (not it.is_begin()) + while (not is_begin(it)) { if (*it == closing) ++level; else if (*it == opening and --level == 0) - return Selection(begin, it); + return utf8_selection(begin, it); --it; } } @@ -213,12 +229,12 @@ SelectionAndCaptures select_matching(const Selection& selection) } SelectionAndCaptures select_surrounding(const Selection& selection, - const std::pair& matching, - bool inside) + const CodepointPair& matching, + bool inside) { int level = 0; - BufferIterator first = selection.last(); - while (not first.is_begin()) + Utf8Iterator first = selection.last(); + while (not is_begin(first)) { if (first != selection.last() and *first == matching.second) ++level; @@ -235,8 +251,8 @@ SelectionAndCaptures select_surrounding(const Selection& selection, return selection; level = 0; - BufferIterator last = first + 1; - while (not last.is_end()) + Utf8Iterator last = first + 1; + while (not is_end(last)) { if (*last == matching.first) ++level; @@ -258,64 +274,64 @@ SelectionAndCaptures select_surrounding(const Selection& selection, if (first != last) --last; } - return Selection(first, last); + return utf8_selection(first, last); } SelectionAndCaptures select_to(const Selection& selection, - char c, int count, bool inclusive) + Codepoint c, int count, bool inclusive) { - BufferIterator begin = selection.last(); - BufferIterator end = begin; + Utf8Iterator begin = selection.last(); + Utf8Iterator end = begin; do { ++end; - skip_while(end, [c](char cur) { return not is_eol(cur) and cur != c; }); - if (end.is_end() or is_eol(*end)) + skip_while(end, [c](Codepoint cur) { return not is_eol(cur) and cur != c; }); + if (is_end(end) or is_eol(*end)) return selection; } while (--count > 0); - return Selection(begin, inclusive ? end : end-1); + return utf8_selection(begin, inclusive ? end : end-1); } SelectionAndCaptures select_to_reverse(const Selection& selection, - char c, int count, bool inclusive) + Codepoint c, int count, bool inclusive) { - BufferIterator begin = selection.last(); - BufferIterator end = begin; + Utf8Iterator begin = selection.last(); + Utf8Iterator end = begin; do { --end; - skip_while_reverse(end, [c](char cur) { return not is_eol(cur) and cur != c; }); - if (end.is_begin() or is_eol(*end)) + skip_while_reverse(end, [c](Codepoint cur) { return not is_eol(cur) and cur != c; }); + if (is_begin(end) or is_eol(*end)) return selection; } while (--count > 0); - return Selection(begin, inclusive ? end : end+1); + return utf8_selection(begin, inclusive ? end : end+1); } SelectionAndCaptures select_to_eol(const Selection& selection) { - BufferIterator begin = selection.last(); - BufferIterator end = begin + 1; - skip_while(end, [](char cur) { return not is_eol(cur); }); - return Selection(begin, end-1); + Utf8Iterator begin = selection.last(); + Utf8Iterator end = begin + 1; + skip_while(end, [](Codepoint cur) { return not is_eol(cur); }); + return utf8_selection(begin, end-1); } SelectionAndCaptures select_to_eol_reverse(const Selection& selection) { - BufferIterator begin = selection.last(); - BufferIterator end = begin - 1; - skip_while_reverse(end, [](char cur) { return not is_eol(cur); }); - return Selection(begin, end.is_begin() ? end : end+1); + Utf8Iterator begin = selection.last(); + Utf8Iterator end = begin - 1; + skip_while_reverse(end, [](Codepoint cur) { return not is_eol(cur); }); + return utf8_selection(begin, is_begin(end) ? end : end+1); } template SelectionAndCaptures select_whole_word(const Selection& selection, bool inner) { - BufferIterator first = selection.last(); - BufferIterator last = first; + Utf8Iterator first = selection.last(); + Utf8Iterator last = first; if (is_word(*first)) { if (not skip_while_reverse(first, is_word)) @@ -334,13 +350,14 @@ SelectionAndCaptures select_whole_word(const Selection& selection, bool inner) skip_while(last, is_word); } --last; - return Selection(first, last); + return utf8_selection(first, last); } template SelectionAndCaptures select_whole_word(const Selection&, bool); template SelectionAndCaptures select_whole_word(const Selection&, bool); SelectionAndCaptures select_whole_lines(const Selection& selection) { + // no need to be utf8 aware for is_eol as we only use \n as line seperator BufferIterator first = selection.first(); BufferIterator last = selection.last(); BufferIterator& to_line_start = first <= last ? first : last; @@ -352,14 +369,13 @@ SelectionAndCaptures select_whole_lines(const Selection& selection) skip_while(to_line_end, [](char cur) { return not is_eol(cur); }); - return Selection(first, last); } SelectionAndCaptures select_whole_buffer(const Selection& selection) { const Buffer& buffer = selection.first().buffer(); - return Selection(buffer.begin(), buffer.end()-1); + return Selection(buffer.begin(), utf8::previous(buffer.end())); } SelectionAndCaptures select_next_match(const Selection& selection, @@ -367,6 +383,8 @@ SelectionAndCaptures select_next_match(const Selection& selection, { try { + // regex matching do not use Utf8Iterator as boost::regex handle utf8 + // decoding itself BufferIterator begin = selection.last(); BufferIterator end = begin; CaptureList captures; @@ -374,7 +392,7 @@ SelectionAndCaptures select_next_match(const Selection& selection, Regex ex(regex.begin(), regex.end()); boost::match_results matches; - if (boost::regex_search(begin+1, begin.buffer().end(), + if (boost::regex_search(utf8::next(begin), begin.buffer().end(), matches, ex)) { begin = matches[0].first; @@ -382,7 +400,7 @@ SelectionAndCaptures select_next_match(const Selection& selection, for (auto& match : matches) captures.push_back(String(match.first, match.second)); } - else if (boost::regex_search(begin.buffer().begin(), begin+1, + else if (boost::regex_search(begin.buffer().begin(), utf8::next(begin), matches, ex)) { begin = matches[0].first; @@ -396,7 +414,7 @@ SelectionAndCaptures select_next_match(const Selection& selection, if (begin == end) ++end; - return SelectionAndCaptures(begin, end - 1, std::move(captures)); + return SelectionAndCaptures(begin, utf8::previous(end), std::move(captures)); } catch (boost::regex_error& err) { @@ -427,7 +445,7 @@ SelectionAndCapturesList select_all_matches(const Selection& selection, captures.push_back(String(match.first, match.second)); result.push_back(SelectionAndCaptures(begin, - begin == end ? end : end-1, + begin == end ? end : utf8::previous(end), std::move(captures))); } return result; @@ -454,7 +472,7 @@ SelectionAndCapturesList split_selection(const Selection& selection, { BufferIterator end = (*re_it)[0].first; - result.push_back(Selection(begin, (begin == end) ? end : end-1)); + result.push_back(Selection(begin, (begin == end) ? end : utf8::previous(end))); begin = (*re_it)[0].second; } result.push_back(Selection(begin, selection.last())); diff --git a/src/selectors.hh b/src/selectors.hh index b9acd08e..fc6a628a 100644 --- a/src/selectors.hh +++ b/src/selectors.hh @@ -2,6 +2,7 @@ #define selectors_hh_INCLUDED #include "selection.hh" +#include "utf8.hh" namespace Kakoune { @@ -15,14 +16,19 @@ SelectionAndCaptures select_to_previous_word(const Selection& selection); SelectionAndCaptures select_line(const Selection& selection); SelectionAndCaptures select_matching(const Selection& selection); + + +using CodepointPair = std::pair; SelectionAndCaptures select_surrounding(const Selection& selection, - const std::pair& matching, + const CodepointPair& matching, bool inside); SelectionAndCaptures select_to(const Selection& selection, - char c, int count, bool inclusive); + utf8::Codepoint c, + int count, bool inclusive); SelectionAndCaptures select_to_reverse(const Selection& selection, - char c, int count, bool inclusive); + utf8::Codepoint c, + int count, bool inclusive); SelectionAndCaptures select_to_eol(const Selection& selection); SelectionAndCaptures select_to_eol_reverse(const Selection& selection);