Use boost::wregex implementation and manually utf8 decode into it

That way we get proper Unicode support in regular expressions, as long
as the current locale treats wchar_t values as Unicode code points.

Fixes #638
Fixes #595
Fixes #162
This commit is contained in:
Maxime Coste 2016-05-10 09:12:30 +01:00
parent bff9d45bdb
commit abac6a9436
12 changed files with 127 additions and 43 deletions

View File

@ -394,8 +394,9 @@ CandidateList complete_filename(StringView prefix, const Regex& ignored_regex,
auto filter = [&ignored_regex, check_ignored_regex, include_hidden, only_dir](const dirent& entry, struct stat& st) auto filter = [&ignored_regex, check_ignored_regex, include_hidden, only_dir](const dirent& entry, struct stat& st)
{ {
return (include_hidden or StringView{entry.d_name}.substr(0_byte, 1_byte) != ".") and StringView name{entry.d_name};
(not check_ignored_regex or not regex_match(entry.d_name, ignored_regex)) and return (include_hidden or name.substr(0_byte, 1_byte) != ".") and
(not check_ignored_regex or not regex_match(name.begin(), name.end(), ignored_regex)) and
(not only_dir or S_ISDIR(st.st_mode)); (not only_dir or S_ISDIR(st.st_mode));
}; };
auto files = list_files(dirname, filter); auto files = list_files(dirname, filter);

View File

@ -308,13 +308,14 @@ private:
buffer.iterator_at(range.end), m_regex, buffer.iterator_at(range.end), m_regex,
match_flags(is_bol(range.begin), match_flags(is_bol(range.begin),
is_eol(buffer, range.end), is_eol(buffer, range.end),
is_bow(buffer, range.begin),
is_eow(buffer, range.end))}; is_eow(buffer, range.end))};
RegexIt re_end; RegexIt re_end;
for (; re_it != re_end; ++re_it) for (; re_it != re_end; ++re_it)
{ {
for (size_t i = 0; i < m_faces.size(); ++i) for (size_t i = 0; i < m_faces.size(); ++i)
{ {
auto& sub = (*re_it)[m_faces[i].first]; const auto& sub = (*re_it)[m_faces[i].first];
matches.push_back({sub.first.coord(), sub.second.coord()}); matches.push_back({sub.first.coord(), sub.second.coord()});
} }
} }

View File

@ -5,6 +5,12 @@
namespace Kakoune namespace Kakoune
{ {
using Utf8It = RegexUtf8It<const char*>;
// Compile the pattern `re` into the underlying boost::wregex.
//
// The pattern bytes are fed to the base class through Utf8It, so the regex
// engine receives the wchar_t values produced by that decoding iterator
// instead of raw chars. The original text is kept in m_str.
//
// `flags` is forwarded to the boost::wregex constructor so that the syntax
// options requested by the caller are honoured (previously it was accepted
// and silently dropped).
//
// Throws regex_error (carrying the original what() message) when the base
// constructor reports a std::runtime_error, e.g. for an invalid pattern.
Regex::Regex(StringView re, flag_type flags) try
    : boost::wregex{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags},
      m_str(re.str())
{}
catch (const std::runtime_error& err)
{
    throw regex_error(err.what());
}
String option_to_string(const Regex& re) String option_to_string(const Regex& re)
{ {
return re.str(); return re.str();

View File

@ -3,12 +3,9 @@
#include "string.hh" #include "string.hh"
#include "exception.hh" #include "exception.hh"
#include "utf8_iterator.hh"
#ifdef KAK_USE_STDREGEX
#include <regex>
#else
#include <boost/regex.hpp> #include <boost/regex.hpp>
#endif
namespace Kakoune namespace Kakoune
{ {
@ -20,16 +17,12 @@ struct regex_error : runtime_error
{} {}
}; };
#ifdef KAK_USE_STDREGEX
// Regex that keeps track of its string representation // Regex that keeps track of its string representation
struct Regex : std::regex struct Regex : boost::wregex
{ {
Regex() = default; Regex() = default;
explicit Regex(StringView re, flag_type flags = ECMAScript) try explicit Regex(StringView re, flag_type flags = ECMAScript);
: std::regex(re.begin(), re.end(), flags), m_str(re.str()) {}
catch (std::runtime_error& err) { throw regex_error(err.what()); }
bool empty() const { return m_str.empty(); } bool empty() const { return m_str.empty(); }
bool operator==(const Regex& other) const { return m_str == other.m_str; } bool operator==(const Regex& other) const { return m_str == other.m_str; }
bool operator!=(const Regex& other) const { return m_str != other.m_str; } bool operator!=(const Regex& other) const { return m_str != other.m_str; }
@ -39,37 +32,97 @@ struct Regex : std::regex
private: private:
String m_str; String m_str;
}; };
namespace regex_ns = std;
#else template<typename It>
struct Regex : boost::regex using RegexUtf8It = utf8::iterator<It, wchar_t, ssize_t>;
namespace RegexConstant = boost::regex_constants;
template<typename Iterator>
struct MatchResults : boost::match_results<RegexUtf8It<Iterator>>
{ {
Regex() = default; using ParentType = boost::match_results<RegexUtf8It<Iterator>>;
struct SubMatch : std::pair<Iterator, Iterator>
{
SubMatch() = default;
SubMatch(const boost::sub_match<RegexUtf8It<Iterator>>& m)
: std::pair<Iterator, Iterator>{m.first.base(), m.second.base()},
matched{m.matched}
{}
explicit Regex(StringView re, flag_type flags = ECMAScript) try bool matched = false;
: boost::regex(re.begin(), re.end(), flags) {} };
catch (std::runtime_error& err) { throw regex_error(err.what()); }
String str() const { auto s = boost::regex::str(); return {s.data(), (int)s.length()}; } struct iterator : boost::match_results<RegexUtf8It<Iterator>>::iterator
{
using ParentType = typename boost::match_results<RegexUtf8It<Iterator>>::iterator;
iterator(const ParentType& it) : ParentType(it) {}
SubMatch operator*() const { return {ParentType::operator*()}; }
};
iterator begin() const { return {ParentType::begin()}; }
iterator cbegin() const { return {ParentType::cbegin()}; }
iterator end() const { return {ParentType::end()}; }
iterator cend() const { return {ParentType::cend()}; }
SubMatch operator[](size_t s) const { return {ParentType::operator[](s)}; }
}; };
namespace regex_ns = boost;
#endif
template<typename Iterator> template<typename Iterator>
using RegexIterator = regex_ns::regex_iterator<Iterator>; struct RegexIterator : boost::regex_iterator<RegexUtf8It<Iterator>>
template<typename Iterator>
using MatchResults = regex_ns::match_results<Iterator>;
namespace RegexConstant = regex_ns::regex_constants;
inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool eow)
{ {
return (bol ? RegexConstant::match_default : RegexConstant::match_not_bol | using ParentType = boost::regex_iterator<RegexUtf8It<Iterator>>;
RegexConstant::match_prev_avail) | using Utf8It = RegexUtf8It<Iterator>;
using ValueType = MatchResults<Iterator>;
RegexIterator() = default;
RegexIterator(Iterator begin, Iterator end, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
: ParentType{Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re, flags} {}
const ValueType& operator*() const { return *reinterpret_cast<const ValueType*>(&ParentType::operator*()); }
const ValueType* operator->() const { return reinterpret_cast<const ValueType*>(ParentType::operator->()); }
};
inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow, bool eow)
{
return (bol ? RegexConstant::match_default : RegexConstant::match_not_bol) |
(eol ? RegexConstant::match_default : RegexConstant::match_not_eol) | (eol ? RegexConstant::match_default : RegexConstant::match_not_eol) |
(bow ? RegexConstant::match_default : RegexConstant::match_not_bow) |
(eow ? RegexConstant::match_default : RegexConstant::match_not_eow); (eow ? RegexConstant::match_default : RegexConstant::match_not_eow);
} }
template<typename It>
bool regex_match(It begin, It end, const Regex& re)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_match(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re);
}
template<typename It>
bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_match(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, res, re);
}
template<typename It>
bool regex_search(It begin, It end, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_search(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re);
}
template<typename It>
bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_search(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, res, re);
}
String option_to_string(const Regex& re); String option_to_string(const Regex& re);
void option_from_string(StringView str, Regex& re); void option_from_string(StringView str, Regex& re);

View File

@ -615,20 +615,21 @@ void select_all_matches(SelectionList& selections, const Regex& regex, unsigned
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end()); auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
const auto flags = match_flags(is_bol(sel_beg.coord()), const auto flags = match_flags(is_bol(sel_beg.coord()),
is_eol(buffer, sel_end.coord()), is_eol(buffer, sel_end.coord()),
is_bow(buffer, sel_beg.coord()),
is_eow(buffer, sel_end.coord())); is_eow(buffer, sel_end.coord()));
RegexIt re_it(sel_beg, sel_end, regex, flags); RegexIt re_it(sel_beg, sel_end, regex, flags);
RegexIt re_end; RegexIt re_end;
for (; re_it != re_end; ++re_it) for (; re_it != re_end; ++re_it)
{ {
auto begin = ensure_char_start(buffer, (*re_it)[capture].first); auto begin = (*re_it)[capture].first;
if (begin == sel_end) if (begin == sel_end)
continue; continue;
auto end = ensure_char_start(buffer, (*re_it)[capture].second); auto end = (*re_it)[capture].second;
CaptureList captures; CaptureList captures;
captures.reserve(mark_count); captures.reserve(mark_count);
for (auto& match : *re_it) for (const auto& match : *re_it)
captures.push_back(buffer.string(match.first.coord(), captures.push_back(buffer.string(match.first.coord(),
match.second.coord())); match.second.coord()));
@ -661,6 +662,7 @@ void split_selections(SelectionList& selections, const Regex& regex, unsigned ca
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end()); auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
const auto flags = match_flags(is_bol(begin.coord()), const auto flags = match_flags(is_bol(begin.coord()),
is_eol(buffer, sel_end.coord()), is_eol(buffer, sel_end.coord()),
is_bow(buffer, begin.coord()),
is_eow(buffer, sel_end.coord())); is_eow(buffer, sel_end.coord()));
RegexIt re_it(begin, sel_end, regex, flags); RegexIt re_it(begin, sel_end, regex, flags);
@ -674,11 +676,10 @@ void split_selections(SelectionList& selections, const Regex& regex, unsigned ca
if (end != buf_begin) if (end != buf_begin)
{ {
end = ensure_char_start(buffer, end);
auto sel_end = (begin == end) ? end : utf8::previous(end, begin); auto sel_end = (begin == end) ? end : utf8::previous(end, begin);
result.push_back(keep_direction({ begin.coord(), sel_end.coord() }, sel)); result.push_back(keep_direction({ begin.coord(), sel_end.coord() }, sel));
} }
begin = ensure_char_start(buffer, (*re_it)[capture].second); begin = (*re_it)[capture].second;
} }
if (begin.coord() <= sel.max()) if (begin.coord() <= sel.max())
result.push_back(keep_direction({ begin.coord(), sel.max() }, sel)); result.push_back(keep_direction({ begin.coord(), sel.max() }, sel));

View File

@ -226,7 +226,8 @@ inline bool find_last_match(const Buffer& buffer, const BufferIterator& pos,
const bool is_pos_eow = is_eow(buffer, pos.coord()); const bool is_pos_eow = is_eow(buffer, pos.coord());
auto begin = buffer.begin(); auto begin = buffer.begin();
while (begin != pos and regex_search(begin, pos, matches, regex, while (begin != pos and regex_search(begin, pos, matches, regex,
match_flags(is_bol(begin.coord()), is_pos_eol, is_pos_eow))) match_flags(is_bol(begin.coord()), is_pos_eol,
is_bow(buffer, begin.coord()), is_pos_eow)))
{ {
begin = utf8::next(matches[0].first, pos); begin = utf8::next(matches[0].first, pos);
if (res.empty() or matches[0].second > res[0].second) if (res.empty() or matches[0].second > res[0].second)
@ -244,7 +245,8 @@ bool find_match_in_buffer(const Buffer& buffer, const BufferIterator pos,
if (direction == Forward) if (direction == Forward)
{ {
if (regex_search(pos, buffer.end(), matches, ex, if (regex_search(pos, buffer.end(), matches, ex,
match_flags(is_bol(pos.coord()), true, true))) match_flags(is_bol(pos.coord()), true,
is_bow(buffer, pos.coord()), true)))
return true; return true;
wrapped = true; wrapped = true;
return regex_search(buffer.begin(), buffer.end(), matches, ex); return regex_search(buffer.begin(), buffer.end(), matches, ex);
@ -276,9 +278,9 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege
auto pos = direction == Forward ? utf8::next(begin, buffer.end()) : begin; auto pos = direction == Forward ? utf8::next(begin, buffer.end()) : begin;
if ((found = find_match_in_buffer<direction>(buffer, pos, matches, regex, wrapped))) if ((found = find_match_in_buffer<direction>(buffer, pos, matches, regex, wrapped)))
{ {
begin = ensure_char_start(buffer, matches[0].first); begin = matches[0].first;
end = ensure_char_start(buffer, matches[0].second); end = matches[0].second;
for (auto& match : matches) for (const auto& match : matches)
captures.push_back(buffer.string(match.first.coord(), captures.push_back(buffer.string(match.first.coord(),
match.second.coord())); match.second.coord()));
} }

View File

@ -0,0 +1 @@
w<c-l>:q<ret>

View File

@ -0,0 +1,5 @@
{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "“" }, { "face": { "fg": "white", "bg": "blue", "attributes": [] }, "contents": "We" }, { "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": " " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:4 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }

View File

@ -0,0 +1 @@
“We ought to scrape this planet clean of every living thing on it,”

View File

@ -0,0 +1 @@
“We ought to scrape this planet clean of every living thing on it,”

View File

@ -0,0 +1,11 @@
--- a/display 2016-05-10 09:21:59.272300947 +0100
+++ b/display 2016-05-10 09:37:27.866341923 +0100
@@ -1,5 +1,5 @@
-{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": "“" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "We ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
+{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "“" }, { "face": { "fg": "white", "bg": "blue", "attributes": [] }, "contents": "We" }, { "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": " " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
-{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:1 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
+{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:4 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }

View File

@ -0,0 +1 @@
addhl regex '[“”]' 0:red