Use boost::wregex implementation and manually utf8 decode into it

That way we get proper unicode support in regular expressions as long
as the current locale treats wchar_t as unicode codepoints.

Fixes #638
Fixes #595
Fixes #162
This commit is contained in:
Maxime Coste 2016-05-10 09:12:30 +01:00
parent bff9d45bdb
commit abac6a9436
12 changed files with 127 additions and 43 deletions

View File

@ -394,8 +394,9 @@ CandidateList complete_filename(StringView prefix, const Regex& ignored_regex,
auto filter = [&ignored_regex, check_ignored_regex, include_hidden, only_dir](const dirent& entry, struct stat& st)
{
return (include_hidden or StringView{entry.d_name}.substr(0_byte, 1_byte) != ".") and
(not check_ignored_regex or not regex_match(entry.d_name, ignored_regex)) and
StringView name{entry.d_name};
return (include_hidden or name.substr(0_byte, 1_byte) != ".") and
(not check_ignored_regex or not regex_match(name.begin(), name.end(), ignored_regex)) and
(not only_dir or S_ISDIR(st.st_mode));
};
auto files = list_files(dirname, filter);

View File

@ -308,13 +308,14 @@ private:
buffer.iterator_at(range.end), m_regex,
match_flags(is_bol(range.begin),
is_eol(buffer, range.end),
is_bow(buffer, range.begin),
is_eow(buffer, range.end))};
RegexIt re_end;
for (; re_it != re_end; ++re_it)
{
for (size_t i = 0; i < m_faces.size(); ++i)
{
auto& sub = (*re_it)[m_faces[i].first];
const auto& sub = (*re_it)[m_faces[i].first];
matches.push_back({sub.first.coord(), sub.second.coord()});
}
}

View File

@ -5,6 +5,12 @@
namespace Kakoune
{
using Utf8It = RegexUtf8It<const char*>;
// Compile the pattern `re` into the boost::wregex base by iterating it through
// Utf8It (RegexUtf8It<const char*>), and remember the original text in m_str.
//
// `flags` is forwarded to the underlying boost::wregex so that callers asking
// for non-default syntax options actually get them (it used to be dropped).
//
// Any std::runtime_error raised while compiling the pattern is rethrown as a
// Kakoune regex_error carrying the same message.
Regex::Regex(StringView re, flag_type flags) try
    : boost::wregex{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str(re.str())
{}
catch (const std::runtime_error& err) { throw regex_error(err.what()); }
String option_to_string(const Regex& re)
{
return re.str();

View File

@ -3,12 +3,9 @@
#include "string.hh"
#include "exception.hh"
#include "utf8_iterator.hh"
#ifdef KAK_USE_STDREGEX
#include <regex>
#else
#include <boost/regex.hpp>
#endif
namespace Kakoune
{
@ -20,16 +17,12 @@ struct regex_error : runtime_error
{}
};
#ifdef KAK_USE_STDREGEX
// Regex that keeps track of its string representation
struct Regex : std::regex
struct Regex : boost::wregex
{
Regex() = default;
explicit Regex(StringView re, flag_type flags = ECMAScript) try
: std::regex(re.begin(), re.end(), flags), m_str(re.str()) {}
catch (std::runtime_error& err) { throw regex_error(err.what()); }
explicit Regex(StringView re, flag_type flags = ECMAScript);
bool empty() const { return m_str.empty(); }
bool operator==(const Regex& other) const { return m_str == other.m_str; }
bool operator!=(const Regex& other) const { return m_str != other.m_str; }
@ -39,37 +32,97 @@ struct Regex : std::regex
private:
String m_str;
};
namespace regex_ns = std;
#else
struct Regex : boost::regex
template<typename It>
using RegexUtf8It = utf8::iterator<It, wchar_t, ssize_t>;
namespace RegexConstant = boost::regex_constants;
template<typename Iterator>
struct MatchResults : boost::match_results<RegexUtf8It<Iterator>>
{
Regex() = default;
using ParentType = boost::match_results<RegexUtf8It<Iterator>>;
struct SubMatch : std::pair<Iterator, Iterator>
{
SubMatch() = default;
SubMatch(const boost::sub_match<RegexUtf8It<Iterator>>& m)
: std::pair<Iterator, Iterator>{m.first.base(), m.second.base()},
matched{m.matched}
{}
explicit Regex(StringView re, flag_type flags = ECMAScript) try
: boost::regex(re.begin(), re.end(), flags) {}
catch (std::runtime_error& err) { throw regex_error(err.what()); }
bool matched = false;
};
String str() const { auto s = boost::regex::str(); return {s.data(), (int)s.length()}; }
struct iterator : boost::match_results<RegexUtf8It<Iterator>>::iterator
{
using ParentType = typename boost::match_results<RegexUtf8It<Iterator>>::iterator;
iterator(const ParentType& it) : ParentType(it) {}
SubMatch operator*() const { return {ParentType::operator*()}; }
};
iterator begin() const { return {ParentType::begin()}; }
iterator cbegin() const { return {ParentType::cbegin()}; }
iterator end() const { return {ParentType::end()}; }
iterator cend() const { return {ParentType::cend()}; }
SubMatch operator[](size_t s) const { return {ParentType::operator[](s)}; }
};
namespace regex_ns = boost;
#endif
template<typename Iterator>
using RegexIterator = regex_ns::regex_iterator<Iterator>;
template<typename Iterator>
using MatchResults = regex_ns::match_results<Iterator>;
namespace RegexConstant = regex_ns::regex_constants;
inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool eow)
struct RegexIterator : boost::regex_iterator<RegexUtf8It<Iterator>>
{
return (bol ? RegexConstant::match_default : RegexConstant::match_not_bol |
RegexConstant::match_prev_avail) |
using ParentType = boost::regex_iterator<RegexUtf8It<Iterator>>;
using Utf8It = RegexUtf8It<Iterator>;
using ValueType = MatchResults<Iterator>;
RegexIterator() = default;
RegexIterator(Iterator begin, Iterator end, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
: ParentType{Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re, flags} {}
const ValueType& operator*() const { return *reinterpret_cast<const ValueType*>(&ParentType::operator*()); }
const ValueType* operator->() const { return reinterpret_cast<const ValueType*>(ParentType::operator->()); }
};
// Build the match flags describing the boundaries of the searched range.
// Each boolean tells whether the corresponding boundary really is a
// beginning/end of line (bol/eol) or beginning/end of word (bow/eow); every
// boundary that is *not* one contributes its match_not_* flag to the result.
inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow, bool eow)
{
    auto result = RegexConstant::match_default;
    if (not bol)
        result = result | RegexConstant::match_not_bol;
    if (not eol)
        result = result | RegexConstant::match_not_eol;
    if (not bow)
        result = result | RegexConstant::match_not_bow;
    if (not eow)
        result = result | RegexConstant::match_not_eow;
    return result;
}
template<typename It>
bool regex_match(It begin, It end, const Regex& re)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_match(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re);
}
template<typename It>
bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_match(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, res, re);
}
template<typename It>
bool regex_search(It begin, It end, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_search(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re);
}
template<typename It>
bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
{
using Utf8It = RegexUtf8It<It>;
return boost::regex_search(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, res, re);
}
String option_to_string(const Regex& re);
void option_from_string(StringView str, Regex& re);

View File

@ -615,20 +615,21 @@ void select_all_matches(SelectionList& selections, const Regex& regex, unsigned
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
const auto flags = match_flags(is_bol(sel_beg.coord()),
is_eol(buffer, sel_end.coord()),
is_bow(buffer, sel_beg.coord()),
is_eow(buffer, sel_end.coord()));
RegexIt re_it(sel_beg, sel_end, regex, flags);
RegexIt re_end;
for (; re_it != re_end; ++re_it)
{
auto begin = ensure_char_start(buffer, (*re_it)[capture].first);
auto begin = (*re_it)[capture].first;
if (begin == sel_end)
continue;
auto end = ensure_char_start(buffer, (*re_it)[capture].second);
auto end = (*re_it)[capture].second;
CaptureList captures;
captures.reserve(mark_count);
for (auto& match : *re_it)
for (const auto& match : *re_it)
captures.push_back(buffer.string(match.first.coord(),
match.second.coord()));
@ -661,6 +662,7 @@ void split_selections(SelectionList& selections, const Regex& regex, unsigned ca
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
const auto flags = match_flags(is_bol(begin.coord()),
is_eol(buffer, sel_end.coord()),
is_bow(buffer, begin.coord()),
is_eow(buffer, sel_end.coord()));
RegexIt re_it(begin, sel_end, regex, flags);
@ -674,11 +676,10 @@ void split_selections(SelectionList& selections, const Regex& regex, unsigned ca
if (end != buf_begin)
{
end = ensure_char_start(buffer, end);
auto sel_end = (begin == end) ? end : utf8::previous(end, begin);
result.push_back(keep_direction({ begin.coord(), sel_end.coord() }, sel));
}
begin = ensure_char_start(buffer, (*re_it)[capture].second);
begin = (*re_it)[capture].second;
}
if (begin.coord() <= sel.max())
result.push_back(keep_direction({ begin.coord(), sel.max() }, sel));

View File

@ -226,7 +226,8 @@ inline bool find_last_match(const Buffer& buffer, const BufferIterator& pos,
const bool is_pos_eow = is_eow(buffer, pos.coord());
auto begin = buffer.begin();
while (begin != pos and regex_search(begin, pos, matches, regex,
match_flags(is_bol(begin.coord()), is_pos_eol, is_pos_eow)))
match_flags(is_bol(begin.coord()), is_pos_eol,
is_bow(buffer, begin.coord()), is_pos_eow)))
{
begin = utf8::next(matches[0].first, pos);
if (res.empty() or matches[0].second > res[0].second)
@ -244,7 +245,8 @@ bool find_match_in_buffer(const Buffer& buffer, const BufferIterator pos,
if (direction == Forward)
{
if (regex_search(pos, buffer.end(), matches, ex,
match_flags(is_bol(pos.coord()), true, true)))
match_flags(is_bol(pos.coord()), true,
is_bow(buffer, pos.coord()), true)))
return true;
wrapped = true;
return regex_search(buffer.begin(), buffer.end(), matches, ex);
@ -276,9 +278,9 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege
auto pos = direction == Forward ? utf8::next(begin, buffer.end()) : begin;
if ((found = find_match_in_buffer<direction>(buffer, pos, matches, regex, wrapped)))
{
begin = ensure_char_start(buffer, matches[0].first);
end = ensure_char_start(buffer, matches[0].second);
for (auto& match : matches)
begin = matches[0].first;
end = matches[0].second;
for (const auto& match : matches)
captures.push_back(buffer.string(match.first.coord(),
match.second.coord()));
}

View File

@ -0,0 +1 @@
w<c-l>:q<ret>

View File

@ -0,0 +1,5 @@
{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "“" }, { "face": { "fg": "white", "bg": "blue", "attributes": [] }, "contents": "We" }, { "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": " " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:4 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }

View File

@ -0,0 +1 @@
“We ought to scrape this planet clean of every living thing on it,”

View File

@ -0,0 +1 @@
“We ought to scrape this planet clean of every living thing on it,”

View File

@ -0,0 +1,11 @@
--- a/display 2016-05-10 09:21:59.272300947 +0100
+++ b/display 2016-05-10 09:37:27.866341923 +0100
@@ -1,5 +1,5 @@
-{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": "“" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "We ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
+{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "“" }, { "face": { "fg": "white", "bg": "blue", "attributes": [] }, "contents": "We" }, { "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": " " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
-{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:1 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
+{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:4 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }

View File

@ -0,0 +1 @@
addhl regex '[“”]' 0:red