Use boost::wregex implementation and manually utf8 decode into it
That way we get proper unicode support in regular expressions as long as the current locale treats wchar_t as unicode codepoints. Fixes #638 Fixes #595 Fixes #162
This commit is contained in:
parent
bff9d45bdb
commit
abac6a9436
|
@ -394,8 +394,9 @@ CandidateList complete_filename(StringView prefix, const Regex& ignored_regex,
|
|||
|
||||
auto filter = [&ignored_regex, check_ignored_regex, include_hidden, only_dir](const dirent& entry, struct stat& st)
|
||||
{
|
||||
return (include_hidden or StringView{entry.d_name}.substr(0_byte, 1_byte) != ".") and
|
||||
(not check_ignored_regex or not regex_match(entry.d_name, ignored_regex)) and
|
||||
StringView name{entry.d_name};
|
||||
return (include_hidden or name.substr(0_byte, 1_byte) != ".") and
|
||||
(not check_ignored_regex or not regex_match(name.begin(), name.end(), ignored_regex)) and
|
||||
(not only_dir or S_ISDIR(st.st_mode));
|
||||
};
|
||||
auto files = list_files(dirname, filter);
|
||||
|
|
|
@ -308,13 +308,14 @@ private:
|
|||
buffer.iterator_at(range.end), m_regex,
|
||||
match_flags(is_bol(range.begin),
|
||||
is_eol(buffer, range.end),
|
||||
is_bow(buffer, range.begin),
|
||||
is_eow(buffer, range.end))};
|
||||
RegexIt re_end;
|
||||
for (; re_it != re_end; ++re_it)
|
||||
{
|
||||
for (size_t i = 0; i < m_faces.size(); ++i)
|
||||
{
|
||||
auto& sub = (*re_it)[m_faces[i].first];
|
||||
const auto& sub = (*re_it)[m_faces[i].first];
|
||||
matches.push_back({sub.first.coord(), sub.second.coord()});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,6 +5,12 @@
|
|||
namespace Kakoune
|
||||
{
|
||||
|
||||
using Utf8It = RegexUtf8It<const char*>;
|
||||
|
||||
// Compiles the UTF-8 encoded pattern `re` into the underlying boost::wregex
// by decoding it through Utf8It, and keeps the original pattern text in m_str.
// `flags` are forwarded to the boost::wregex constructor so that the syntax
// options requested by the caller are actually applied.
// Any std::runtime_error raised while constructing is rethrown as regex_error.
Regex::Regex(StringView re, flag_type flags) try
    : boost::wregex{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str(re.str())
{} catch (const std::runtime_error& err) { throw regex_error(err.what()); }
|
||||
|
||||
String option_to_string(const Regex& re)
|
||||
{
|
||||
return re.str();
|
||||
|
|
117
src/regex.hh
117
src/regex.hh
|
@ -3,12 +3,9 @@
|
|||
|
||||
#include "string.hh"
|
||||
#include "exception.hh"
|
||||
#include "utf8_iterator.hh"
|
||||
|
||||
#ifdef KAK_USE_STDREGEX
|
||||
#include <regex>
|
||||
#else
|
||||
#include <boost/regex.hpp>
|
||||
#endif
|
||||
|
||||
namespace Kakoune
|
||||
{
|
||||
|
@ -20,16 +17,12 @@ struct regex_error : runtime_error
|
|||
{}
|
||||
};
|
||||
|
||||
#ifdef KAK_USE_STDREGEX
|
||||
// Regex that keeps track of its string representation
|
||||
struct Regex : std::regex
|
||||
struct Regex : boost::wregex
|
||||
{
|
||||
Regex() = default;
|
||||
|
||||
explicit Regex(StringView re, flag_type flags = ECMAScript) try
|
||||
: std::regex(re.begin(), re.end(), flags), m_str(re.str()) {}
|
||||
catch (std::runtime_error& err) { throw regex_error(err.what()); }
|
||||
|
||||
explicit Regex(StringView re, flag_type flags = ECMAScript);
|
||||
bool empty() const { return m_str.empty(); }
|
||||
bool operator==(const Regex& other) const { return m_str == other.m_str; }
|
||||
bool operator!=(const Regex& other) const { return m_str != other.m_str; }
|
||||
|
@ -39,37 +32,97 @@ struct Regex : std::regex
|
|||
private:
|
||||
String m_str;
|
||||
};
|
||||
namespace regex_ns = std;
|
||||
#else
|
||||
struct Regex : boost::regex
|
||||
|
||||
template<typename It>
|
||||
using RegexUtf8It = utf8::iterator<It, wchar_t, ssize_t>;
|
||||
|
||||
namespace RegexConstant = boost::regex_constants;
|
||||
|
||||
template<typename Iterator>
|
||||
struct MatchResults : boost::match_results<RegexUtf8It<Iterator>>
|
||||
{
|
||||
Regex() = default;
|
||||
using ParentType = boost::match_results<RegexUtf8It<Iterator>>;
|
||||
struct SubMatch : std::pair<Iterator, Iterator>
|
||||
{
|
||||
SubMatch() = default;
|
||||
SubMatch(const boost::sub_match<RegexUtf8It<Iterator>>& m)
|
||||
: std::pair<Iterator, Iterator>{m.first.base(), m.second.base()},
|
||||
matched{m.matched}
|
||||
{}
|
||||
|
||||
explicit Regex(StringView re, flag_type flags = ECMAScript) try
|
||||
: boost::regex(re.begin(), re.end(), flags) {}
|
||||
catch (std::runtime_error& err) { throw regex_error(err.what()); }
|
||||
|
||||
String str() const { auto s = boost::regex::str(); return {s.data(), (int)s.length()}; }
|
||||
bool matched = false;
|
||||
};
|
||||
namespace regex_ns = boost;
|
||||
#endif
|
||||
|
||||
template<typename Iterator>
|
||||
using RegexIterator = regex_ns::regex_iterator<Iterator>;
|
||||
|
||||
template<typename Iterator>
|
||||
using MatchResults = regex_ns::match_results<Iterator>;
|
||||
|
||||
namespace RegexConstant = regex_ns::regex_constants;
|
||||
|
||||
inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool eow)
|
||||
struct iterator : boost::match_results<RegexUtf8It<Iterator>>::iterator
|
||||
{
|
||||
return (bol ? RegexConstant::match_default : RegexConstant::match_not_bol |
|
||||
RegexConstant::match_prev_avail) |
|
||||
using ParentType = typename boost::match_results<RegexUtf8It<Iterator>>::iterator;
|
||||
iterator(const ParentType& it) : ParentType(it) {}
|
||||
|
||||
SubMatch operator*() const { return {ParentType::operator*()}; }
|
||||
};
|
||||
|
||||
iterator begin() const { return {ParentType::begin()}; }
|
||||
iterator cbegin() const { return {ParentType::cbegin()}; }
|
||||
iterator end() const { return {ParentType::end()}; }
|
||||
iterator cend() const { return {ParentType::cend()}; }
|
||||
|
||||
SubMatch operator[](size_t s) const { return {ParentType::operator[](s)}; }
|
||||
};
|
||||
|
||||
template<typename Iterator>
|
||||
struct RegexIterator : boost::regex_iterator<RegexUtf8It<Iterator>>
|
||||
{
|
||||
using ParentType = boost::regex_iterator<RegexUtf8It<Iterator>>;
|
||||
using Utf8It = RegexUtf8It<Iterator>;
|
||||
using ValueType = MatchResults<Iterator>;
|
||||
|
||||
RegexIterator() = default;
|
||||
RegexIterator(Iterator begin, Iterator end, const Regex& re,
|
||||
RegexConstant::match_flag_type flags = RegexConstant::match_default)
|
||||
: ParentType{Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re, flags} {}
|
||||
|
||||
const ValueType& operator*() const { return *reinterpret_cast<const ValueType*>(&ParentType::operator*()); }
|
||||
const ValueType* operator->() const { return reinterpret_cast<const ValueType*>(ParentType::operator->()); }
|
||||
};
|
||||
|
||||
// Builds the boost match flags describing the context of a searched range.
// Each boolean tells whether the corresponding boundary really is one:
//   bol / eol: the range starts / ends at a line boundary
//   bow / eow: the range starts / ends at a word boundary
// For every boundary that does not hold, the matching match_not_* flag is set.
inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow, bool eow)
{
    auto res = RegexConstant::match_default;
    if (not bol)
        res = res | RegexConstant::match_not_bol;
    if (not eol)
        res = res | RegexConstant::match_not_eol;
    if (not bow)
        res = res | RegexConstant::match_not_bow;
    if (not eow)
        res = res | RegexConstant::match_not_eow;
    return res;
}
|
||||
|
||||
template<typename It>
|
||||
bool regex_match(It begin, It end, const Regex& re)
|
||||
{
|
||||
using Utf8It = RegexUtf8It<It>;
|
||||
return boost::regex_match(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re);
|
||||
}
|
||||
|
||||
template<typename It>
|
||||
bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
|
||||
{
|
||||
using Utf8It = RegexUtf8It<It>;
|
||||
return boost::regex_match(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, res, re);
|
||||
}
|
||||
|
||||
template<typename It>
|
||||
bool regex_search(It begin, It end, const Regex& re,
|
||||
RegexConstant::match_flag_type flags = RegexConstant::match_default)
|
||||
{
|
||||
using Utf8It = RegexUtf8It<It>;
|
||||
return boost::regex_search(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, re);
|
||||
}
|
||||
|
||||
template<typename It>
|
||||
bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
|
||||
RegexConstant::match_flag_type flags = RegexConstant::match_default)
|
||||
{
|
||||
using Utf8It = RegexUtf8It<It>;
|
||||
return boost::regex_search(Utf8It{begin, begin, end}, Utf8It{end, begin, end}, res, re);
|
||||
}
|
||||
|
||||
String option_to_string(const Regex& re);
|
||||
void option_from_string(StringView str, Regex& re);
|
||||
|
||||
|
|
|
@ -615,20 +615,21 @@ void select_all_matches(SelectionList& selections, const Regex& regex, unsigned
|
|||
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
|
||||
const auto flags = match_flags(is_bol(sel_beg.coord()),
|
||||
is_eol(buffer, sel_end.coord()),
|
||||
is_bow(buffer, sel_beg.coord()),
|
||||
is_eow(buffer, sel_end.coord()));
|
||||
RegexIt re_it(sel_beg, sel_end, regex, flags);
|
||||
RegexIt re_end;
|
||||
|
||||
for (; re_it != re_end; ++re_it)
|
||||
{
|
||||
auto begin = ensure_char_start(buffer, (*re_it)[capture].first);
|
||||
auto begin = (*re_it)[capture].first;
|
||||
if (begin == sel_end)
|
||||
continue;
|
||||
auto end = ensure_char_start(buffer, (*re_it)[capture].second);
|
||||
auto end = (*re_it)[capture].second;
|
||||
|
||||
CaptureList captures;
|
||||
captures.reserve(mark_count);
|
||||
for (auto& match : *re_it)
|
||||
for (const auto& match : *re_it)
|
||||
captures.push_back(buffer.string(match.first.coord(),
|
||||
match.second.coord()));
|
||||
|
||||
|
@ -661,6 +662,7 @@ void split_selections(SelectionList& selections, const Regex& regex, unsigned ca
|
|||
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
|
||||
const auto flags = match_flags(is_bol(begin.coord()),
|
||||
is_eol(buffer, sel_end.coord()),
|
||||
is_bow(buffer, begin.coord()),
|
||||
is_eow(buffer, sel_end.coord()));
|
||||
|
||||
RegexIt re_it(begin, sel_end, regex, flags);
|
||||
|
@ -674,11 +676,10 @@ void split_selections(SelectionList& selections, const Regex& regex, unsigned ca
|
|||
|
||||
if (end != buf_begin)
|
||||
{
|
||||
end = ensure_char_start(buffer, end);
|
||||
auto sel_end = (begin == end) ? end : utf8::previous(end, begin);
|
||||
result.push_back(keep_direction({ begin.coord(), sel_end.coord() }, sel));
|
||||
}
|
||||
begin = ensure_char_start(buffer, (*re_it)[capture].second);
|
||||
begin = (*re_it)[capture].second;
|
||||
}
|
||||
if (begin.coord() <= sel.max())
|
||||
result.push_back(keep_direction({ begin.coord(), sel.max() }, sel));
|
||||
|
|
|
@ -226,7 +226,8 @@ inline bool find_last_match(const Buffer& buffer, const BufferIterator& pos,
|
|||
const bool is_pos_eow = is_eow(buffer, pos.coord());
|
||||
auto begin = buffer.begin();
|
||||
while (begin != pos and regex_search(begin, pos, matches, regex,
|
||||
match_flags(is_bol(begin.coord()), is_pos_eol, is_pos_eow)))
|
||||
match_flags(is_bol(begin.coord()), is_pos_eol,
|
||||
is_bow(buffer, begin.coord()), is_pos_eow)))
|
||||
{
|
||||
begin = utf8::next(matches[0].first, pos);
|
||||
if (res.empty() or matches[0].second > res[0].second)
|
||||
|
@ -244,7 +245,8 @@ bool find_match_in_buffer(const Buffer& buffer, const BufferIterator pos,
|
|||
if (direction == Forward)
|
||||
{
|
||||
if (regex_search(pos, buffer.end(), matches, ex,
|
||||
match_flags(is_bol(pos.coord()), true, true)))
|
||||
match_flags(is_bol(pos.coord()), true,
|
||||
is_bow(buffer, pos.coord()), true)))
|
||||
return true;
|
||||
wrapped = true;
|
||||
return regex_search(buffer.begin(), buffer.end(), matches, ex);
|
||||
|
@ -276,9 +278,9 @@ Selection find_next_match(const Buffer& buffer, const Selection& sel, const Rege
|
|||
auto pos = direction == Forward ? utf8::next(begin, buffer.end()) : begin;
|
||||
if ((found = find_match_in_buffer<direction>(buffer, pos, matches, regex, wrapped)))
|
||||
{
|
||||
begin = ensure_char_start(buffer, matches[0].first);
|
||||
end = ensure_char_start(buffer, matches[0].second);
|
||||
for (auto& match : matches)
|
||||
begin = matches[0].first;
|
||||
end = matches[0].second;
|
||||
for (const auto& match : matches)
|
||||
captures.push_back(buffer.string(match.first.coord(),
|
||||
match.second.coord()));
|
||||
}
|
||||
|
|
1
test/regression/638-highlight-codepoint-with-bracket/cmd
Normal file
1
test/regression/638-highlight-codepoint-with-bracket/cmd
Normal file
|
@ -0,0 +1 @@
|
|||
w<c-l>:q<ret>
|
|
@ -0,0 +1,5 @@
|
|||
{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "“" }, { "face": { "fg": "white", "bg": "blue", "attributes": [] }, "contents": "We" }, { "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": " " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
|
||||
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
|
||||
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
|
||||
{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:4 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
|
||||
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }
|
1
test/regression/638-highlight-codepoint-with-bracket/in
Normal file
1
test/regression/638-highlight-codepoint-with-bracket/in
Normal file
|
@ -0,0 +1 @@
|
|||
“We ought to scrape this planet clean of every living thing on it,”
|
1
test/regression/638-highlight-codepoint-with-bracket/out
Normal file
1
test/regression/638-highlight-codepoint-with-bracket/out
Normal file
|
@ -0,0 +1 @@
|
|||
“We ought to scrape this planet clean of every living thing on it,”
|
11
test/regression/638-highlight-codepoint-with-bracket/patch
Normal file
11
test/regression/638-highlight-codepoint-with-bracket/patch
Normal file
|
@ -0,0 +1,11 @@
|
|||
--- a/display 2016-05-10 09:21:59.272300947 +0100
|
||||
+++ b/display 2016-05-10 09:37:27.866341923 +0100
|
||||
@@ -1,5 +1,5 @@
|
||||
-{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": "“" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "We ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
|
||||
+{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "“" }, { "face": { "fg": "white", "bg": "blue", "attributes": [] }, "contents": "We" }, { "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": " " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "ought to scrape this planet clean of every living thing on it," }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "”" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
|
||||
{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
|
||||
{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
|
||||
-{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:1 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
|
||||
+{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:4 " }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - unnamed0@[kak-test-regression-638-highlight-codepoint-with-bracket]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
|
||||
{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }
|
||||
|
1
test/regression/638-highlight-codepoint-with-bracket/rc
Normal file
1
test/regression/638-highlight-codepoint-with-bracket/rc
Normal file
|
@ -0,0 +1 @@
|
|||
addhl regex '[“”]' 0:red
|
Loading…
Reference in New Issue
Block a user