Regex: switch to custom impl, use boost for checking

This commit is contained in:
Maxime Coste 2017-10-09 14:04:14 +08:00
parent 9305fa1369
commit 065bbc8f59
8 changed files with 179 additions and 117 deletions

View File

@ -789,7 +789,7 @@ const CommandDesc add_hook_cmd = {
if (not contains(hooks, parser[1]))
throw runtime_error{format("Unknown hook '{}'", parser[1])};
Regex regex{parser[2], Regex::optimize | Regex::ECMAScript};
Regex regex{parser[2], RegexCompileFlags::Optimize};
const String& command = parser[3];
auto group = parser.get_switch("group").value_or(StringView{});
get_scope(parser[0], context).hooks().add_hook(parser[1], group.str(), std::move(regex), command);

View File

@ -323,7 +323,7 @@ public:
String id = format("hlregex'{}'", params[0]);
Regex ex{params[0], Regex::optimize};
Regex ex{params[0], RegexCompileFlags::Optimize};
return {id, std::make_unique<RegexHighlighter>(std::move(ex),
std::move(faces))};
@ -1823,8 +1823,8 @@ public:
if (parser[i].empty() or parser[i+1].empty() or parser[i+2].empty())
throw runtime_error("group id, begin and end must not be empty");
const Regex::flag_type flags = match_capture ?
Regex::optimize : Regex::nosubs | Regex::optimize;
const RegexCompileFlags flags = match_capture ?
RegexCompileFlags::Optimize : RegexCompileFlags::NoSubs | RegexCompileFlags::Optimize;
regions.push_back({ parser[i],
Regex{parser[i+1], flags}, Regex{parser[i+2], flags},

View File

@ -981,7 +981,7 @@ void keep(Context& context, NormalParams)
const auto flags = match_flags(is_bol(begin.coord()), false,
is_bow(buffer, begin.coord()),
is_eow(buffer, end.coord())) |
RegexConstant::match_any;
RegexExecFlags::AnyMatch;
if (regex_search(begin, end, ex, flags) == matching)
keep.push_back(sel);
}

View File

@ -1,6 +1,5 @@
#include "regex.hh"
#include "exception.hh"
#include "buffer_utils.hh"
namespace Kakoune
@ -8,17 +7,45 @@ namespace Kakoune
using Utf8It = RegexUtf8It<const char*>;
Regex::Regex(StringView re, flag_type flags) try
: RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()}
boost::regbase::flag_type convert_flags(RegexCompileFlags flags)
{
try
{
m_impl = new CompiledRegex{compile_regex(re)};
boost::regbase::flag_type res = boost::regbase::ECMAScript;
if (flags & RegexCompileFlags::NoSubs)
res |= boost::regbase::nosubs;
if (flags & RegexCompileFlags::Optimize)
res |= boost::regbase::optimize;
return res;
}
catch (runtime_error& err)
boost::regex_constants::match_flag_type convert_flags(RegexExecFlags flags)
{
write_to_debug_buffer(err.what());
boost::regex_constants::match_flag_type res = boost::regex_constants::match_default;
if (flags & RegexExecFlags::NotBeginOfLine)
res |= boost::regex_constants::match_not_bol;
if (flags & RegexExecFlags::NotEndOfLine)
res |= boost::regex_constants::match_not_eol;
if (flags & RegexExecFlags::NotBeginOfWord)
res |= boost::regex_constants::match_not_bow;
if (flags & RegexExecFlags::NotEndOfWord)
res |= boost::regex_constants::match_not_eow;
if (flags & RegexExecFlags::NotBeginOfSubject)
res |= boost::regex_constants::match_not_bob;
if (flags & RegexExecFlags::NotInitialNull)
res |= boost::regex_constants::match_not_initial_null;
if (flags & RegexExecFlags::AnyMatch)
res |= boost::regex_constants::match_any;
if (flags & RegexExecFlags::PrevAvailable)
res |= boost::regex_constants::match_prev_avail;
return res;
}
// Builds a Regex from its textual form `re` using the given compile flags.
//
// Three members are initialised from the same pattern:
//  - m_impl:       the program produced by compile_regex(re, flags)
//  - m_str:        a copy of the pattern text
//  - m_boost_impl: a boost regex built over the pattern through Utf8It
//                  iterators, with `flags` translated by convert_flags()
//
// This is a function-try-block, so a std::runtime_error thrown while
// initialising any member is caught below and rethrown as a regex_error
// carrying the same message.
Regex::Regex(StringView re, RegexCompileFlags flags) try
: m_impl{new CompiledRegex{compile_regex(re, flags)}},
m_str{re.str()},
m_boost_impl{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, convert_flags(flags)}
{
} catch (std::runtime_error& err) { throw regex_error(err.what()); }
String option_to_string(const Regex& re)

View File

@ -12,89 +12,116 @@
namespace Kakoune
{
// Error type for regex failures. It is a runtime_error whose message is
// the supplied description wrapped by the format string below.
struct regex_error : runtime_error
{
// desc: description of the failure, inserted into the message.
regex_error(StringView desc)
: runtime_error{format("regex error: '{}'", desc)}
{}
};
using RegexBase = boost::basic_regex<wchar_t, boost::c_regex_traits<wchar_t>>;
// Regex that keeps track of its string representation
class Regex : public RegexBase
class Regex
{
public:
Regex() = default;
explicit Regex(StringView re, flag_type flags = ECMAScript);
explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None);
bool empty() const { return m_str.empty(); }
bool operator==(const Regex& other) const { return m_str == other.m_str; }
bool operator!=(const Regex& other) const { return m_str != other.m_str; }
const String& str() const { return m_str; }
size_t mark_count() const { return m_impl->save_count / 2 - 1; }
static constexpr const char* option_type_name = "regex";
const CompiledRegex* impl() const { return m_impl.get(); }
using BoostImpl = boost::basic_regex<wchar_t, boost::c_regex_traits<wchar_t>>;
const BoostImpl& boost_impl() const { return m_boost_impl; }
private:
String m_str;
RefPtr<CompiledRegex> m_impl;
String m_str;
BoostImpl m_boost_impl;
};
template<typename It>
using RegexUtf8It = utf8::iterator<It, wchar_t, ssize_t>;
template<typename It>
using RegexIteratorBase = boost::regex_iterator<RegexUtf8It<It>, wchar_t,
boost::c_regex_traits<wchar_t>>;
namespace RegexConstant = boost::regex_constants;
template<typename Iterator>
struct MatchResults : boost::match_results<RegexUtf8It<Iterator>>
struct MatchResults
{
using ParentType = boost::match_results<RegexUtf8It<Iterator>>;
struct SubMatch : std::pair<Iterator, Iterator>
{
SubMatch() = default;
SubMatch(const boost::sub_match<RegexUtf8It<Iterator>>& m)
: std::pair<Iterator, Iterator>{m.first.base(), m.second.base()},
matched{m.matched}
SubMatch(Iterator begin, Iterator end)
: std::pair<Iterator, Iterator>{begin, end}, matched{begin != Iterator{}}
{}
bool matched = false;
};
struct iterator : boost::match_results<RegexUtf8It<Iterator>>::iterator
struct iterator : std::iterator<std::bidirectional_iterator_tag, SubMatch, size_t, SubMatch*, SubMatch>
{
using ParentType = typename boost::match_results<RegexUtf8It<Iterator>>::iterator;
iterator(const ParentType& it) : ParentType(it) {}
using It = typename Vector<Iterator>::const_iterator;
SubMatch operator*() const { return {ParentType::operator*()}; }
iterator() = default;
iterator(It it) : m_it{std::move(it)} {}
// Step back to the previous sub-match. Each sub-match occupies two
// consecutive entries (begin, end) of the underlying vector, so the
// wrapped iterator retreats by two, mirroring operator++ which advances
// by two. (It previously advanced, making decrement behave like increment.)
iterator& operator--() { m_it -= 2; return *this; }
iterator& operator++() { m_it += 2; return *this; }
SubMatch operator*() const { return {*m_it, *(m_it+1)}; }
friend bool operator==(const iterator& lhs, const iterator& rhs) { return lhs.m_it == rhs.m_it; }
friend bool operator!=(const iterator& lhs, const iterator& rhs) { return lhs.m_it != rhs.m_it; }
private:
It m_it;
};
iterator begin() const { return {ParentType::begin()}; }
iterator cbegin() const { return {ParentType::cbegin()}; }
iterator end() const { return {ParentType::end()}; }
iterator cend() const { return {ParentType::cend()}; }
MatchResults() = default;
MatchResults(Vector<Iterator> values) : m_values{std::move(values)} {}
SubMatch operator[](size_t s) const { return {ParentType::operator[](s)}; }
iterator begin() const { return iterator{m_values.begin()}; }
iterator cbegin() const { return iterator{m_values.cbegin()}; }
iterator end() const { return iterator{m_values.end()}; }
iterator cend() const { return iterator{m_values.cend()}; }
size_t size() const { return m_values.size() / 2; }
bool empty() const { return m_values.empty(); }
// Returns the i-th sub-match. Captures are stored flattened as
// (begin, end) pairs in m_values; an index past the stored pairs
// yields a default-constructed SubMatch.
SubMatch operator[](size_t i) const
{
    const size_t first = i * 2;
    if (not (first < m_values.size()))
        return SubMatch{};
    return SubMatch{m_values[first], m_values[first + 1]};
}
friend bool operator==(const MatchResults& lhs, const MatchResults& rhs)
{
return lhs.m_values == rhs.m_values;
}
friend bool operator!=(const MatchResults& lhs, const MatchResults& rhs)
{
return not (lhs == rhs);
}
void swap(MatchResults& other)
{
m_values.swap(other.m_values);
}
private:
Vector<Iterator> m_values;
};
inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow, bool eow)
inline RegexExecFlags match_flags(bool bol, bool eol, bool bow, bool eow)
{
return (bol ? RegexConstant::match_default : RegexConstant::match_not_bol) |
(eol ? RegexConstant::match_default : RegexConstant::match_not_eol) |
(bow ? RegexConstant::match_default : RegexConstant::match_not_bow) |
(eow ? RegexConstant::match_default : RegexConstant::match_not_eow);
return (bol ? RegexExecFlags::None : RegexExecFlags::NotBeginOfLine) |
(eol ? RegexExecFlags::None : RegexExecFlags::NotEndOfLine) |
(bow ? RegexExecFlags::None : RegexExecFlags::NotBeginOfWord) |
(eow ? RegexExecFlags::None : RegexExecFlags::NotEndOfWord);
}
void regex_mismatch(const Regex& re);
template<typename It>
void check_captures(const Regex& re, const MatchResults<It>& res, const Vector<It>& captures)
using RegexUtf8It = utf8::iterator<It, wchar_t, ssize_t>;
template<typename It>
void check_captures(const Regex& re, const boost::match_results<RegexUtf8It<It>>& res, const Vector<It>& captures)
{
if (res.size() > captures.size() * 2)
return regex_mismatch(re);
@ -115,37 +142,18 @@ void check_captures(const Regex& re, const MatchResults<It>& res, const Vector<I
}
}
// Translates boost match flags into the equivalent RegexExecFlags.
// Every boost flag listed in the table that is set in `flags` turns on
// its RegexExecFlags counterpart; bits not listed are ignored.
inline RegexExecFlags convert_flags(RegexConstant::match_flag_type flags)
{
    struct FlagPair
    {
        RegexConstant::match_flag_type boost_flag;
        RegexExecFlags exec_flag;
    };
    const FlagPair table[] = {
        { RegexConstant::match_not_bol,          RegexExecFlags::NotBeginOfLine },
        { RegexConstant::match_not_eol,          RegexExecFlags::NotEndOfLine },
        { RegexConstant::match_not_bow,          RegexExecFlags::NotBeginOfWord },
        { RegexConstant::match_not_eow,          RegexExecFlags::NotEndOfWord },
        { RegexConstant::match_not_bob,          RegexExecFlags::NotBeginOfSubject },
        { RegexConstant::match_not_initial_null, RegexExecFlags::NotInitialNull },
        { RegexConstant::match_any,              RegexExecFlags::AnyMatch },
        { RegexConstant::match_prev_avail,       RegexExecFlags::PrevAvailable },
    };

    RegexExecFlags converted = RegexExecFlags::None;
    for (const FlagPair& entry : table)
    {
        if (flags & entry.boost_flag)
            converted |= entry.exec_flag;
    }
    return converted;
}
boost::regbase::flag_type convert_flags(RegexCompileFlags flags);
boost::regex_constants::match_flag_type convert_flags(RegexExecFlags flags);
template<typename It>
bool regex_match(It begin, It end, const Regex& re)
{
try
{
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
if (re.impl() and matched != regex_match(begin, end, *re.impl()))
const bool matched = regex_match(begin, end, *re.impl());
if (not re.boost_impl().empty() and
matched != boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end},
re.boost_impl()))
regex_mismatch(re);
return matched;
}
@ -160,12 +168,18 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
{
try
{
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
Vector<It> captures;
if (re.impl() and matched != regex_match(begin, end, captures, *re.impl()))
const bool matched = regex_match(begin, end, captures, *re.impl());
boost::match_results<RegexUtf8It<It>> boost_res;
if (not re.boost_impl().empty() and
matched != boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end},
boost_res, re.boost_impl()))
regex_mismatch(re);
if (re.impl() and matched)
check_captures(re, res, captures);
if (not re.boost_impl().empty() and matched)
check_captures(re, boost_res, captures);
res = matched ? MatchResults<It>{std::move(captures)} : MatchResults<It>{};
return matched;
}
catch (std::runtime_error& err)
@ -176,13 +190,16 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
template<typename It>
bool regex_search(It begin, It end, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
RegexExecFlags flags = RegexExecFlags::None)
{
try
{
auto first = (flags & RegexConstant::match_prev_avail) ? begin-1 : begin;
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, first, end}, {end, first, end}, re, flags);
if (re.impl() and matched != regex_search(begin, end, *re.impl(), convert_flags(flags)))
const bool matched = regex_search(begin, end, *re.impl(), flags);
auto first = (flags & RegexExecFlags::PrevAvailable) ? begin-1 : begin;
if (not re.boost_impl().empty() and
matched != boost::regex_search<RegexUtf8It<It>>({begin, first, end}, {end, first, end},
re.boost_impl(), convert_flags(flags)))
regex_mismatch(re);
return matched;
}
@ -194,17 +211,23 @@ bool regex_search(It begin, It end, const Regex& re,
template<typename It>
bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
RegexExecFlags flags = RegexExecFlags::None)
{
try
{
auto first = (flags & RegexConstant::match_prev_avail) ? begin-1 : begin;
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, first, end}, {end, first, end}, res, re, flags);
Vector<It> captures;
if (re.impl() and matched != regex_search(begin, end, captures, *re.impl(), convert_flags(flags)))
const bool matched = regex_search(begin, end, captures, *re.impl(), flags);
auto first = (flags & RegexExecFlags::PrevAvailable) ? begin-1 : begin;
boost::match_results<RegexUtf8It<It>> boost_res;
if (not re.boost_impl().empty() and
matched != boost::regex_search<RegexUtf8It<It>>({begin, first, end}, {end, first, end},
boost_res, re.boost_impl(), convert_flags(flags)))
regex_mismatch(re);
if (re.impl() and matched)
check_captures(re, res, captures);
if (not re.boost_impl().empty() and matched)
check_captures(re, boost_res, captures);
res = matched ? MatchResults<It>{std::move(captures)} : MatchResults<It>{};
return matched;
}
catch (std::runtime_error& err)
@ -219,12 +242,11 @@ void option_from_string(StringView str, Regex& re);
template<typename Iterator>
struct RegexIterator
{
using Utf8It = RegexUtf8It<Iterator>;
using ValueType = MatchResults<Iterator>;
RegexIterator() = default;
RegexIterator(Iterator begin, Iterator end, const Regex& re,
RegexConstant::match_flag_type flags = RegexConstant::match_default)
RegexExecFlags flags = RegexExecFlags::None)
: m_regex{&re}, m_next_begin{begin}, m_begin{begin}, m_end{end}, m_flags{flags}
{
next();
@ -261,11 +283,11 @@ private:
{
kak_assert(m_regex);
RegexConstant::match_flag_type additional_flags{};
RegexExecFlags additional_flags{};
if (m_results.size() and m_results[0].first == m_results[0].second)
additional_flags |= RegexConstant::match_not_initial_null;
additional_flags |= RegexExecFlags::NotInitialNull;
if (m_begin != m_next_begin)
additional_flags |= RegexConstant::match_not_bob | RegexConstant::match_prev_avail;
additional_flags |= RegexExecFlags::NotBeginOfSubject | RegexExecFlags::PrevAvailable;
if (not regex_search(m_next_begin, m_end, m_results, *m_regex,
m_flags | additional_flags))
@ -279,10 +301,9 @@ private:
Iterator m_next_begin{};
const Iterator m_begin{};
const Iterator m_end{};
const RegexConstant::match_flag_type m_flags = RegexConstant::match_default;
const RegexExecFlags m_flags = RegexExecFlags::None;
};
}
#endif // regex_hh_INCLUDED

View File

@ -96,7 +96,7 @@ struct RegexParser
private:
struct InvalidPolicy
{
Codepoint operator()(Codepoint cp) { throw runtime_error{"Invalid utf8 in regex"}; }
Codepoint operator()(Codepoint cp) { throw regex_error{"Invalid utf8 in regex"}; }
};
using Iterator = utf8::iterator<const char*, Codepoint, int, InvalidPolicy>;
@ -466,7 +466,7 @@ private:
[[gnu::noreturn]]
void parse_error(StringView error)
{
throw runtime_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error,
throw regex_error(format("regex parse error: {} at '{}<<<HERE>>>{}'", error,
StringView{m_regex.begin(), m_pos.base()},
StringView{m_pos.base(), m_regex.end()}));
}
@ -515,8 +515,8 @@ const RegexParser::ControlEscape RegexParser::control_escapes[5] = {
struct RegexCompiler
{
RegexCompiler(const ParsedRegex& parsed_regex, MatchDirection direction)
: m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward}
RegexCompiler(const ParsedRegex& parsed_regex, RegexCompileFlags flags, MatchDirection direction)
: m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward}
{
compile_node(m_parsed_regex.ast);
push_inst(CompiledRegex::Match);
@ -535,7 +535,7 @@ private:
const auto start_pos = m_program.instructions.size();
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
if (capture != -1)
if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs)))
push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 0 : 1));
Vector<uint32_t> goto_inner_end_offsets;
@ -629,7 +629,7 @@ private:
for (auto& offset : goto_inner_end_offsets)
m_program.instructions[offset].param = m_program.instructions.size();
if (capture != -1)
if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs)))
push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 1 : 0));
return start_pos;
@ -797,6 +797,7 @@ private:
}
CompiledRegex m_program;
RegexCompileFlags m_flags;
const ParsedRegex& m_parsed_regex;
const bool m_forward;
};
@ -878,9 +879,9 @@ void dump_regex(const CompiledRegex& program)
}
}
CompiledRegex compile_regex(StringView re, MatchDirection direction)
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction)
{
return RegexCompiler{RegexParser::parse(re), direction}.get_compiled_regex();
return RegexCompiler{RegexParser::parse(re), flags, direction}.get_compiled_regex();
}
namespace
@ -891,7 +892,7 @@ struct TestVM : CompiledRegex, ThreadedRegexVM<const char*, dir>
using VMType = ThreadedRegexVM<const char*, dir>;
TestVM(StringView re, bool dump = false)
: CompiledRegex{compile_regex(re, dir)},
: CompiledRegex{compile_regex(re, RegexCompileFlags::None, dir)},
VMType{(const CompiledRegex&)*this}
{ if (dump) dump_regex(*this); }

View File

@ -14,6 +14,11 @@
namespace Kakoune
{
// Error type for regex failures. It adds nothing to runtime_error beyond
// a distinct type, and reuses runtime_error's constructors unchanged.
struct regex_error : runtime_error
{
using runtime_error::runtime_error;
};
enum class MatchDirection
{
Forward,
@ -66,7 +71,15 @@ struct CompiledRegex : RefCountable
std::unique_ptr<StartChars> start_chars;
};
CompiledRegex compile_regex(StringView re, MatchDirection direction = MatchDirection::Forward);
// Bit flags controlling how a regex is compiled; values are distinct bits
// so they can be combined.
//
// Declared as a scoped enum (like RegexExecFlags) so that the enumerators
// do not leak into the enclosing namespace and do not convert implicitly
// to int; all uses must be qualified as RegexCompileFlags::<name>.
enum class RegexCompileFlags
{
    None     = 0,
    NoSubs   = 1 << 0, // only capture group 0 gets Save instructions when compiling
    Optimize = 1 << 1
};
// NOTE(review): presumably opts RegexCompileFlags into the project's generic
// bitwise flag operators (e.g. the `|` and `&` applied to these flags
// elsewhere) — confirm against the with_bit_ops machinery.
constexpr bool with_bit_ops(Meta::Type<RegexCompileFlags>) { return true; }
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction = MatchDirection::Forward);
enum class RegexExecFlags
{
@ -475,7 +488,7 @@ bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& r
ThreadedRegexVM<It, direction> vm{re};
if (vm.exec(begin, end, flags | RegexExecFlags::Search))
{
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
std::move(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
return true;
}
return false;

View File

@ -837,7 +837,7 @@ void select_buffer(SelectionList& selections)
selections = SelectionList{ buffer, target_eol({{0,0}, buffer.back_coord()}) };
}
static RegexConstant::match_flag_type
static RegexExecFlags
match_flags(const Buffer& buf, const BufferIterator& begin, const BufferIterator& end)
{
return match_flags(is_bol(begin.coord()), is_eol(buf, end.coord()),