diff --git a/src/commands.cc b/src/commands.cc index 39af4e20..e4a968f8 100644 --- a/src/commands.cc +++ b/src/commands.cc @@ -789,7 +789,7 @@ const CommandDesc add_hook_cmd = { if (not contains(hooks, parser[1])) throw runtime_error{format("Unknown hook '{}'", parser[1])}; - Regex regex{parser[2], Regex::optimize | Regex::ECMAScript}; + Regex regex{parser[2], RegexCompileFlags::Optimize}; const String& command = parser[3]; auto group = parser.get_switch("group").value_or(StringView{}); get_scope(parser[0], context).hooks().add_hook(parser[1], group.str(), std::move(regex), command); diff --git a/src/highlighters.cc b/src/highlighters.cc index d5314b2c..c59a57d1 100644 --- a/src/highlighters.cc +++ b/src/highlighters.cc @@ -323,7 +323,7 @@ public: String id = format("hlregex'{}'", params[0]); - Regex ex{params[0], Regex::optimize}; + Regex ex{params[0], RegexCompileFlags::Optimize}; return {id, std::make_unique(std::move(ex), std::move(faces))}; @@ -1823,8 +1823,8 @@ public: if (parser[i].empty() or parser[i+1].empty() or parser[i+2].empty()) throw runtime_error("group id, begin and end must not be empty"); - const Regex::flag_type flags = match_capture ? - Regex::optimize : Regex::nosubs | Regex::optimize; + const RegexCompileFlags flags = match_capture ? 
+ RegexCompileFlags::Optimize : RegexCompileFlags::NoSubs | RegexCompileFlags::Optimize; regions.push_back({ parser[i], Regex{parser[i+1], flags}, Regex{parser[i+2], flags}, diff --git a/src/normal.cc b/src/normal.cc index 04ddc7f6..bd7d065c 100644 --- a/src/normal.cc +++ b/src/normal.cc @@ -981,7 +981,7 @@ void keep(Context& context, NormalParams) const auto flags = match_flags(is_bol(begin.coord()), false, is_bow(buffer, begin.coord()), is_eow(buffer, end.coord())) | - RegexConstant::match_any; + RegexExecFlags::AnyMatch; if (regex_search(begin, end, ex, flags) == matching) keep.push_back(sel); } diff --git a/src/regex.cc b/src/regex.cc index a53dc498..f7ba0e48 100644 --- a/src/regex.cc +++ b/src/regex.cc @@ -1,6 +1,5 @@ #include "regex.hh" -#include "exception.hh" #include "buffer_utils.hh" namespace Kakoune @@ -8,17 +7,45 @@ namespace Kakoune using Utf8It = RegexUtf8It; -Regex::Regex(StringView re, flag_type flags) try - : RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()} +boost::regbase::flag_type convert_flags(RegexCompileFlags flags) +{ + boost::regbase::flag_type res = boost::regbase::ECMAScript; + if (flags & RegexCompileFlags::NoSubs) + res |= boost::regbase::nosubs; + if (flags & RegexCompileFlags::Optimize) + res |= boost::regbase::optimize; + return res; +} + +boost::regex_constants::match_flag_type convert_flags(RegexExecFlags flags) +{ + boost::regex_constants::match_flag_type res = boost::regex_constants::match_default; + + if (flags & RegexExecFlags::NotBeginOfLine) + res |= boost::regex_constants::match_not_bol; + if (flags & RegexExecFlags::NotEndOfLine) + res |= boost::regex_constants::match_not_eol; + if (flags & RegexExecFlags::NotBeginOfWord) + res |= boost::regex_constants::match_not_bow; + if (flags & RegexExecFlags::NotEndOfWord) + res |= boost::regex_constants::match_not_eow; + if (flags & RegexExecFlags::NotBeginOfSubject) + res |= boost::regex_constants::match_not_bob; + if (flags & 
RegexExecFlags::NotInitialNull) + res |= boost::regex_constants::match_not_initial_null; + if (flags & RegexExecFlags::AnyMatch) + res |= boost::regex_constants::match_any; + if (flags & RegexExecFlags::PrevAvailable) + res |= boost::regex_constants::match_prev_avail; + + return res; +} + +Regex::Regex(StringView re, RegexCompileFlags flags) try + : m_impl{new CompiledRegex{compile_regex(re, flags)}}, + m_str{re.str()}, + m_boost_impl{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, convert_flags(flags)} { - try - { - m_impl = new CompiledRegex{compile_regex(re)}; - } - catch (runtime_error& err) - { - write_to_debug_buffer(err.what()); - } } catch (std::runtime_error& err) { throw regex_error(err.what()); } String option_to_string(const Regex& re) diff --git a/src/regex.hh b/src/regex.hh index eecb947d..fc9d3176 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -12,89 +12,116 @@ namespace Kakoune { -struct regex_error : runtime_error -{ - regex_error(StringView desc) - : runtime_error{format("regex error: '{}'", desc)} - {} -}; - -using RegexBase = boost::basic_regex>; - // Regex that keeps track of its string representation -class Regex : public RegexBase +class Regex { public: Regex() = default; - explicit Regex(StringView re, flag_type flags = ECMAScript); + explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None); bool empty() const { return m_str.empty(); } bool operator==(const Regex& other) const { return m_str == other.m_str; } bool operator!=(const Regex& other) const { return m_str != other.m_str; } const String& str() const { return m_str; } + size_t mark_count() const { return m_impl->save_count / 2 - 1; } + static constexpr const char* option_type_name = "regex"; const CompiledRegex* impl() const { return m_impl.get(); } + using BoostImpl = boost::basic_regex>; + const BoostImpl& boost_impl() const { return m_boost_impl; } + private: - String m_str; RefPtr m_impl; + String m_str; + BoostImpl m_boost_impl; }; -template -using 
RegexUtf8It = utf8::iterator; - -template -using RegexIteratorBase = boost::regex_iterator, wchar_t, - boost::c_regex_traits>; - -namespace RegexConstant = boost::regex_constants; - template -struct MatchResults : boost::match_results> +struct MatchResults { - using ParentType = boost::match_results>; struct SubMatch : std::pair { SubMatch() = default; - SubMatch(const boost::sub_match>& m) - : std::pair{m.first.base(), m.second.base()}, - matched{m.matched} + SubMatch(Iterator begin, Iterator end) + : std::pair{begin, end}, matched{begin != Iterator{}} {} bool matched = false; }; - struct iterator : boost::match_results>::iterator + struct iterator : std::iterator { - using ParentType = typename boost::match_results>::iterator; - iterator(const ParentType& it) : ParentType(it) {} + using It = typename Vector::const_iterator; - SubMatch operator*() const { return {ParentType::operator*()}; } + iterator() = default; + iterator(It it) : m_it{std::move(it)} {} + + /* each capture occupies two consecutive slots (begin, end), so stepping back moves by two */ iterator& operator--() { m_it -= 2; return *this; } + iterator& operator++() { m_it += 2; return *this; } + SubMatch operator*() const { return {*m_it, *(m_it+1)}; } + + friend bool operator==(const iterator& lhs, const iterator& rhs) { return lhs.m_it == rhs.m_it; } + friend bool operator!=(const iterator& lhs, const iterator& rhs) { return lhs.m_it != rhs.m_it; } + private: + + It m_it; }; - iterator begin() const { return {ParentType::begin()}; } - iterator cbegin() const { return {ParentType::cbegin()}; } - iterator end() const { return {ParentType::end()}; } - iterator cend() const { return {ParentType::cend()}; } + MatchResults() = default; + MatchResults(Vector values) : m_values{std::move(values)} {} - SubMatch operator[](size_t s) const { return {ParentType::operator[](s)}; } + iterator begin() const { return iterator{m_values.begin()}; } + iterator cbegin() const { return iterator{m_values.cbegin()}; } + iterator end() const { return iterator{m_values.end()}; } + iterator cend() const { return 
iterator{m_values.cend()}; } + + size_t size() const { return m_values.size() / 2; } + bool empty() const { return m_values.empty(); } + + SubMatch operator[](size_t i) const + { + return i * 2 < m_values.size() ? + SubMatch{m_values[i*2], m_values[i*2+1]} : SubMatch{}; + } + + friend bool operator==(const MatchResults& lhs, const MatchResults& rhs) + { + return lhs.m_values == rhs.m_values; + } + + friend bool operator!=(const MatchResults& lhs, const MatchResults& rhs) + { + return not (lhs == rhs); + } + + void swap(MatchResults& other) + { + m_values.swap(other.m_values); + } + +private: + Vector m_values; }; -inline RegexConstant::match_flag_type match_flags(bool bol, bool eol, bool bow, bool eow) +inline RegexExecFlags match_flags(bool bol, bool eol, bool bow, bool eow) { - return (bol ? RegexConstant::match_default : RegexConstant::match_not_bol) | - (eol ? RegexConstant::match_default : RegexConstant::match_not_eol) | - (bow ? RegexConstant::match_default : RegexConstant::match_not_bow) | - (eow ? RegexConstant::match_default : RegexConstant::match_not_eow); + return (bol ? RegexExecFlags::None : RegexExecFlags::NotBeginOfLine) | + (eol ? RegexExecFlags::None : RegexExecFlags::NotEndOfLine) | + (bow ? RegexExecFlags::None : RegexExecFlags::NotBeginOfWord) | + (eow ? 
RegexExecFlags::None : RegexExecFlags::NotEndOfWord); } void regex_mismatch(const Regex& re); template -void check_captures(const Regex& re, const MatchResults& res, const Vector& captures) +using RegexUtf8It = utf8::iterator; + +template +void check_captures(const Regex& re, const boost::match_results>& res, const Vector& captures) { if (res.size() > captures.size() * 2) return regex_mismatch(re); @@ -115,37 +142,18 @@ void check_captures(const Regex& re, const MatchResults& res, const Vector bool regex_match(It begin, It end, const Regex& re) { try { - bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, re); - if (re.impl() and matched != regex_match(begin, end, *re.impl())) + const bool matched = regex_match(begin, end, *re.impl()); + if (not re.boost_impl().empty() and + matched != boost::regex_match>({begin, begin, end}, {end, begin, end}, + re.boost_impl())) regex_mismatch(re); return matched; } @@ -160,12 +168,18 @@ bool regex_match(It begin, It end, MatchResults& res, const Regex& re) { try { - bool matched = boost::regex_match>({begin, begin, end}, {end, begin, end}, res, re); Vector captures; - if (re.impl() and matched != regex_match(begin, end, captures, *re.impl())) + const bool matched = regex_match(begin, end, captures, *re.impl()); + + boost::match_results> boost_res; + if (not re.boost_impl().empty() and + matched != boost::regex_match>({begin, begin, end}, {end, begin, end}, + boost_res, re.boost_impl())) regex_mismatch(re); - if (re.impl() and matched) - check_captures(re, res, captures); + if (not re.boost_impl().empty() and matched) + check_captures(re, boost_res, captures); + + res = matched ? 
MatchResults{std::move(captures)} : MatchResults{}; return matched; } catch (std::runtime_error& err) @@ -176,13 +190,16 @@ bool regex_match(It begin, It end, MatchResults& res, const Regex& re) template bool regex_search(It begin, It end, const Regex& re, - RegexConstant::match_flag_type flags = RegexConstant::match_default) + RegexExecFlags flags = RegexExecFlags::None) { try { - auto first = (flags & RegexConstant::match_prev_avail) ? begin-1 : begin; - bool matched = boost::regex_search>({begin, first, end}, {end, first, end}, re, flags); - if (re.impl() and matched != regex_search(begin, end, *re.impl(), convert_flags(flags))) + const bool matched = regex_search(begin, end, *re.impl(), flags); + + auto first = (flags & RegexExecFlags::PrevAvailable) ? begin-1 : begin; + if (not re.boost_impl().empty() and + matched != boost::regex_search>({begin, first, end}, {end, first, end}, + re.boost_impl(), convert_flags(flags))) regex_mismatch(re); return matched; } @@ -194,17 +211,23 @@ bool regex_search(It begin, It end, const Regex& re, template bool regex_search(It begin, It end, MatchResults& res, const Regex& re, - RegexConstant::match_flag_type flags = RegexConstant::match_default) + RegexExecFlags flags = RegexExecFlags::None) { try { - auto first = (flags & RegexConstant::match_prev_avail) ? begin-1 : begin; - bool matched = boost::regex_search>({begin, first, end}, {end, first, end}, res, re, flags); Vector captures; - if (re.impl() and matched != regex_search(begin, end, captures, *re.impl(), convert_flags(flags))) + const bool matched = regex_search(begin, end, captures, *re.impl(), flags); + + auto first = (flags & RegexExecFlags::PrevAvailable) ? 
begin-1 : begin; + boost::match_results> boost_res; + if (not re.boost_impl().empty() and + matched != boost::regex_search>({begin, first, end}, {end, first, end}, + boost_res, re.boost_impl(), convert_flags(flags))) regex_mismatch(re); - if (re.impl() and matched) - check_captures(re, res, captures); + if (not re.boost_impl().empty() and matched) + check_captures(re, boost_res, captures); + + res = matched ? MatchResults{std::move(captures)} : MatchResults{}; return matched; } catch (std::runtime_error& err) @@ -219,12 +242,11 @@ void option_from_string(StringView str, Regex& re); template struct RegexIterator { - using Utf8It = RegexUtf8It; using ValueType = MatchResults; RegexIterator() = default; RegexIterator(Iterator begin, Iterator end, const Regex& re, - RegexConstant::match_flag_type flags = RegexConstant::match_default) + RegexExecFlags flags = RegexExecFlags::None) : m_regex{&re}, m_next_begin{begin}, m_begin{begin}, m_end{end}, m_flags{flags} { next(); @@ -261,11 +283,11 @@ private: { kak_assert(m_regex); - RegexConstant::match_flag_type additional_flags{}; + RegexExecFlags additional_flags{}; if (m_results.size() and m_results[0].first == m_results[0].second) - additional_flags |= RegexConstant::match_not_initial_null; + additional_flags |= RegexExecFlags::NotInitialNull; if (m_begin != m_next_begin) - additional_flags |= RegexConstant::match_not_bob | RegexConstant::match_prev_avail; + additional_flags |= RegexExecFlags::NotBeginOfSubject | RegexExecFlags::PrevAvailable; if (not regex_search(m_next_begin, m_end, m_results, *m_regex, m_flags | additional_flags)) @@ -279,10 +301,9 @@ private: Iterator m_next_begin{}; const Iterator m_begin{}; const Iterator m_end{}; - const RegexConstant::match_flag_type m_flags = RegexConstant::match_default; + const RegexExecFlags m_flags = RegexExecFlags::None; }; - } #endif // regex_hh_INCLUDED diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 6673695d..1b056e83 100644 --- a/src/regex_impl.cc +++ 
b/src/regex_impl.cc @@ -96,7 +96,7 @@ struct RegexParser private: struct InvalidPolicy { - Codepoint operator()(Codepoint cp) { throw runtime_error{"Invalid utf8 in regex"}; } + Codepoint operator()(Codepoint cp) { throw regex_error{"Invalid utf8 in regex"}; } }; using Iterator = utf8::iterator; @@ -466,9 +466,9 @@ private: [[gnu::noreturn]] void parse_error(StringView error) { - throw runtime_error(format("regex parse error: {} at '{}<<>>{}'", error, - StringView{m_regex.begin(), m_pos.base()}, - StringView{m_pos.base(), m_regex.end()})); + throw regex_error(format("regex parse error: {} at '{}<<>>{}'", error, + StringView{m_regex.begin(), m_pos.base()}, + StringView{m_pos.base(), m_regex.end()})); } void validate_lookaround(const AstNodePtr& node) @@ -515,8 +515,8 @@ const RegexParser::ControlEscape RegexParser::control_escapes[5] = { struct RegexCompiler { - RegexCompiler(const ParsedRegex& parsed_regex, MatchDirection direction) - : m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward} + RegexCompiler(const ParsedRegex& parsed_regex, RegexCompileFlags flags, MatchDirection direction) + : m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward} { compile_node(m_parsed_regex.ast); push_inst(CompiledRegex::Match); @@ -535,7 +535,7 @@ private: const auto start_pos = m_program.instructions.size(); const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1; - if (capture != -1) + if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs))) push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 
0 : 1)); Vector goto_inner_end_offsets; @@ -629,7 +629,7 @@ private: for (auto& offset : goto_inner_end_offsets) m_program.instructions[offset].param = m_program.instructions.size(); - if (capture != -1) + if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs))) push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 1 : 0)); return start_pos; @@ -797,6 +797,7 @@ private: } CompiledRegex m_program; + RegexCompileFlags m_flags; const ParsedRegex& m_parsed_regex; const bool m_forward; }; @@ -878,9 +879,9 @@ void dump_regex(const CompiledRegex& program) } } -CompiledRegex compile_regex(StringView re, MatchDirection direction) +CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction) { - return RegexCompiler{RegexParser::parse(re), direction}.get_compiled_regex(); + return RegexCompiler{RegexParser::parse(re), flags, direction}.get_compiled_regex(); } namespace @@ -891,7 +892,7 @@ struct TestVM : CompiledRegex, ThreadedRegexVM using VMType = ThreadedRegexVM; TestVM(StringView re, bool dump = false) - : CompiledRegex{compile_regex(re, dir)}, + : CompiledRegex{compile_regex(re, RegexCompileFlags::None, dir)}, VMType{(const CompiledRegex&)*this} { if (dump) dump_regex(*this); } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index d18de8a1..31d919e7 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -14,6 +14,11 @@ namespace Kakoune { +struct regex_error : runtime_error +{ + using runtime_error::runtime_error; +}; + enum class MatchDirection { Forward, @@ -66,7 +71,15 @@ struct CompiledRegex : RefCountable std::unique_ptr start_chars; }; -CompiledRegex compile_regex(StringView re, MatchDirection direction = MatchDirection::Forward); +enum RegexCompileFlags +{ + None = 0, + NoSubs = 1 << 0, + Optimize = 1 << 1 +}; +constexpr bool with_bit_ops(Meta::Type) { return true; } + +CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction = MatchDirection::Forward); enum 
class RegexExecFlags { @@ -475,7 +488,7 @@ bool regex_search(It begin, It end, Vector& captures, const CompiledRegex& r ThreadedRegexVM vm{re}; if (vm.exec(begin, end, flags | RegexExecFlags::Search)) { - std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures)); + std::move(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures)); return true; } return false; diff --git a/src/selectors.cc b/src/selectors.cc index e751de41..076edd7b 100644 --- a/src/selectors.cc +++ b/src/selectors.cc @@ -837,7 +837,7 @@ void select_buffer(SelectionList& selections) selections = SelectionList{ buffer, target_eol({{0,0}, buffer.back_coord()}) }; } -static RegexConstant::match_flag_type +static RegexExecFlags match_flags(const Buffer& buf, const BufferIterator& begin, const BufferIterator& end) { return match_flags(is_bol(begin.coord()), is_eol(buf, end.coord()),