Regex: Support forward and backward matching code in the same CompiledRegex

There is no need for two separate regexes to handle forward and backward
matching: passing RegexCompileFlags::Backward adds support for backward
matching to the regex. For a backward-only regex, also pass
RegexCompileFlags::NoForward to disable generation of the forward
matching code.
This commit is contained in:
Maxime Coste 2017-12-01 19:57:02 +08:00
parent e9e3dc862c
commit 413f880e9e
5 changed files with 117 additions and 78 deletions

View File

@ -693,6 +693,12 @@ void paste_all(Context& context, NormalParams params)
selections = std::move(result); selections = std::move(result);
} }
// Map a match direction to the compile flags a Regex needs for that direction:
// a forward match needs no extra flag, while a backward match asks for backward
// code and turns off generation of the forward code.
constexpr RegexCompileFlags direction_flags(MatchDirection direction)
{
    return (direction != MatchDirection::Forward)
        ? RegexCompileFlags::Backward | RegexCompileFlags::NoForward
        : RegexCompileFlags::None;
}
template<MatchDirection direction = MatchDirection::Forward, typename T> template<MatchDirection direction = MatchDirection::Forward, typename T>
void regex_prompt(Context& context, String prompt, String default_regex, T func) void regex_prompt(Context& context, String prompt, String default_regex, T func)
{ {
@ -725,7 +731,7 @@ void regex_prompt(Context& context, String prompt, String default_regex, T func)
context.push_jump(); context.push_jump();
if (not str.empty() or event == PromptEvent::Validate) if (not str.empty() or event == PromptEvent::Validate)
func(Regex{str.empty() ? default_regex : str, RegexCompileFlags::None, direction}, event, context); func(Regex{str.empty() ? default_regex : str, direction_flags(direction)}, event, context);
} }
catch (regex_error& err) catch (regex_error& err)
{ {
@ -795,7 +801,7 @@ void search_next(Context& context, NormalParams params)
StringView str = context.main_sel_register_value(reg); StringView str = context.main_sel_register_value(reg);
if (not str.empty()) if (not str.empty())
{ {
Regex regex{str, RegexCompileFlags::None, direction}; Regex regex{str, direction_flags(direction)};
auto& selections = context.selections(); auto& selections = context.selections();
bool main_wrapped = false; bool main_wrapped = false;
do { do {

View File

@ -3,8 +3,8 @@
namespace Kakoune namespace Kakoune
{ {
Regex::Regex(StringView re, RegexCompileFlags flags, MatchDirection direction) Regex::Regex(StringView re, RegexCompileFlags flags)
: m_impl{new CompiledRegex{compile_regex(re, flags, direction)}}, : m_impl{new CompiledRegex{compile_regex(re, flags)}},
m_str{re.str()} m_str{re.str()}
{} {}

View File

@ -13,8 +13,7 @@ class Regex
public: public:
Regex() = default; Regex() = default;
explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None, explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None);
MatchDirection direction = MatchDirection::Forward);
bool empty() const { return m_str.empty(); } bool empty() const { return m_str.empty(); }
bool operator==(const Regex& other) const { return m_str == other.m_str; } bool operator==(const Regex& other) const { return m_str == other.m_str; }
bool operator!=(const Regex& other) const { return m_str != other.m_str; } bool operator!=(const Regex& other) const { return m_str != other.m_str; }

View File

@ -618,26 +618,43 @@ constexpr RegexParser::ControlEscape RegexParser::control_escapes[];
struct RegexCompiler struct RegexCompiler
{ {
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags, MatchDirection direction) RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags)
: m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward} : m_parsed_regex{parsed_regex}, m_flags(flags)
{ {
kak_assert(not (flags & RegexCompileFlags::NoForward) or flags & RegexCompileFlags::Backward);
// Approximation of the number of instructions generated // Approximation of the number of instructions generated
m_program.instructions.reserve(CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1); m_program.instructions.reserve((CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1)
m_program.start_desc = compute_start_desc(); * (((flags & RegexCompileFlags::Backward) and
not (flags & RegexCompileFlags::NoForward)) ? 2 : 1));
if (not (flags & RegexCompileFlags::NoForward))
{
m_program.forward_start_desc = compute_start_desc(true);
write_search_prefix(); write_search_prefix();
compile_node(0); compile_node(0, true);
push_inst(CompiledRegex::Match); push_inst(CompiledRegex::Match);
}
if (flags & RegexCompileFlags::Backward)
{
m_program.first_backward_inst = m_program.instructions.size();
m_program.backward_start_desc = compute_start_desc(false);
write_search_prefix();
compile_node(0, false);
push_inst(CompiledRegex::Match);
}
else
m_program.first_backward_inst = -1;
m_program.character_classes = std::move(m_parsed_regex.character_classes); m_program.character_classes = std::move(m_parsed_regex.character_classes);
m_program.save_count = m_parsed_regex.capture_count * 2; m_program.save_count = m_parsed_regex.capture_count * 2;
m_program.direction = direction;
} }
CompiledRegex get_compiled_regex() { return std::move(m_program); } CompiledRegex get_compiled_regex() { return std::move(m_program); }
private: private:
uint32_t compile_node_inner(ParsedRegex::NodeIndex index) uint32_t compile_node_inner(ParsedRegex::NodeIndex index, bool forward)
{ {
auto& node = get_node(index); auto& node = get_node(index);
@ -647,7 +664,7 @@ private:
const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and
(node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs))); (node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs)));
if (save) if (save)
push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 0 : 1)); push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1));
Vector<uint32_t> goto_inner_end_offsets; Vector<uint32_t> goto_inner_end_offsets;
switch (node.op) switch (node.op)
@ -669,13 +686,13 @@ private:
break; break;
case ParsedRegex::Sequence: case ParsedRegex::Sequence:
{ {
if (m_forward) if (forward)
for_each_child(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { for_each_child(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
compile_node(child); return true; compile_node(child, true); return true;
}); });
else else
for_each_child_reverse(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { for_each_child_reverse(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
compile_node(child); return true; compile_node(child, false); return true;
}); });
break; break;
} }
@ -690,7 +707,7 @@ private:
for_each_child(m_parsed_regex, index, for_each_child(m_parsed_regex, index,
[&, end = node.children_end](ParsedRegex::NodeIndex child) { [&, end = node.children_end](ParsedRegex::NodeIndex child) {
auto node = compile_node(child); auto node = compile_node(child, forward);
if (child != index+1) if (child != index+1)
m_program.instructions[split_pos++].param = node; m_program.instructions[split_pos++].param = node;
if (get_node(child).children_end != end) if (get_node(child).children_end != end)
@ -703,39 +720,39 @@ private:
break; break;
} }
case ParsedRegex::LookAhead: case ParsedRegex::LookAhead:
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead) : CompiledRegex::LookAhead)
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind), : CompiledRegex::LookBehind),
push_lookaround(index, false, ignore_case)); push_lookaround(index, false, ignore_case));
break; break;
case ParsedRegex::NegativeLookAhead: case ParsedRegex::NegativeLookAhead:
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead) : CompiledRegex::NegativeLookAhead)
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind), : CompiledRegex::NegativeLookBehind),
push_lookaround(index, false, ignore_case)); push_lookaround(index, false, ignore_case));
break; break;
case ParsedRegex::LookBehind: case ParsedRegex::LookBehind:
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind) : CompiledRegex::LookBehind)
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead), : CompiledRegex::LookAhead),
push_lookaround(index, true, ignore_case)); push_lookaround(index, true, ignore_case));
break; break;
case ParsedRegex::NegativeLookBehind: case ParsedRegex::NegativeLookBehind:
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind) : CompiledRegex::NegativeLookBehind)
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead), : CompiledRegex::NegativeLookAhead),
push_lookaround(index, true, ignore_case)); push_lookaround(index, true, ignore_case));
break; break;
case ParsedRegex::LineStart: case ParsedRegex::LineStart:
push_inst(m_forward ? CompiledRegex::LineStart push_inst(forward ? CompiledRegex::LineStart
: CompiledRegex::LineEnd); : CompiledRegex::LineEnd);
break; break;
case ParsedRegex::LineEnd: case ParsedRegex::LineEnd:
push_inst(m_forward ? CompiledRegex::LineEnd push_inst(forward ? CompiledRegex::LineEnd
: CompiledRegex::LineStart); : CompiledRegex::LineStart);
break; break;
case ParsedRegex::WordBoundary: case ParsedRegex::WordBoundary:
@ -745,11 +762,11 @@ private:
push_inst(CompiledRegex::NotWordBoundary); push_inst(CompiledRegex::NotWordBoundary);
break; break;
case ParsedRegex::SubjectBegin: case ParsedRegex::SubjectBegin:
push_inst(m_forward ? CompiledRegex::SubjectBegin push_inst(forward ? CompiledRegex::SubjectBegin
: CompiledRegex::SubjectEnd); : CompiledRegex::SubjectEnd);
break; break;
case ParsedRegex::SubjectEnd: case ParsedRegex::SubjectEnd:
push_inst(m_forward ? CompiledRegex::SubjectEnd push_inst(forward ? CompiledRegex::SubjectEnd
: CompiledRegex::SubjectBegin); : CompiledRegex::SubjectBegin);
break; break;
case ParsedRegex::ResetStart: case ParsedRegex::ResetStart:
@ -761,12 +778,12 @@ private:
m_program.instructions[offset].param = m_program.instructions.size(); m_program.instructions[offset].param = m_program.instructions.size();
if (save) if (save)
push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 1 : 0)); push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 1 : 0));
return start_pos; return start_pos;
} }
uint32_t compile_node(ParsedRegex::NodeIndex index) uint32_t compile_node(ParsedRegex::NodeIndex index, bool forward)
{ {
auto& node = get_node(index); auto& node = get_node(index);
@ -784,10 +801,10 @@ private:
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
} }
auto inner_pos = compile_node_inner(index); auto inner_pos = compile_node_inner(index, forward);
// Write the node multiple times when we have a min count quantifier // Write the node multiple times when we have a min count quantifier
for (int i = 1; i < quantifier.min; ++i) for (int i = 1; i < quantifier.min; ++i)
inner_pos = compile_node_inner(index); inner_pos = compile_node_inner(index, forward);
if (quantifier.allows_infinite_repeat()) if (quantifier.allows_infinite_repeat())
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
@ -801,7 +818,7 @@ private:
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
: CompiledRegex::Split_PrioritizeChild); : CompiledRegex::Split_PrioritizeChild);
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
compile_node_inner(index); compile_node_inner(index, forward);
} }
for (auto offset : goto_ends) for (auto offset : goto_ends)
@ -813,11 +830,11 @@ private:
// Add a set of instruction prefix used in the search use case // Add a set of instruction prefix used in the search use case
void write_search_prefix() void write_search_prefix()
{ {
kak_assert(m_program.instructions.empty()); const uint32_t first_inst = m_program.instructions.size();
push_inst(CompiledRegex::Split_PrioritizeChild, CompiledRegex::search_prefix_size); push_inst(CompiledRegex::Split_PrioritizeChild, first_inst + CompiledRegex::search_prefix_size);
push_inst(CompiledRegex::FindNextStart); push_inst(CompiledRegex::FindNextStart);
push_inst(CompiledRegex::Split_PrioritizeParent, 1); push_inst(CompiledRegex::Split_PrioritizeParent, first_inst + 1);
kak_assert(m_program.instructions.size() == CompiledRegex::search_prefix_size); kak_assert(m_program.instructions.size() == first_inst + CompiledRegex::search_prefix_size);
} }
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0) uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
@ -862,7 +879,7 @@ private:
// returns true if the node did not consume the char, hence a following node in // returns true if the node did not consume the char, hence a following node in
// sequence would be still relevant for the parent node start chars computation. // sequence would be still relevant for the parent node start chars computation.
bool compute_start_desc(ParsedRegex::NodeIndex index, bool compute_start_desc(ParsedRegex::NodeIndex index,
CompiledRegex::StartDesc& start_desc) const CompiledRegex::StartDesc& start_desc, bool forward) const
{ {
auto& node = get_node(index); auto& node = get_node(index);
switch (node.op) switch (node.op)
@ -924,9 +941,9 @@ private:
{ {
bool did_not_consume = false; bool did_not_consume = false;
auto does_not_consume = [&, this](auto child) { auto does_not_consume = [&, this](auto child) {
return this->compute_start_desc(child, start_desc); return this->compute_start_desc(child, start_desc, forward);
}; };
if (m_forward) if (forward)
did_not_consume = for_each_child(m_parsed_regex, index, does_not_consume); did_not_consume = for_each_child(m_parsed_regex, index, does_not_consume);
else else
did_not_consume = for_each_child_reverse(m_parsed_regex, index, does_not_consume); did_not_consume = for_each_child_reverse(m_parsed_regex, index, does_not_consume);
@ -937,7 +954,7 @@ private:
{ {
bool all_consumed = not node.quantifier.allows_none(); bool all_consumed = not node.quantifier.allows_none();
for_each_child(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) { for_each_child(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) {
if (compute_start_desc(child, start_desc)) if (compute_start_desc(child, start_desc, forward))
all_consumed = false; all_consumed = false;
return true; return true;
}); });
@ -960,10 +977,10 @@ private:
} }
[[gnu::noinline]] [[gnu::noinline]]
std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc() const std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc(bool forward) const
{ {
CompiledRegex::StartDesc start_desc{}; CompiledRegex::StartDesc start_desc{};
if (compute_start_desc(0, start_desc) or if (compute_start_desc(0, start_desc, forward) or
not contains(start_desc.map, false)) not contains(start_desc.map, false))
return nullptr; return nullptr;
@ -978,7 +995,6 @@ private:
CompiledRegex m_program; CompiledRegex m_program;
RegexCompileFlags m_flags; RegexCompileFlags m_flags;
ParsedRegex& m_parsed_regex; ParsedRegex& m_parsed_regex;
const bool m_forward;
}; };
void dump_regex(const CompiledRegex& program) void dump_regex(const CompiledRegex& program)
@ -1079,9 +1095,9 @@ void dump_regex(const CompiledRegex& program)
} }
} }
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction) CompiledRegex compile_regex(StringView re, RegexCompileFlags flags)
{ {
return RegexCompiler{RegexParser::parse(re), flags, direction}.get_compiled_regex(); return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
} }
bool is_character_class(const CharacterClass& character_class, Codepoint cp) bool is_character_class(const CharacterClass& character_class, Codepoint cp)
@ -1120,7 +1136,8 @@ struct TestVM : CompiledRegex, ThreadedRegexVM<const char*, dir>
using VMType = ThreadedRegexVM<const char*, dir>; using VMType = ThreadedRegexVM<const char*, dir>;
TestVM(StringView re, bool dump = false) TestVM(StringView re, bool dump = false)
: CompiledRegex{compile_regex(re, RegexCompileFlags::None, dir)}, : CompiledRegex{compile_regex(re, dir == MatchDirection::Forward ?
RegexCompileFlags::None : RegexCompileFlags::Backward)},
VMType{(const CompiledRegex&)*this} VMType{(const CompiledRegex&)*this}
{ if (dump) dump_regex(*this); } { if (dump) dump_regex(*this); }

View File

@ -98,8 +98,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
Vector<Instruction, MemoryDomain::Regex> instructions; Vector<Instruction, MemoryDomain::Regex> instructions;
Vector<CharacterClass, MemoryDomain::Regex> character_classes; Vector<CharacterClass, MemoryDomain::Regex> character_classes;
Vector<Codepoint, MemoryDomain::Regex> lookarounds; Vector<Codepoint, MemoryDomain::Regex> lookarounds;
MatchDirection direction; uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
size_t save_count; uint32_t save_count;
struct StartDesc struct StartDesc
{ {
@ -108,18 +108,21 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
bool map[count+1]; bool map[count+1];
}; };
std::unique_ptr<StartDesc> start_desc; std::unique_ptr<StartDesc> forward_start_desc;
std::unique_ptr<StartDesc> backward_start_desc;
}; };
// Flags controlling how a regex is compiled. Every flag occupies its own bit
// so that flags can be combined and tested independently with bitwise operators.
enum class RegexCompileFlags
{
    None      = 0,
    NoSubs    = 1 << 0,
    Optimize  = 1 << 1,
    // Backward must not reuse Optimize's bit (1 << 1): sharing it would make
    // the two flags indistinguishable, so requesting Optimize would also
    // request generation of backward matching code.
    Backward  = 1 << 2, // also generate backward matching code
    NoForward = 1 << 3, // skip forward matching code; only valid together with Backward
};
constexpr bool with_bit_ops(Meta::Type<RegexCompileFlags>) { return true; } constexpr bool with_bit_ops(Meta::Type<RegexCompileFlags>) { return true; }
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction = MatchDirection::Forward); CompiledRegex compile_regex(StringView re, RegexCompileFlags flags);
enum class RegexExecFlags enum class RegexExecFlags
{ {
@ -145,7 +148,8 @@ public:
ThreadedRegexVM(const CompiledRegex& program) ThreadedRegexVM(const CompiledRegex& program)
: m_program{program} : m_program{program}
{ {
kak_assert(m_program and direction == m_program.direction); kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or
(direction == MatchDirection::Backward and program.first_backward_inst != -1));
} }
ThreadedRegexVM(const ThreadedRegexVM&) = delete; ThreadedRegexVM(const ThreadedRegexVM&) = delete;
@ -183,20 +187,30 @@ public:
const bool search = (flags & RegexExecFlags::Search); const bool search = (flags & RegexExecFlags::Search);
Utf8It start{m_begin}; Utf8It start{m_begin};
if (m_program.start_desc) const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
: m_program.backward_start_desc;
if (start_desc)
{ {
if (search) if (search)
{ {
to_next_start(start, m_end, *m_program.start_desc); to_next_start(start, m_end, *start_desc);
if (start == m_end) // If start_desc is not null, it means we consume at least one char if (start == m_end) // If start_desc is not null, it means we consume at least one char
return false; return false;
} }
else if (start != m_end and else if (start != m_end and
not m_program.start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)]) not start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)])
return false; return false;
} }
return exec_program(start, Thread{&m_program.instructions[search ? 0 : CompiledRegex::search_prefix_size], nullptr}); ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
if (direction == MatchDirection::Forward)
instructions = instructions.subrange(0, m_program.first_backward_inst);
else
instructions = instructions.subrange(m_program.first_backward_inst);
if (not search)
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
return exec_program(start, instructions);
} }
ArrayView<const Iterator> captures() const ArrayView<const Iterator> captures() const
@ -397,10 +411,13 @@ private:
return StepResult::Failed; return StepResult::Failed;
} }
bool exec_program(Utf8It pos, Thread init_thread) bool exec_program(Utf8It pos, ConstArrayView<CompiledRegex::Instruction> instructions)
{ {
ExecState state; ExecState state;
state.current_threads.push_back(init_thread); state.current_threads.push_back({instructions.begin(), nullptr});
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
: m_program.backward_start_desc;
bool found_match = false; bool found_match = false;
while (true) // Iterate on all codepoints and once at the end while (true) // Iterate on all codepoints and once at the end
@ -408,7 +425,7 @@ private:
if (++state.step == 0) if (++state.step == 0)
{ {
// We wrapped, avoid potential collision on inst.last_step by resetting them // We wrapped, avoid potential collision on inst.last_step by resetting them
for (auto& inst : m_program.instructions) for (auto& inst : instructions)
inst.last_step = 0; inst.last_step = 0;
state.step = 1; // step 0 is never valid state.step = 1; // step 0 is never valid
} }
@ -470,8 +487,8 @@ private:
std::reverse(state.current_threads.begin(), state.current_threads.end()); std::reverse(state.current_threads.begin(), state.current_threads.end());
++pos; ++pos;
if (find_next_start and m_program.start_desc) if (find_next_start and start_desc)
to_next_start(pos, m_end, *m_program.start_desc); to_next_start(pos, m_end, *start_desc);
} }
} }