Regex: Support forward and backward matching code in the same CompiledRegex

There is no need for two separate regexes to handle forward and backward
matching: passing RegexCompileFlags::Backward adds support
for backward matching to the regex. For a backward-only regex, also pass
RegexCompileFlags::NoForward to disable generation of
forward matching code.
This commit is contained in:
Maxime Coste 2017-12-01 19:57:02 +08:00
parent e9e3dc862c
commit 413f880e9e
5 changed files with 117 additions and 78 deletions

View File

@ -693,6 +693,12 @@ void paste_all(Context& context, NormalParams params)
selections = std::move(result);
}
// Translate a match direction into the compile flags needed to serve it:
// forward matching needs no extra flag, any other direction asks for
// backward code only (Backward together with NoForward).
constexpr RegexCompileFlags direction_flags(MatchDirection direction)
{
    if (direction != MatchDirection::Forward)
        return RegexCompileFlags::Backward | RegexCompileFlags::NoForward;
    return RegexCompileFlags::None;
}
template<MatchDirection direction = MatchDirection::Forward, typename T>
void regex_prompt(Context& context, String prompt, String default_regex, T func)
{
@ -725,7 +731,7 @@ void regex_prompt(Context& context, String prompt, String default_regex, T func)
context.push_jump();
if (not str.empty() or event == PromptEvent::Validate)
func(Regex{str.empty() ? default_regex : str, RegexCompileFlags::None, direction}, event, context);
func(Regex{str.empty() ? default_regex : str, direction_flags(direction)}, event, context);
}
catch (regex_error& err)
{
@ -795,7 +801,7 @@ void search_next(Context& context, NormalParams params)
StringView str = context.main_sel_register_value(reg);
if (not str.empty())
{
Regex regex{str, RegexCompileFlags::None, direction};
Regex regex{str, direction_flags(direction)};
auto& selections = context.selections();
bool main_wrapped = false;
do {

View File

@ -3,8 +3,8 @@
namespace Kakoune
{
// Build a Regex from the pattern text `re`.
// The pattern is compiled through compile_regex() with the given flags into a
// heap-allocated CompiledRegex owned by m_impl, and m_str is initialised from
// re.str() so the original pattern text stays available.
Regex::Regex(StringView re, RegexCompileFlags flags, MatchDirection direction)
: m_impl{new CompiledRegex{compile_regex(re, flags, direction)}},
Regex::Regex(StringView re, RegexCompileFlags flags)
: m_impl{new CompiledRegex{compile_regex(re, flags)}},
m_str{re.str()}
{}

View File

@ -13,8 +13,7 @@ class Regex
public:
Regex() = default;
explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None,
MatchDirection direction = MatchDirection::Forward);
explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None);
bool empty() const { return m_str.empty(); }
bool operator==(const Regex& other) const { return m_str == other.m_str; }
bool operator!=(const Regex& other) const { return m_str != other.m_str; }

View File

@ -618,26 +618,43 @@ constexpr RegexParser::ControlEscape RegexParser::control_escapes[];
struct RegexCompiler
{
// Compile `parsed_regex` into m_program according to `flags`.
// Forward code is emitted unless NoForward is set; backward code is emitted
// when Backward is set and is appended after any forward code. Each emitted
// program is made of the search prefix, the compiled root node (index 0) and
// a final Match instruction. Character classes are moved out of the parsed
// regex into the program, and save_count is set to two slots per capture.
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags, MatchDirection direction)
: m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward}
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags)
: m_parsed_regex{parsed_regex}, m_flags(flags)
{
// NoForward is only accepted together with Backward
kak_assert(not (flags & RegexCompileFlags::NoForward) or flags & RegexCompileFlags::Backward);
// Approximation of the number of instructions generated
m_program.instructions.reserve(CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1);
m_program.start_desc = compute_start_desc();
// The estimate is doubled when both forward and backward code are generated
m_program.instructions.reserve((CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1)
* (((flags & RegexCompileFlags::Backward) and
not (flags & RegexCompileFlags::NoForward)) ? 2 : 1));
if (not (flags & RegexCompileFlags::NoForward))
{
m_program.forward_start_desc = compute_start_desc(true);
write_search_prefix();
compile_node(0);
compile_node(0, true);
push_inst(CompiledRegex::Match);
}
if (flags & RegexCompileFlags::Backward)
{
// Remember where the backward program begins in the instruction vector
m_program.first_backward_inst = m_program.instructions.size();
m_program.backward_start_desc = compute_start_desc(false);
write_search_prefix();
compile_node(0, false);
push_inst(CompiledRegex::Match);
}
else
// -1 marks a program without backward code
m_program.first_backward_inst = -1;
m_program.character_classes = std::move(m_parsed_regex.character_classes);
m_program.save_count = m_parsed_regex.capture_count * 2;
m_program.direction = direction;
}
// Return the compiled program by moving it out of m_program.
CompiledRegex get_compiled_regex() { return std::move(m_program); }
private:
uint32_t compile_node_inner(ParsedRegex::NodeIndex index)
uint32_t compile_node_inner(ParsedRegex::NodeIndex index, bool forward)
{
auto& node = get_node(index);
@ -647,7 +664,7 @@ private:
const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and
(node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs)));
if (save)
push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 0 : 1));
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1));
Vector<uint32_t> goto_inner_end_offsets;
switch (node.op)
@ -669,13 +686,13 @@ private:
break;
case ParsedRegex::Sequence:
{
if (m_forward)
if (forward)
for_each_child(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
compile_node(child); return true;
compile_node(child, true); return true;
});
else
for_each_child_reverse(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
compile_node(child); return true;
compile_node(child, false); return true;
});
break;
}
@ -690,7 +707,7 @@ private:
for_each_child(m_parsed_regex, index,
[&, end = node.children_end](ParsedRegex::NodeIndex child) {
auto node = compile_node(child);
auto node = compile_node(child, forward);
if (child != index+1)
m_program.instructions[split_pos++].param = node;
if (get_node(child).children_end != end)
@ -703,39 +720,39 @@ private:
break;
}
case ParsedRegex::LookAhead:
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead)
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind),
push_lookaround(index, false, ignore_case));
break;
case ParsedRegex::NegativeLookAhead:
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead)
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind),
push_lookaround(index, false, ignore_case));
break;
case ParsedRegex::LookBehind:
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind)
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead),
push_lookaround(index, true, ignore_case));
break;
case ParsedRegex::NegativeLookBehind:
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind)
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead),
push_lookaround(index, true, ignore_case));
break;
case ParsedRegex::LineStart:
push_inst(m_forward ? CompiledRegex::LineStart
push_inst(forward ? CompiledRegex::LineStart
: CompiledRegex::LineEnd);
break;
case ParsedRegex::LineEnd:
push_inst(m_forward ? CompiledRegex::LineEnd
push_inst(forward ? CompiledRegex::LineEnd
: CompiledRegex::LineStart);
break;
case ParsedRegex::WordBoundary:
@ -745,11 +762,11 @@ private:
push_inst(CompiledRegex::NotWordBoundary);
break;
case ParsedRegex::SubjectBegin:
push_inst(m_forward ? CompiledRegex::SubjectBegin
push_inst(forward ? CompiledRegex::SubjectBegin
: CompiledRegex::SubjectEnd);
break;
case ParsedRegex::SubjectEnd:
push_inst(m_forward ? CompiledRegex::SubjectEnd
push_inst(forward ? CompiledRegex::SubjectEnd
: CompiledRegex::SubjectBegin);
break;
case ParsedRegex::ResetStart:
@ -761,12 +778,12 @@ private:
m_program.instructions[offset].param = m_program.instructions.size();
if (save)
push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 1 : 0));
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 1 : 0));
return start_pos;
}
uint32_t compile_node(ParsedRegex::NodeIndex index)
uint32_t compile_node(ParsedRegex::NodeIndex index, bool forward)
{
auto& node = get_node(index);
@ -784,10 +801,10 @@ private:
goto_ends.push_back(split_pos);
}
auto inner_pos = compile_node_inner(index);
auto inner_pos = compile_node_inner(index, forward);
// Write the node multiple times when we have a min count quantifier
for (int i = 1; i < quantifier.min; ++i)
inner_pos = compile_node_inner(index);
inner_pos = compile_node_inner(index, forward);
if (quantifier.allows_infinite_repeat())
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
@ -801,7 +818,7 @@ private:
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
: CompiledRegex::Split_PrioritizeChild);
goto_ends.push_back(split_pos);
compile_node_inner(index);
compile_node_inner(index, forward);
}
for (auto offset : goto_ends)
@ -813,11 +830,11 @@ private:
// Append the instruction prefix used for the search use case.
// The prefix is written at the current end of the instruction vector
// (first_inst), so its Split parameters are expressed relative to that
// position: Split_PrioritizeChild gets first_inst + search_prefix_size and
// Split_PrioritizeParent gets first_inst + 1. The closing assert checks that
// exactly search_prefix_size instructions were appended.
void write_search_prefix()
{
kak_assert(m_program.instructions.empty());
push_inst(CompiledRegex::Split_PrioritizeChild, CompiledRegex::search_prefix_size);
const uint32_t first_inst = m_program.instructions.size();
push_inst(CompiledRegex::Split_PrioritizeChild, first_inst + CompiledRegex::search_prefix_size);
push_inst(CompiledRegex::FindNextStart);
push_inst(CompiledRegex::Split_PrioritizeParent, 1);
kak_assert(m_program.instructions.size() == CompiledRegex::search_prefix_size);
push_inst(CompiledRegex::Split_PrioritizeParent, first_inst + 1);
kak_assert(m_program.instructions.size() == first_inst + CompiledRegex::search_prefix_size);
}
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
@ -862,7 +879,7 @@ private:
// returns true if the node did not consume the char, hence a following node in
// sequence would be still relevant for the parent node start chars computation.
bool compute_start_desc(ParsedRegex::NodeIndex index,
CompiledRegex::StartDesc& start_desc) const
CompiledRegex::StartDesc& start_desc, bool forward) const
{
auto& node = get_node(index);
switch (node.op)
@ -924,9 +941,9 @@ private:
{
bool did_not_consume = false;
auto does_not_consume = [&, this](auto child) {
return this->compute_start_desc(child, start_desc);
return this->compute_start_desc(child, start_desc, forward);
};
if (m_forward)
if (forward)
did_not_consume = for_each_child(m_parsed_regex, index, does_not_consume);
else
did_not_consume = for_each_child_reverse(m_parsed_regex, index, does_not_consume);
@ -937,7 +954,7 @@ private:
{
bool all_consumed = not node.quantifier.allows_none();
for_each_child(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) {
if (compute_start_desc(child, start_desc))
if (compute_start_desc(child, start_desc, forward))
all_consumed = false;
return true;
});
@ -960,10 +977,10 @@ private:
}
[[gnu::noinline]]
std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc() const
std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc(bool forward) const
{
CompiledRegex::StartDesc start_desc{};
if (compute_start_desc(0, start_desc) or
if (compute_start_desc(0, start_desc, forward) or
not contains(start_desc.map, false))
return nullptr;
@ -978,7 +995,6 @@ private:
CompiledRegex m_program;
RegexCompileFlags m_flags;
ParsedRegex& m_parsed_regex;
const bool m_forward;
};
void dump_regex(const CompiledRegex& program)
@ -1079,9 +1095,9 @@ void dump_regex(const CompiledRegex& program)
}
}
// Parse the regex source `re` with RegexParser::parse, compile the result with
// a RegexCompiler using `flags`, and return the resulting CompiledRegex.
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction)
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags)
{
return RegexCompiler{RegexParser::parse(re), flags, direction}.get_compiled_regex();
return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
}
bool is_character_class(const CharacterClass& character_class, Codepoint cp)
@ -1120,7 +1136,8 @@ struct TestVM : CompiledRegex, ThreadedRegexVM<const char*, dir>
using VMType = ThreadedRegexVM<const char*, dir>;
// Compile `re` for the VM direction `dir` and bind the VM to this object's
// CompiledRegex base. Forward VMs compile with no extra flag; other directions
// add RegexCompileFlags::Backward (NoForward is not passed, so forward code is
// generated as well). When `dump` is true the program is handed to dump_regex.
TestVM(StringView re, bool dump = false)
: CompiledRegex{compile_regex(re, RegexCompileFlags::None, dir)},
: CompiledRegex{compile_regex(re, dir == MatchDirection::Forward ?
RegexCompileFlags::None : RegexCompileFlags::Backward)},
VMType{(const CompiledRegex&)*this}
{ if (dump) dump_regex(*this); }

View File

@ -98,8 +98,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
Vector<Instruction, MemoryDomain::Regex> instructions;
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
Vector<Codepoint, MemoryDomain::Regex> lookarounds;
MatchDirection direction;
size_t save_count;
uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
uint32_t save_count;
struct StartDesc
{
@ -108,18 +108,21 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
bool map[count+1];
};
std::unique_ptr<StartDesc> start_desc;
std::unique_ptr<StartDesc> forward_start_desc;
std::unique_ptr<StartDesc> backward_start_desc;
};
// Flags controlling how a regex gets compiled.
// Every flag owns a distinct bit so that flags can be combined and tested
// independently; Backward previously shared bit 1 with Optimize, which made
// the two flags indistinguishable.
enum class RegexCompileFlags
{
    None      = 0,
    NoSubs    = 1 << 0,
    Optimize  = 1 << 1,
    Backward  = 1 << 2, // generate backward matching code
    NoForward = 1 << 3, // do not generate forward matching code (needs Backward)
};
// NOTE(review): presumably the opt-in hook that enables the bitwise operators
// applied to RegexCompileFlags values (e.g. `flags & ...`, `a | b`); confirm
// against the with_bit_ops machinery defined elsewhere.
constexpr bool with_bit_ops(Meta::Type<RegexCompileFlags>) { return true; }
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction = MatchDirection::Forward);
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags);
enum class RegexExecFlags
{
@ -145,7 +148,8 @@ public:
// Bind the VM to `program`.
// The assertion checks that the program carries code for this VM's direction:
// a Forward VM requires first_backward_inst != 0 (0 means backward only), and
// a Backward VM requires first_backward_inst != -1 (-1 means no backward code).
ThreadedRegexVM(const CompiledRegex& program)
: m_program{program}
{
kak_assert(m_program and direction == m_program.direction);
kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or
(direction == MatchDirection::Backward and program.first_backward_inst != -1));
}
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
@ -183,20 +187,30 @@ public:
const bool search = (flags & RegexExecFlags::Search);
Utf8It start{m_begin};
if (m_program.start_desc)
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
: m_program.backward_start_desc;
if (start_desc)
{
if (search)
{
to_next_start(start, m_end, *m_program.start_desc);
to_next_start(start, m_end, *start_desc);
if (start == m_end) // If start_desc is not null, it means we consume at least one char
return false;
}
else if (start != m_end and
not m_program.start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)])
not start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)])
return false;
}
return exec_program(start, Thread{&m_program.instructions[search ? 0 : CompiledRegex::search_prefix_size], nullptr});
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
if (direction == MatchDirection::Forward)
instructions = instructions.subrange(0, m_program.first_backward_inst);
else
instructions = instructions.subrange(m_program.first_backward_inst);
if (not search)
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
return exec_program(start, instructions);
}
ArrayView<const Iterator> captures() const
@ -397,10 +411,13 @@ private:
return StepResult::Failed;
}
bool exec_program(Utf8It pos, Thread init_thread)
bool exec_program(Utf8It pos, ConstArrayView<CompiledRegex::Instruction> instructions)
{
ExecState state;
state.current_threads.push_back(init_thread);
state.current_threads.push_back({instructions.begin(), nullptr});
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
: m_program.backward_start_desc;
bool found_match = false;
while (true) // Iterate on all codepoints and once at the end
@ -408,7 +425,7 @@ private:
if (++state.step == 0)
{
// We wrapped, avoid potential collision on inst.last_step by resetting them
for (auto& inst : m_program.instructions)
for (auto& inst : instructions)
inst.last_step = 0;
state.step = 1; // step 0 is never valid
}
@ -470,8 +487,8 @@ private:
std::reverse(state.current_threads.begin(), state.current_threads.end());
++pos;
if (find_next_start and m_program.start_desc)
to_next_start(pos, m_end, *m_program.start_desc);
if (find_next_start and start_desc)
to_next_start(pos, m_end, *start_desc);
}
}