Remove most regex impl special casing for backwards matching

This commit is contained in:
Maxime Coste 2018-11-03 13:52:40 +11:00
parent ee74c2c2df
commit 4ac7df3842
2 changed files with 60 additions and 51 deletions

View File

@ -755,40 +755,30 @@ private:
break;
}
case ParsedRegex::LookAhead:
push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead)
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind),
push_inst(ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead,
push_lookaround<MatchDirection::Forward>(index, ignore_case));
break;
case ParsedRegex::NegativeLookAhead:
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead)
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind),
push_inst(ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead,
push_lookaround<MatchDirection::Forward>(index, ignore_case));
break;
case ParsedRegex::LookBehind:
push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind)
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead),
push_inst(ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind,
push_lookaround<MatchDirection::Backward>(index, ignore_case));
break;
case ParsedRegex::NegativeLookBehind:
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind)
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead),
push_inst(ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind,
push_lookaround<MatchDirection::Backward>(index, ignore_case));
break;
case ParsedRegex::LineStart:
push_inst(forward ? CompiledRegex::LineStart
: CompiledRegex::LineEnd);
push_inst(CompiledRegex::LineStart);
break;
case ParsedRegex::LineEnd:
push_inst(forward ? CompiledRegex::LineEnd
: CompiledRegex::LineStart);
push_inst(CompiledRegex::LineEnd);
break;
case ParsedRegex::WordBoundary:
push_inst(CompiledRegex::WordBoundary);
@ -797,12 +787,10 @@ private:
push_inst(CompiledRegex::NotWordBoundary);
break;
case ParsedRegex::SubjectBegin:
push_inst(forward ? CompiledRegex::SubjectBegin
: CompiledRegex::SubjectEnd);
push_inst(CompiledRegex::SubjectBegin);
break;
case ParsedRegex::SubjectEnd:
push_inst(forward ? CompiledRegex::SubjectEnd
: CompiledRegex::SubjectBegin);
push_inst(CompiledRegex::SubjectEnd);
break;
case ParsedRegex::ResetStart:
push_inst(CompiledRegex::Save, 0);
@ -1443,6 +1431,28 @@ auto test_regex = UnitTest{[]{
TestVM<MatchDirection::Backward> vm{R"($)"};
kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search | RegexExecFlags::NotEndOfLine));
kak_assert(StringView{vm.captures()[0]} == "\nqux");
kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0]} == "");
}
{
// Backward-direction search for the `^` assertion.
TestVM<MatchDirection::Backward> vm{R"(^)"};
// On a single-line subject, the search must fail when NotBeginOfLine is set...
kak_assert(not vm.exec("foo", RegexExecFlags::Search | RegexExecFlags::NotBeginOfLine));
// ...and succeed when that flag is absent.
kak_assert(vm.exec("foo", RegexExecFlags::Search));
// With an embedded newline, the text read from capture 0 onward must be "bar".
kak_assert(vm.exec("foo\nbar", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0]} == "bar");
}
{
// Backward-direction search for `\A\w+` on a three-line subject:
// the span delimited by captures 0 and 1 must be "foo".
TestVM<MatchDirection::Backward> vm{R"(\A\w+)"};
kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foo");
}
{
// Backward-direction search for `\b\w+\z` on a three-line subject:
// the span delimited by captures 0 and 1 must be "baz".
TestVM<MatchDirection::Backward> vm{R"(\b\w+\z)"};
kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "baz");
}
{

View File

@ -192,14 +192,6 @@ public:
if (flags & RegexExecFlags::NotInitialNull and begin == end)
return false;
constexpr bool forward = direction == MatchDirection::Forward;
if (not forward) // Flip line begin/end flags as we flipped the instructions on compilation.
flags = (RegexExecFlags)(flags & ~(RegexExecFlags::NotEndOfLine | RegexExecFlags::NotBeginOfLine)) |
((flags & RegexExecFlags::NotEndOfLine) ? RegexExecFlags::NotBeginOfLine : RegexExecFlags::None) |
((flags & RegexExecFlags::NotBeginOfLine) ? RegexExecFlags::NotEndOfLine : RegexExecFlags::None);
const bool search = (flags & RegexExecFlags::Search);
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
@ -210,12 +202,13 @@ public:
if (not search)
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
constexpr bool forward = direction == MatchDirection::Forward;
const ExecConfig config{
Sentinel{forward ? begin : end},
Sentinel{forward ? end : begin},
Sentinel{forward ? subject_begin : subject_end},
Sentinel{forward ? subject_end : subject_begin},
Sentinel{subject_begin},
Sentinel{subject_end},
flags,
instructions
};
@ -226,8 +219,7 @@ public:
Sentinel{subject_end}
}};
if (const auto& start_desc = direction == MatchDirection::Forward ?
m_program.forward_start_desc : m_program.backward_start_desc)
if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc)
{
if (search)
{
@ -525,7 +517,7 @@ private:
}
m_threads.swap_next();
next(pos);
(direction == MatchDirection::Forward) ? ++pos : --pos;
if (find_next_start and start_desc)
to_next_start(pos, config.end, *start_desc);
@ -536,10 +528,10 @@ private:
{
while (start != end)
{
const Codepoint cp = read(start);
const Codepoint cp = read_codepoint(start);
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
{
prev(start);
(direction == MatchDirection::Forward) ? --start : ++start;
return;
}
}
@ -550,12 +542,19 @@ private:
{
using Lookaround = CompiledRegex::Lookaround;
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
if (look_direction == MatchDirection::Backward)
{
if (pos == config.subject_begin)
return m_program.lookarounds[index] == Lookaround::EndOfLookaround;
--pos;
}
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
{
if (pos == end)
if (look_direction == MatchDirection::Forward and pos == config.subject_end)
return false;
Codepoint cp = (look_direction == MatchDirection::Forward ? codepoint(pos) : prev_codepoint(pos));
Codepoint cp = *pos;
if (ignore_case)
cp = to_lower(cp);
@ -582,7 +581,10 @@ private:
else if (static_cast<Codepoint>(op) != cp)
return false;
(look_direction == MatchDirection::Forward) ? next(pos) : prev(pos);
if (look_direction == MatchDirection::Backward and pos == config.subject_begin)
return *++it == Lookaround::EndOfLookaround;
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
}
return true;
}
@ -591,14 +593,14 @@ private:
{
if (pos == config.subject_begin)
return not (config.flags & RegexExecFlags::NotBeginOfLine);
return prev_codepoint(pos) == '\n';
return *(pos-1) == '\n';
}
static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
{
if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfLine);
return codepoint(pos) == '\n';
return *pos == '\n';
}
static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
@ -607,10 +609,10 @@ private:
return not (config.flags & RegexExecFlags::NotBeginOfWord);
if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfWord);
return is_word(prev_codepoint(pos)) != is_word(codepoint(pos));
return is_word(*(pos-1)) != is_word(*pos);
}
static Codepoint read(Utf8It& it)
static Codepoint read_codepoint(Utf8It& it)
{
if (direction == MatchDirection::Forward)
return it.read();
@ -618,10 +620,7 @@ private:
return *--it;
}
static constexpr Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
static constexpr Utf8It& next(Utf8It& it) { return (direction == MatchDirection::Forward) ? ++it : --it; }
static constexpr Utf8It& prev(Utf8It& it) { return (direction == MatchDirection::Forward) ? --it : ++it; }
static constexpr Codepoint prev_codepoint(Utf8It it) { return codepoint(prev(it)); }
static Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
const CompiledRegex& m_program;