Remove most regex impl special casing for backwards matching

This commit is contained in:
Maxime Coste 2018-11-03 13:52:40 +11:00
parent ee74c2c2df
commit 4ac7df3842
2 changed files with 60 additions and 51 deletions

View File

@ -755,40 +755,30 @@ private:
break; break;
} }
case ParsedRegex::LookAhead: case ParsedRegex::LookAhead:
push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase push_inst(ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead) : CompiledRegex::LookAhead,
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind),
push_lookaround<MatchDirection::Forward>(index, ignore_case)); push_lookaround<MatchDirection::Forward>(index, ignore_case));
break; break;
case ParsedRegex::NegativeLookAhead: case ParsedRegex::NegativeLookAhead:
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase push_inst(ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead) : CompiledRegex::NegativeLookAhead,
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind),
push_lookaround<MatchDirection::Forward>(index, ignore_case)); push_lookaround<MatchDirection::Forward>(index, ignore_case));
break; break;
case ParsedRegex::LookBehind: case ParsedRegex::LookBehind:
push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase push_inst(ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind) : CompiledRegex::LookBehind,
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead),
push_lookaround<MatchDirection::Backward>(index, ignore_case)); push_lookaround<MatchDirection::Backward>(index, ignore_case));
break; break;
case ParsedRegex::NegativeLookBehind: case ParsedRegex::NegativeLookBehind:
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase push_inst(ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind) : CompiledRegex::NegativeLookBehind,
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead),
push_lookaround<MatchDirection::Backward>(index, ignore_case)); push_lookaround<MatchDirection::Backward>(index, ignore_case));
break; break;
case ParsedRegex::LineStart: case ParsedRegex::LineStart:
push_inst(forward ? CompiledRegex::LineStart push_inst(CompiledRegex::LineStart);
: CompiledRegex::LineEnd);
break; break;
case ParsedRegex::LineEnd: case ParsedRegex::LineEnd:
push_inst(forward ? CompiledRegex::LineEnd push_inst(CompiledRegex::LineEnd);
: CompiledRegex::LineStart);
break; break;
case ParsedRegex::WordBoundary: case ParsedRegex::WordBoundary:
push_inst(CompiledRegex::WordBoundary); push_inst(CompiledRegex::WordBoundary);
@ -797,12 +787,10 @@ private:
push_inst(CompiledRegex::NotWordBoundary); push_inst(CompiledRegex::NotWordBoundary);
break; break;
case ParsedRegex::SubjectBegin: case ParsedRegex::SubjectBegin:
push_inst(forward ? CompiledRegex::SubjectBegin push_inst(CompiledRegex::SubjectBegin);
: CompiledRegex::SubjectEnd);
break; break;
case ParsedRegex::SubjectEnd: case ParsedRegex::SubjectEnd:
push_inst(forward ? CompiledRegex::SubjectEnd push_inst(CompiledRegex::SubjectEnd);
: CompiledRegex::SubjectBegin);
break; break;
case ParsedRegex::ResetStart: case ParsedRegex::ResetStart:
push_inst(CompiledRegex::Save, 0); push_inst(CompiledRegex::Save, 0);
@ -1443,6 +1431,28 @@ auto test_regex = UnitTest{[]{
TestVM<MatchDirection::Backward> vm{R"($)"}; TestVM<MatchDirection::Backward> vm{R"($)"};
kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search | RegexExecFlags::NotEndOfLine)); kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search | RegexExecFlags::NotEndOfLine));
kak_assert(StringView{vm.captures()[0]} == "\nqux"); kak_assert(StringView{vm.captures()[0]} == "\nqux");
kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0]} == "");
}
{
TestVM<MatchDirection::Backward> vm{R"(^)"};
kak_assert(not vm.exec("foo", RegexExecFlags::Search | RegexExecFlags::NotBeginOfLine));
kak_assert(vm.exec("foo", RegexExecFlags::Search));
kak_assert(vm.exec("foo\nbar", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0]} == "bar");
}
{
TestVM<MatchDirection::Backward> vm{R"(\A\w+)"};
kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foo");
}
{
TestVM<MatchDirection::Backward> vm{R"(\b\w+\z)"};
kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search));
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "baz");
} }
{ {

View File

@ -192,14 +192,6 @@ public:
if (flags & RegexExecFlags::NotInitialNull and begin == end) if (flags & RegexExecFlags::NotInitialNull and begin == end)
return false; return false;
constexpr bool forward = direction == MatchDirection::Forward;
if (not forward) // Flip line begin/end flags as we flipped the instructions on compilation.
flags = (RegexExecFlags)(flags & ~(RegexExecFlags::NotEndOfLine | RegexExecFlags::NotBeginOfLine)) |
((flags & RegexExecFlags::NotEndOfLine) ? RegexExecFlags::NotBeginOfLine : RegexExecFlags::None) |
((flags & RegexExecFlags::NotBeginOfLine) ? RegexExecFlags::NotEndOfLine : RegexExecFlags::None);
const bool search = (flags & RegexExecFlags::Search); const bool search = (flags & RegexExecFlags::Search);
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions}; ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
@ -210,12 +202,13 @@ public:
if (not search) if (not search)
instructions = instructions.subrange(CompiledRegex::search_prefix_size); instructions = instructions.subrange(CompiledRegex::search_prefix_size);
constexpr bool forward = direction == MatchDirection::Forward;
const ExecConfig config{ const ExecConfig config{
Sentinel{forward ? begin : end}, Sentinel{forward ? begin : end},
Sentinel{forward ? end : begin}, Sentinel{forward ? end : begin},
Sentinel{forward ? subject_begin : subject_end}, Sentinel{subject_begin},
Sentinel{forward ? subject_end : subject_begin}, Sentinel{subject_end},
flags, flags,
instructions instructions
}; };
@ -226,8 +219,7 @@ public:
Sentinel{subject_end} Sentinel{subject_end}
}}; }};
if (const auto& start_desc = direction == MatchDirection::Forward ? if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc)
m_program.forward_start_desc : m_program.backward_start_desc)
{ {
if (search) if (search)
{ {
@ -525,7 +517,7 @@ private:
} }
m_threads.swap_next(); m_threads.swap_next();
next(pos); (direction == MatchDirection::Forward) ? ++pos : --pos;
if (find_next_start and start_desc) if (find_next_start and start_desc)
to_next_start(pos, config.end, *start_desc); to_next_start(pos, config.end, *start_desc);
@ -536,10 +528,10 @@ private:
{ {
while (start != end) while (start != end)
{ {
const Codepoint cp = read(start); const Codepoint cp = read_codepoint(start);
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other]) if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
{ {
prev(start); (direction == MatchDirection::Forward) ? --start : ++start;
return; return;
} }
} }
@ -550,12 +542,19 @@ private:
{ {
using Lookaround = CompiledRegex::Lookaround; using Lookaround = CompiledRegex::Lookaround;
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin); if (look_direction == MatchDirection::Backward)
{
if (pos == config.subject_begin)
return m_program.lookarounds[index] == Lookaround::EndOfLookaround;
--pos;
}
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it) for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
{ {
if (pos == end) if (look_direction == MatchDirection::Forward and pos == config.subject_end)
return false; return false;
Codepoint cp = (look_direction == MatchDirection::Forward ? codepoint(pos) : prev_codepoint(pos));
Codepoint cp = *pos;
if (ignore_case) if (ignore_case)
cp = to_lower(cp); cp = to_lower(cp);
@ -582,7 +581,10 @@ private:
else if (static_cast<Codepoint>(op) != cp) else if (static_cast<Codepoint>(op) != cp)
return false; return false;
(look_direction == MatchDirection::Forward) ? next(pos) : prev(pos); if (look_direction == MatchDirection::Backward and pos == config.subject_begin)
return *++it == Lookaround::EndOfLookaround;
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
} }
return true; return true;
} }
@ -591,14 +593,14 @@ private:
{ {
if (pos == config.subject_begin) if (pos == config.subject_begin)
return not (config.flags & RegexExecFlags::NotBeginOfLine); return not (config.flags & RegexExecFlags::NotBeginOfLine);
return prev_codepoint(pos) == '\n'; return *(pos-1) == '\n';
} }
static bool is_line_end(const Utf8It& pos, const ExecConfig& config) static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
{ {
if (pos == config.subject_end) if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfLine); return not (config.flags & RegexExecFlags::NotEndOfLine);
return codepoint(pos) == '\n'; return *pos == '\n';
} }
static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config) static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
@ -607,10 +609,10 @@ private:
return not (config.flags & RegexExecFlags::NotBeginOfWord); return not (config.flags & RegexExecFlags::NotBeginOfWord);
if (pos == config.subject_end) if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfWord); return not (config.flags & RegexExecFlags::NotEndOfWord);
return is_word(prev_codepoint(pos)) != is_word(codepoint(pos)); return is_word(*(pos-1)) != is_word(*pos);
} }
static Codepoint read(Utf8It& it) static Codepoint read_codepoint(Utf8It& it)
{ {
if (direction == MatchDirection::Forward) if (direction == MatchDirection::Forward)
return it.read(); return it.read();
@ -618,10 +620,7 @@ private:
return *--it; return *--it;
} }
static constexpr Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); } static Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
static constexpr Utf8It& next(Utf8It& it) { return (direction == MatchDirection::Forward) ? ++it : --it; }
static constexpr Utf8It& prev(Utf8It& it) { return (direction == MatchDirection::Forward) ? --it : ++it; }
static constexpr Codepoint prev_codepoint(Utf8It it) { return codepoint(prev(it)); }
const CompiledRegex& m_program; const CompiledRegex& m_program;