Remove most regex impl special casing for backwards matching
This commit is contained in:
parent
ee74c2c2df
commit
4ac7df3842
|
@ -755,40 +755,30 @@ private:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ParsedRegex::LookAhead:
|
case ParsedRegex::LookAhead:
|
||||||
push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
push_inst(ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
||||||
: CompiledRegex::LookAhead)
|
: CompiledRegex::LookAhead,
|
||||||
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
|
||||||
: CompiledRegex::LookBehind),
|
|
||||||
push_lookaround<MatchDirection::Forward>(index, ignore_case));
|
push_lookaround<MatchDirection::Forward>(index, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookAhead:
|
case ParsedRegex::NegativeLookAhead:
|
||||||
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
push_inst(ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
||||||
: CompiledRegex::NegativeLookAhead)
|
: CompiledRegex::NegativeLookAhead,
|
||||||
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
|
||||||
: CompiledRegex::NegativeLookBehind),
|
|
||||||
push_lookaround<MatchDirection::Forward>(index, ignore_case));
|
push_lookaround<MatchDirection::Forward>(index, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LookBehind:
|
case ParsedRegex::LookBehind:
|
||||||
push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
push_inst(ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
||||||
: CompiledRegex::LookBehind)
|
: CompiledRegex::LookBehind,
|
||||||
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
|
||||||
: CompiledRegex::LookAhead),
|
|
||||||
push_lookaround<MatchDirection::Backward>(index, ignore_case));
|
push_lookaround<MatchDirection::Backward>(index, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookBehind:
|
case ParsedRegex::NegativeLookBehind:
|
||||||
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
push_inst(ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
||||||
: CompiledRegex::NegativeLookBehind)
|
: CompiledRegex::NegativeLookBehind,
|
||||||
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
|
||||||
: CompiledRegex::NegativeLookAhead),
|
|
||||||
push_lookaround<MatchDirection::Backward>(index, ignore_case));
|
push_lookaround<MatchDirection::Backward>(index, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineStart:
|
case ParsedRegex::LineStart:
|
||||||
push_inst(forward ? CompiledRegex::LineStart
|
push_inst(CompiledRegex::LineStart);
|
||||||
: CompiledRegex::LineEnd);
|
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineEnd:
|
case ParsedRegex::LineEnd:
|
||||||
push_inst(forward ? CompiledRegex::LineEnd
|
push_inst(CompiledRegex::LineEnd);
|
||||||
: CompiledRegex::LineStart);
|
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::WordBoundary:
|
case ParsedRegex::WordBoundary:
|
||||||
push_inst(CompiledRegex::WordBoundary);
|
push_inst(CompiledRegex::WordBoundary);
|
||||||
|
@ -797,12 +787,10 @@ private:
|
||||||
push_inst(CompiledRegex::NotWordBoundary);
|
push_inst(CompiledRegex::NotWordBoundary);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectBegin:
|
case ParsedRegex::SubjectBegin:
|
||||||
push_inst(forward ? CompiledRegex::SubjectBegin
|
push_inst(CompiledRegex::SubjectBegin);
|
||||||
: CompiledRegex::SubjectEnd);
|
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectEnd:
|
case ParsedRegex::SubjectEnd:
|
||||||
push_inst(forward ? CompiledRegex::SubjectEnd
|
push_inst(CompiledRegex::SubjectEnd);
|
||||||
: CompiledRegex::SubjectBegin);
|
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::ResetStart:
|
case ParsedRegex::ResetStart:
|
||||||
push_inst(CompiledRegex::Save, 0);
|
push_inst(CompiledRegex::Save, 0);
|
||||||
|
@ -1443,6 +1431,28 @@ auto test_regex = UnitTest{[]{
|
||||||
TestVM<MatchDirection::Backward> vm{R"($)"};
|
TestVM<MatchDirection::Backward> vm{R"($)"};
|
||||||
kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search | RegexExecFlags::NotEndOfLine));
|
kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search | RegexExecFlags::NotEndOfLine));
|
||||||
kak_assert(StringView{vm.captures()[0]} == "\nqux");
|
kak_assert(StringView{vm.captures()[0]} == "\nqux");
|
||||||
|
kak_assert(vm.exec("foo\nbar\nbaz\nqux", RegexExecFlags::Search));
|
||||||
|
kak_assert(StringView{vm.captures()[0]} == "");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<MatchDirection::Backward> vm{R"(^)"};
|
||||||
|
kak_assert(not vm.exec("foo", RegexExecFlags::Search | RegexExecFlags::NotBeginOfLine));
|
||||||
|
kak_assert(vm.exec("foo", RegexExecFlags::Search));
|
||||||
|
kak_assert(vm.exec("foo\nbar", RegexExecFlags::Search));
|
||||||
|
kak_assert(StringView{vm.captures()[0]} == "bar");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<MatchDirection::Backward> vm{R"(\A\w+)"};
|
||||||
|
kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search));
|
||||||
|
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "foo");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<MatchDirection::Backward> vm{R"(\b\w+\z)"};
|
||||||
|
kak_assert(vm.exec("foo\nbar\nbaz", RegexExecFlags::Search));
|
||||||
|
kak_assert(StringView{vm.captures()[0], vm.captures()[1]} == "baz");
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|
|
@ -192,14 +192,6 @@ public:
|
||||||
if (flags & RegexExecFlags::NotInitialNull and begin == end)
|
if (flags & RegexExecFlags::NotInitialNull and begin == end)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
constexpr bool forward = direction == MatchDirection::Forward;
|
|
||||||
|
|
||||||
|
|
||||||
if (not forward) // Flip line begin/end flags as we flipped the instructions on compilation.
|
|
||||||
flags = (RegexExecFlags)(flags & ~(RegexExecFlags::NotEndOfLine | RegexExecFlags::NotBeginOfLine)) |
|
|
||||||
((flags & RegexExecFlags::NotEndOfLine) ? RegexExecFlags::NotBeginOfLine : RegexExecFlags::None) |
|
|
||||||
((flags & RegexExecFlags::NotBeginOfLine) ? RegexExecFlags::NotEndOfLine : RegexExecFlags::None);
|
|
||||||
|
|
||||||
const bool search = (flags & RegexExecFlags::Search);
|
const bool search = (flags & RegexExecFlags::Search);
|
||||||
|
|
||||||
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
|
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
|
||||||
|
@ -210,12 +202,13 @@ public:
|
||||||
if (not search)
|
if (not search)
|
||||||
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
|
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
|
||||||
|
|
||||||
|
constexpr bool forward = direction == MatchDirection::Forward;
|
||||||
|
|
||||||
const ExecConfig config{
|
const ExecConfig config{
|
||||||
Sentinel{forward ? begin : end},
|
Sentinel{forward ? begin : end},
|
||||||
Sentinel{forward ? end : begin},
|
Sentinel{forward ? end : begin},
|
||||||
Sentinel{forward ? subject_begin : subject_end},
|
Sentinel{subject_begin},
|
||||||
Sentinel{forward ? subject_end : subject_begin},
|
Sentinel{subject_end},
|
||||||
flags,
|
flags,
|
||||||
instructions
|
instructions
|
||||||
};
|
};
|
||||||
|
@ -226,8 +219,7 @@ public:
|
||||||
Sentinel{subject_end}
|
Sentinel{subject_end}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
if (const auto& start_desc = direction == MatchDirection::Forward ?
|
if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc)
|
||||||
m_program.forward_start_desc : m_program.backward_start_desc)
|
|
||||||
{
|
{
|
||||||
if (search)
|
if (search)
|
||||||
{
|
{
|
||||||
|
@ -525,7 +517,7 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
m_threads.swap_next();
|
m_threads.swap_next();
|
||||||
next(pos);
|
(direction == MatchDirection::Forward) ? ++pos : --pos;
|
||||||
|
|
||||||
if (find_next_start and start_desc)
|
if (find_next_start and start_desc)
|
||||||
to_next_start(pos, config.end, *start_desc);
|
to_next_start(pos, config.end, *start_desc);
|
||||||
|
@ -536,10 +528,10 @@ private:
|
||||||
{
|
{
|
||||||
while (start != end)
|
while (start != end)
|
||||||
{
|
{
|
||||||
const Codepoint cp = read(start);
|
const Codepoint cp = read_codepoint(start);
|
||||||
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
|
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
|
||||||
{
|
{
|
||||||
prev(start);
|
(direction == MatchDirection::Forward) ? --start : ++start;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -550,12 +542,19 @@ private:
|
||||||
{
|
{
|
||||||
using Lookaround = CompiledRegex::Lookaround;
|
using Lookaround = CompiledRegex::Lookaround;
|
||||||
|
|
||||||
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
|
if (look_direction == MatchDirection::Backward)
|
||||||
|
{
|
||||||
|
if (pos == config.subject_begin)
|
||||||
|
return m_program.lookarounds[index] == Lookaround::EndOfLookaround;
|
||||||
|
--pos;
|
||||||
|
}
|
||||||
|
|
||||||
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
|
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
|
||||||
{
|
{
|
||||||
if (pos == end)
|
if (look_direction == MatchDirection::Forward and pos == config.subject_end)
|
||||||
return false;
|
return false;
|
||||||
Codepoint cp = (look_direction == MatchDirection::Forward ? codepoint(pos) : prev_codepoint(pos));
|
|
||||||
|
Codepoint cp = *pos;
|
||||||
if (ignore_case)
|
if (ignore_case)
|
||||||
cp = to_lower(cp);
|
cp = to_lower(cp);
|
||||||
|
|
||||||
|
@ -582,7 +581,10 @@ private:
|
||||||
else if (static_cast<Codepoint>(op) != cp)
|
else if (static_cast<Codepoint>(op) != cp)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
(look_direction == MatchDirection::Forward) ? next(pos) : prev(pos);
|
if (look_direction == MatchDirection::Backward and pos == config.subject_begin)
|
||||||
|
return *++it == Lookaround::EndOfLookaround;
|
||||||
|
|
||||||
|
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -591,14 +593,14 @@ private:
|
||||||
{
|
{
|
||||||
if (pos == config.subject_begin)
|
if (pos == config.subject_begin)
|
||||||
return not (config.flags & RegexExecFlags::NotBeginOfLine);
|
return not (config.flags & RegexExecFlags::NotBeginOfLine);
|
||||||
return prev_codepoint(pos) == '\n';
|
return *(pos-1) == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
|
static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
|
||||||
{
|
{
|
||||||
if (pos == config.subject_end)
|
if (pos == config.subject_end)
|
||||||
return not (config.flags & RegexExecFlags::NotEndOfLine);
|
return not (config.flags & RegexExecFlags::NotEndOfLine);
|
||||||
return codepoint(pos) == '\n';
|
return *pos == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
|
static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
|
||||||
|
@ -607,10 +609,10 @@ private:
|
||||||
return not (config.flags & RegexExecFlags::NotBeginOfWord);
|
return not (config.flags & RegexExecFlags::NotBeginOfWord);
|
||||||
if (pos == config.subject_end)
|
if (pos == config.subject_end)
|
||||||
return not (config.flags & RegexExecFlags::NotEndOfWord);
|
return not (config.flags & RegexExecFlags::NotEndOfWord);
|
||||||
return is_word(prev_codepoint(pos)) != is_word(codepoint(pos));
|
return is_word(*(pos-1)) != is_word(*pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Codepoint read(Utf8It& it)
|
static Codepoint read_codepoint(Utf8It& it)
|
||||||
{
|
{
|
||||||
if (direction == MatchDirection::Forward)
|
if (direction == MatchDirection::Forward)
|
||||||
return it.read();
|
return it.read();
|
||||||
|
@ -618,10 +620,7 @@ private:
|
||||||
return *--it;
|
return *--it;
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
|
static Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
|
||||||
static constexpr Utf8It& next(Utf8It& it) { return (direction == MatchDirection::Forward) ? ++it : --it; }
|
|
||||||
static constexpr Utf8It& prev(Utf8It& it) { return (direction == MatchDirection::Forward) ? --it : ++it; }
|
|
||||||
static constexpr Codepoint prev_codepoint(Utf8It it) { return codepoint(prev(it)); }
|
|
||||||
|
|
||||||
const CompiledRegex& m_program;
|
const CompiledRegex& m_program;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user