Regex: take the full subject range as a parameter

To allow more general lookarounds that reach outside of the actual search
range, pass a second range (the actual subject). This allows us to remove
various flags such as PrevAvailable or NotBeginOfSubject, which are
now easy to check from the subject range.

Fixes #1902
This commit is contained in:
Maxime Coste 2018-03-05 05:48:10 +11:00
parent d9e44dfacf
commit fb65fa60f8
6 changed files with 109 additions and 96 deletions

View File

@ -363,13 +363,12 @@ private:
kak_assert(matches.size() % m_faces.size() == 0);
using RegexIt = RegexIterator<BufferIterator>;
RegexIt re_it{get_iterator(buffer, range.begin),
get_iterator(buffer, range.end), m_regex,
get_iterator(buffer, range.end),
buffer.begin(), buffer.end(), m_regex,
match_flags(is_bol(range.begin),
is_eol(buffer, range.end),
is_bow(buffer, range.begin),
is_eow(buffer, range.end),
range.begin == BufferCoord{0,0},
buffer.is_end(range.end))};
is_eow(buffer, range.end))};
RegexIt re_end;
for (; re_it != re_end; ++re_it)
{

View File

@ -1041,10 +1041,9 @@ void keep(Context& context, NormalParams params)
// give more intuitive behaviours in keep use cases.
const auto flags = match_flags(is_bol(begin.coord()), false,
is_bow(buffer, begin.coord()),
is_eow(buffer, end.coord()),
true, true) |
is_eow(buffer, end.coord())) |
RegexExecFlags::AnyMatch;
if (regex_search(begin, end, regex, flags) == matching)
if (regex_search(begin, end, begin, end, regex, flags) == matching)
keep.push_back(sel);
}
if (keep.empty())

View File

@ -100,21 +100,19 @@ private:
Vector<Iterator, MemoryDomain::Regex> m_values;
};
inline RegexExecFlags match_flags(bool bol, bool eol, bool bow, bool eow, bool bos, bool eos)
inline RegexExecFlags match_flags(bool bol, bool eol, bool bow, bool eow)
{
return (bol ? RegexExecFlags::None : RegexExecFlags::NotBeginOfLine) |
(eol ? RegexExecFlags::None : RegexExecFlags::NotEndOfLine) |
(bow ? RegexExecFlags::None : RegexExecFlags::NotBeginOfWord) |
(eow ? RegexExecFlags::None : RegexExecFlags::NotEndOfWord) |
(bos ? RegexExecFlags::None : RegexExecFlags::NotBeginOfSubject) |
(eos ? RegexExecFlags::None : RegexExecFlags::NotEndOfSubject);
(eow ? RegexExecFlags::None : RegexExecFlags::NotEndOfWord);
}
template<typename It>
bool regex_match(It begin, It end, const Regex& re)
{
ThreadedRegexVM<It, MatchDirection::Forward> vm{*re.impl()};
return vm.exec(begin, end, RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
return vm.exec(begin, end, begin, end, RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
}
template<typename It>
@ -122,7 +120,7 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
{
res.values().clear();
ThreadedRegexVM<It, MatchDirection::Forward> vm{*re.impl()};
if (vm.exec(begin, end, RegexExecFlags::None))
if (vm.exec(begin, end, begin, end, RegexExecFlags::None))
{
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(res.values()));
return true;
@ -131,20 +129,22 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
}
template<typename It>
bool regex_search(It begin, It end, const Regex& re,
bool regex_search(It begin, It end, It subject_begin, It subject_end, const Regex& re,
RegexExecFlags flags = RegexExecFlags::None)
{
ThreadedRegexVM<It, MatchDirection::Forward> vm{*re.impl()};
return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
return vm.exec(begin, end, subject_begin, subject_end,
flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
}
template<typename It, MatchDirection direction = MatchDirection::Forward>
bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
bool regex_search(It begin, It end, It subject_begin, It subject_end,
MatchResults<It>& res, const Regex& re,
RegexExecFlags flags = RegexExecFlags::None)
{
res.values().clear();
ThreadedRegexVM<It, direction> vm{*re.impl()};
if (vm.exec(begin, end, flags | RegexExecFlags::Search))
if (vm.exec(begin, end, subject_begin, subject_end, flags | RegexExecFlags::Search))
{
std::move(vm.captures().begin(), vm.captures().end(), std::back_inserter(res.values()));
return true;
@ -153,10 +153,11 @@ bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
}
template<typename It>
bool backward_regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
RegexExecFlags flags = RegexExecFlags::None)
bool backward_regex_search(It begin, It end, It subject_begin, It subject_end,
MatchResults<It>& res, const Regex& re,
RegexExecFlags flags = RegexExecFlags::None)
{
return regex_search<It, MatchDirection::Backward>(std::move(begin), std::move(end), res, re, flags);
return regex_search<It, MatchDirection::Backward>(begin, end, subject_begin, subject_end, res, re, flags);
}
String option_to_string(const Regex& re);
@ -168,14 +169,22 @@ struct RegexIterator
using ValueType = MatchResults<Iterator>;
RegexIterator() = default;
RegexIterator(Iterator begin, Iterator end, const Regex& re,
RegexIterator(Iterator begin, Iterator end,
Iterator subject_begin, Iterator subject_end,
const Regex& re,
RegexExecFlags flags = RegexExecFlags::None)
: m_regex{&re}, m_next_pos{direction == MatchDirection::Forward ? begin : end},
m_begin{begin}, m_end{end}, m_flags{flags}
m_begin{begin}, m_end{end},
m_subject_begin{subject_begin}, m_subject_end{subject_end},
m_flags{flags}
{
next();
}
RegexIterator(Iterator begin, Iterator end, const Regex& re,
RegexExecFlags flags = RegexExecFlags::None)
: RegexIterator{begin, end, begin, end, re, flags} {}
const ValueType& operator*() const { kak_assert(m_regex); return m_results; }
const ValueType* operator->() const { kak_assert(m_regex); return &m_results; }
@ -216,19 +225,16 @@ private:
if (direction == MatchDirection::Forward)
{
if (m_begin != m_next_pos)
additional_flags |= RegexExecFlags::NotBeginOfSubject | RegexExecFlags::PrevAvailable;
if (not regex_search(m_next_pos, m_end, m_results, *m_regex,
m_flags | additional_flags))
if (not regex_search(m_next_pos, m_end, m_subject_begin, m_subject_end,
m_results, *m_regex, m_flags | additional_flags))
m_regex = nullptr;
else
m_next_pos = m_results[0].second;
}
else
{
if (not backward_regex_search(m_begin, m_next_pos, m_results, *m_regex,
m_flags | additional_flags))
if (not backward_regex_search(m_begin, m_next_pos, m_subject_begin, m_subject_end,
m_results, *m_regex, m_flags | additional_flags))
m_regex = nullptr;
else
m_next_pos = m_results[0].first;
@ -240,6 +246,8 @@ private:
Iterator m_next_pos{};
const Iterator m_begin{};
const Iterator m_end{};
const Iterator m_subject_begin{};
const Iterator m_subject_end{};
const RegexExecFlags m_flags = RegexExecFlags::None;
};

View File

@ -1141,7 +1141,7 @@ struct TestVM : CompiledRegex, ThreadedRegexVM<const char*, dir>
bool exec(StringView re, RegexExecFlags flags = RegexExecFlags::AnyMatch)
{
return VMType::exec(re.begin(), re.end(), flags);
return VMType::exec(re.begin(), re.end(), re.begin(), re.end(), flags);
}
};
}

View File

@ -132,12 +132,9 @@ enum class RegexExecFlags
NotEndOfLine = 1 << 2,
NotBeginOfWord = 1 << 3,
NotEndOfWord = 1 << 4,
NotBeginOfSubject = 1 << 5,
NotEndOfSubject = 1 << 6,
NotInitialNull = 1 << 7,
AnyMatch = 1 << 8,
NoSaves = 1 << 9,
PrevAvailable = 1 << 10,
NotInitialNull = 1 << 5,
AnyMatch = 1 << 6,
NoSaves = 1 << 7,
};
constexpr bool with_bit_ops(Meta::Type<RegexExecFlags>) { return true; }
@ -167,18 +164,21 @@ public:
}
}
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
bool exec(Iterator begin, Iterator end,
Iterator subject_begin, Iterator subject_end,
RegexExecFlags flags)
{
if (flags & RegexExecFlags::NotInitialNull and begin == end)
return false;
constexpr bool forward = direction == MatchDirection::Forward;
const bool prev_avail = flags & RegexExecFlags::PrevAvailable;
m_begin = Utf8It{utf8::iterator<Iterator>{forward ? begin : end,
prev_avail ? begin-1 : begin, end}};
m_end = Utf8It{utf8::iterator<Iterator>{forward ? end : begin,
prev_avail ? begin-1 : begin, end}};
m_begin = EffectiveIt{Utf8It{forward ? begin : end, subject_begin, subject_end}};
m_end = EffectiveIt{Utf8It{forward ? end : begin, subject_begin, subject_end}};
m_subject_begin = EffectiveIt{Utf8It{forward ? subject_begin : subject_end, subject_begin, subject_end}};
m_subject_end = EffectiveIt{Utf8It{forward ? subject_end : subject_begin, subject_begin, subject_end}};
if (forward)
m_flags = flags;
else // Flip line begin/end flags as we flipped the instructions on compilation.
@ -187,7 +187,7 @@ public:
((flags & RegexExecFlags::NotBeginOfLine) ? RegexExecFlags::NotEndOfLine : RegexExecFlags::None);
const bool search = (flags & RegexExecFlags::Search);
Utf8It start{m_begin};
EffectiveIt start{m_begin};
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
: m_program.backward_start_desc;
if (start_desc)
@ -273,9 +273,9 @@ private:
Saves* saves;
};
using Utf8It = std::conditional_t<direction == MatchDirection::Forward,
utf8::iterator<Iterator>,
std::reverse_iterator<utf8::iterator<Iterator>>>;
using Utf8It = utf8::iterator<Iterator>;
using EffectiveIt = std::conditional_t<direction == MatchDirection::Forward,
Utf8It, std::reverse_iterator<Utf8It>>;
struct ExecState
{
@ -287,7 +287,7 @@ private:
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
// Steps a thread until it consumes the current character, matches or fails
StepResult step(Utf8It& pos, Thread& thread, ExecState& state)
StepResult step(EffectiveIt& pos, Thread& thread, ExecState& state)
{
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
auto* instructions = m_program.instructions.data();
@ -371,11 +371,11 @@ private:
return StepResult::Failed;
break;
case CompiledRegex::SubjectBegin:
if (pos != m_begin or (m_flags & RegexExecFlags::NotBeginOfSubject))
if (pos != m_subject_begin)
return StepResult::Failed;
break;
case CompiledRegex::SubjectEnd:
if (pos != m_end or (m_flags & RegexExecFlags::NotEndOfSubject))
if (pos != m_subject_end)
return StepResult::Failed;
break;
case CompiledRegex::LookAhead:
@ -414,7 +414,7 @@ private:
return StepResult::Failed;
}
bool exec_program(Utf8It pos, ConstArrayView<CompiledRegex::Instruction> instructions)
bool exec_program(EffectiveIt pos, ConstArrayView<CompiledRegex::Instruction> instructions)
{
ExecState state;
state.current_threads.push_back({instructions.begin(), nullptr});
@ -495,7 +495,7 @@ private:
}
}
void to_next_start(Utf8It& start, const Utf8It& end,
void to_next_start(EffectiveIt& start, const EffectiveIt& end,
const CompiledRegex::StartDesc& start_desc)
{
while (start != end and *start >= 0 and
@ -504,11 +504,12 @@ private:
}
template<MatchDirection look_direction, bool ignore_case>
bool lookaround(uint32_t index, Utf8It pos) const
bool lookaround(uint32_t index, EffectiveIt pos) const
{
const auto end = (look_direction == MatchDirection::Forward ? m_subject_end : m_subject_begin);
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
{
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
if (pos == end)
return false;
Codepoint cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1));
if (ignore_case)
@ -535,36 +536,38 @@ private:
return true;
}
bool is_line_start(const Utf8It& pos) const
bool is_line_start(const EffectiveIt& pos) const
{
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)
if (pos == m_subject_begin)
return not (m_flags & RegexExecFlags::NotBeginOfLine);
return *(pos-1) == '\n';
}
bool is_line_end(const Utf8It& pos) const
bool is_line_end(const EffectiveIt& pos) const
{
if (pos == m_end)
if (pos == m_subject_end)
return not (m_flags & RegexExecFlags::NotEndOfLine);
return *pos == '\n';
}
bool is_word_boundary(const Utf8It& pos) const
bool is_word_boundary(const EffectiveIt& pos) const
{
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)
if (pos == m_subject_begin)
return not (m_flags & RegexExecFlags::NotBeginOfWord);
if (pos == m_end)
if (pos == m_subject_end)
return not (m_flags & RegexExecFlags::NotEndOfWord);
return is_word(*(pos-1)) != is_word(*pos);
}
static const Iterator& get_base(const utf8::iterator<Iterator>& it) { return it.base(); }
static Iterator get_base(const std::reverse_iterator<utf8::iterator<Iterator>>& it) { return it.base().base(); }
static const Iterator& get_base(const Utf8It& it) { return it.base(); }
static Iterator get_base(const std::reverse_iterator<Utf8It>& it) { return it.base().base(); }
const CompiledRegex& m_program;
Utf8It m_begin;
Utf8It m_end;
EffectiveIt m_begin;
EffectiveIt m_end;
EffectiveIt m_subject_begin;
EffectiveIt m_subject_end;
RegexExecFlags m_flags;
Vector<Saves*, MemoryDomain::Regex> m_saves;

View File

@ -271,22 +271,24 @@ select_matching(const Context& context, const Selection& selection)
return {};
}
template<typename Iterator>
template<typename Iterator, typename Container>
Optional<std::pair<Iterator, Iterator>>
find_opening(const Iterator& begin, Iterator pos,
find_opening(Iterator pos, const Container& container,
const Regex& opening, const Regex& closing,
int level, bool nestable)
{
MatchResults<Iterator> res;
if (backward_regex_search(begin, pos, res, closing) and
if (backward_regex_search(container.begin(), pos,
container.begin(), container.end(), res, closing) and
res[0].second == pos)
pos = res[0].first;
for (auto match : RegexIterator<Iterator, MatchDirection::Backward>{begin, pos, opening})
using RegexIt = RegexIterator<Iterator, MatchDirection::Backward>;
for (auto match : RegexIt{container.begin(), pos, container.begin(), container.end(), opening})
{
if (nestable)
{
for (auto m : RegexIterator<Iterator, MatchDirection::Backward>{match[0].second, pos, closing})
for (auto m : RegexIt{match[0].second, pos, container.begin(), container.end(), closing})
++level;
}
@ -298,22 +300,23 @@ find_opening(const Iterator& begin, Iterator pos,
return {};
}
template<typename Iterator>
template<typename Iterator, typename Container>
Optional<std::pair<Iterator, Iterator>>
find_closing(Iterator pos, const Iterator& end,
find_closing(Iterator pos, const Container& container,
const Regex& opening, const Regex& closing,
int level, bool nestable)
{
MatchResults<Iterator> res;
if (regex_search(pos, end, res, opening) and
res[0].first == pos)
if (regex_search(pos, container.end(), container.begin(), container.end(),
res, opening) and res[0].first == pos)
pos = res[0].second;
for (auto match : RegexIterator<Iterator, MatchDirection::Forward>{pos, end, closing})
using RegexIt = RegexIterator<Iterator, MatchDirection::Forward>;
for (auto match : RegexIt{pos, container.end(), container.begin(), container.end(), closing})
{
if (nestable)
{
for (auto m : RegexIterator<Iterator, MatchDirection::Forward>{pos, match[0].first, opening})
for (auto m : RegexIt{pos, match[0].first, container.begin(), container.end(), opening})
++level;
}
@ -335,7 +338,8 @@ find_surrounding(const Container& container, Iterator pos,
// When on the token of a non-nestable block, consider it as an opening.
MatchResults<Iterator> matches;
if (not nestable and regex_search(pos, container.end(), matches, opening) and
if (not nestable and regex_search(pos, container.end(), container.begin(),
container.end(), matches, opening) and
matches[0].first == pos)
pos = matches[0].second;
@ -344,10 +348,11 @@ find_surrounding(const Container& container, Iterator pos,
{
// When positioned on an opening and searching to the opening, search the parent one
if (nestable and first != container.begin() and not (flags & ObjectFlags::ToEnd) and
regex_search(first, container.end(), matches, opening) and matches[0].first == first)
regex_search(first, container.end(), container.begin(), container.end(),
matches, opening) and matches[0].first == first)
first = utf8::previous(first, container.begin());
if (auto res = find_opening(container.begin(), first+1, opening, closing, level, nestable))
if (auto res = find_opening(first+1, container, opening, closing, level, nestable))
first = (flags & ObjectFlags::Inner) ? res->second : res->first;
else
return {};
@ -359,10 +364,11 @@ find_surrounding(const Container& container, Iterator pos,
// When positioned on a closing and searching to the closing, search the parent one
auto next = utf8::next(last, container.end());
if (nestable and next != container.end() and not (flags & ObjectFlags::ToBegin) and
backward_regex_search(container.begin(), next, matches, closing) and matches[0].second == next)
backward_regex_search(container.begin(), next, container.begin(), container.end(),
matches, closing) and matches[0].second == next)
last = next;
if (auto res = find_closing(last, container.end(), opening, closing, level, nestable))
if (auto res = find_closing(last, container, opening, closing, level, nestable))
last = (flags & ObjectFlags::Inner) ? utf8::previous(res->first, container.begin())
: utf8::previous(res->second, container.begin());
else
@ -835,12 +841,10 @@ void select_buffer(SelectionList& selections)
}
static RegexExecFlags
match_flags(const Buffer& buf, const BufferIterator& begin, const BufferIterator& end,
bool bos, bool eos)
match_flags(const Buffer& buf, const BufferIterator& begin, const BufferIterator& end)
{
return match_flags(is_bol(begin.coord()), is_eol(buf, end.coord()),
is_bow(buf, begin.coord()), is_eow(buf, end.coord()),
bos, eos);
is_bow(buf, begin.coord()), is_eow(buf, end.coord()));
}
static bool find_next(const Buffer& buffer, const BufferIterator& pos,
@ -848,12 +852,12 @@ static bool find_next(const Buffer& buffer, const BufferIterator& pos,
const Regex& ex, bool& wrapped)
{
if (pos != buffer.end() and
regex_search(pos, buffer.end(), matches, ex,
match_flags(buffer, pos, buffer.end(), pos.coord() == BufferCoord{0,0}, true)))
regex_search(pos, buffer.end(), buffer.begin(), buffer.end(),
matches, ex, match_flags(buffer, pos, buffer.end())))
return true;
wrapped = true;
return regex_search(buffer.begin(), buffer.end(), matches, ex,
match_flags(buffer, buffer.begin(), buffer.end(), true, true));
return regex_search(buffer.begin(), buffer.end(), buffer.begin(), buffer.end(),
matches, ex, match_flags(buffer, buffer.begin(), buffer.end()));
}
static bool find_prev(const Buffer& buffer, const BufferIterator& pos,
@ -861,13 +865,15 @@ static bool find_prev(const Buffer& buffer, const BufferIterator& pos,
const Regex& ex, bool& wrapped)
{
if (pos != buffer.begin() and
backward_regex_search(buffer.begin(), pos, matches, ex,
match_flags(buffer, buffer.begin(), pos, true, buffer.is_end(pos.coord())) |
backward_regex_search(buffer.begin(), pos, buffer.begin(), buffer.end(),
matches, ex,
match_flags(buffer, buffer.begin(), pos) |
RegexExecFlags::NotInitialNull))
return true;
wrapped = true;
return backward_regex_search(buffer.begin(), buffer.end(), matches, ex,
match_flags(buffer, buffer.begin(), buffer.end(), true, true) |
return backward_regex_search(buffer.begin(), buffer.end(), buffer.begin(), buffer.end(),
matches, ex,
match_flags(buffer, buffer.begin(), buffer.end()) |
RegexExecFlags::NotInitialNull);
}
@ -913,8 +919,7 @@ void select_all_matches(SelectionList& selections, const Regex& regex, int captu
{
auto sel_beg = buffer.iterator_at(sel.min());
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
RegexIt re_it(sel_beg, sel_end, regex,
match_flags(buffer, sel_beg, sel_end, true, true));
RegexIt re_it(sel_beg, sel_end, regex, match_flags(buffer, sel_beg, sel_end));
RegexIt re_end;
for (; re_it != re_end; ++re_it)
@ -958,8 +963,7 @@ void split_selections(SelectionList& selections, const Regex& regex, int capture
auto begin = buffer.iterator_at(sel.min());
auto sel_end = utf8::next(buffer.iterator_at(sel.max()), buffer.end());
RegexIt re_it(begin, sel_end, regex,
match_flags(buffer, begin, sel_end, true, true));
RegexIt re_it(begin, sel_end, regex, match_flags(buffer, begin, sel_end));
RegexIt re_end;
for (; re_it != re_end; ++re_it)