Regex: introduce RegexExecFlags to control various behaviours
This commit is contained in:
parent
73b14b11be
commit
f007794d9c
26
src/regex.hh
26
src/regex.hh
|
@ -130,6 +130,28 @@ void check_captures(const Regex& re, const MatchResults<It>& res, const Vector<I
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Translate boost-style match flags into the RegexExecFlags understood by the
// custom regex implementation. Only the flags listed in the table below are
// carried over; any other bit set in `flags` is ignored.
inline RegexExecFlags convert_flags(RegexConstant::match_flag_type flags)
{
    struct FlagPair
    {
        RegexConstant::match_flag_type boost_flag;
        RegexExecFlags exec_flag;
    };

    const FlagPair table[] = {
        { RegexConstant::match_not_bol,          RegexExecFlags::NotBeginOfLine },
        { RegexConstant::match_not_eol,          RegexExecFlags::NotEndOfLine },
        { RegexConstant::match_not_bow,          RegexExecFlags::NotBeginOfWord },
        { RegexConstant::match_not_eow,          RegexExecFlags::NotEndOfWord },
        { RegexConstant::match_not_bob,          RegexExecFlags::NotBeginOfSubject },
        { RegexConstant::match_not_initial_null, RegexExecFlags::NotInitialNull },
        { RegexConstant::match_any,              RegexExecFlags::AnyMatch },
    };

    auto converted = RegexExecFlags::None;
    for (const auto& entry : table)
    {
        if (flags & entry.boost_flag)
            converted |= entry.exec_flag;
    }
    return converted;
}
|
||||||
|
|
||||||
template<typename It>
|
template<typename It>
|
||||||
bool regex_match(It begin, It end, const Regex& re)
|
bool regex_match(It begin, It end, const Regex& re)
|
||||||
{
|
{
|
||||||
|
@ -172,7 +194,7 @@ bool regex_search(It begin, It end, const Regex& re,
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
|
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
|
||||||
if (re.impl() and matched != regex_search(begin, end, re.impl()))
|
if (re.impl() and matched != regex_search(begin, end, re.impl(), convert_flags(flags)))
|
||||||
regex_mismatch(re);
|
regex_mismatch(re);
|
||||||
return matched;
|
return matched;
|
||||||
}
|
}
|
||||||
|
@ -190,7 +212,7 @@ bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
|
||||||
{
|
{
|
||||||
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
|
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
|
||||||
Vector<It> captures;
|
Vector<It> captures;
|
||||||
if (re.impl() and matched != regex_search(begin, end, captures, re.impl()))
|
if (re.impl() and matched != regex_search(begin, end, captures, re.impl(), convert_flags(flags)))
|
||||||
regex_mismatch(re);
|
regex_mismatch(re);
|
||||||
if (re.impl() and matched)
|
if (re.impl() and matched)
|
||||||
check_captures(re, res, captures);
|
check_captures(re, res, captures);
|
||||||
|
|
|
@ -821,7 +821,13 @@ auto test_regex = UnitTest{[]{
|
||||||
|
|
||||||
// Test convenience wrapper: run the VM on the given text, translating the
// boolean knobs into RegexExecFlags.
//   match   == false -> RegexExecFlags::Search is set
//   longest == false -> RegexExecFlags::AnyMatch is set
bool exec(StringView re, bool match = true, bool longest = false)
{
    const RegexExecFlags search_bit = match ? RegexExecFlags::None
                                            : RegexExecFlags::Search;
    const RegexExecFlags any_bit = longest ? RegexExecFlags::None
                                           : RegexExecFlags::AnyMatch;

    return ThreadedRegexVM::exec(re.begin(), re.end(), search_bit | any_bit);
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#include "utf8.hh"
|
#include "utf8.hh"
|
||||||
#include "utf8_iterator.hh"
|
#include "utf8_iterator.hh"
|
||||||
#include "vector.hh"
|
#include "vector.hh"
|
||||||
|
#include "flags.hh"
|
||||||
|
|
||||||
namespace Kakoune
|
namespace Kakoune
|
||||||
{
|
{
|
||||||
|
@ -46,6 +47,21 @@ struct CompiledRegex
|
||||||
|
|
||||||
CompiledRegex compile_regex(StringView re);
|
CompiledRegex compile_regex(StringView re);
|
||||||
|
|
||||||
|
enum class RegexExecFlags
|
||||||
|
{
|
||||||
|
None = 0,
|
||||||
|
Search = 1 << 0,
|
||||||
|
NotBeginOfLine = 1 << 1,
|
||||||
|
NotEndOfLine = 1 << 2,
|
||||||
|
NotBeginOfWord = 1 << 3,
|
||||||
|
NotEndOfWord = 1 << 4,
|
||||||
|
NotBeginOfSubject = 1 << 5,
|
||||||
|
NotInitialNull = 1 << 6,
|
||||||
|
AnyMatch = 1 << 7
|
||||||
|
};
|
||||||
|
|
||||||
|
constexpr bool with_bit_ops(Meta::Type<RegexExecFlags>) { return true; }
|
||||||
|
|
||||||
template<typename Iterator>
|
template<typename Iterator>
|
||||||
struct ThreadedRegexVM
|
struct ThreadedRegexVM
|
||||||
{
|
{
|
||||||
|
@ -133,7 +149,7 @@ struct ThreadedRegexVM
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::SubjectBegin:
|
case CompiledRegex::SubjectBegin:
|
||||||
if (m_pos != m_begin)
|
if (m_pos != m_begin or m_flags & RegexExecFlags::NotBeginOfSubject)
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::SubjectEnd:
|
case CompiledRegex::SubjectEnd:
|
||||||
|
@ -173,16 +189,20 @@ struct ThreadedRegexVM
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec(Iterator begin, Iterator end, bool match = true, bool longest = false)
|
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
|
||||||
{
|
{
|
||||||
bool found_match = false;
|
bool found_match = false;
|
||||||
m_threads.clear();
|
m_threads.clear();
|
||||||
const auto start_offset = (match ? CompiledRegex::search_prefix_size : 0);
|
const auto start_offset = (flags & RegexExecFlags::Search) ? 0 : CompiledRegex::search_prefix_size;
|
||||||
add_thread(0, m_program.bytecode.data() + start_offset,
|
add_thread(0, m_program.bytecode.data() + start_offset,
|
||||||
Vector<Iterator>(m_program.save_count, Iterator{}));
|
Vector<Iterator>(m_program.save_count, Iterator{}));
|
||||||
|
|
||||||
m_begin = begin;
|
m_begin = begin;
|
||||||
m_end = end;
|
m_end = end;
|
||||||
|
m_flags = flags;
|
||||||
|
|
||||||
|
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
|
||||||
|
return false;
|
||||||
|
|
||||||
for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
|
for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
|
||||||
{
|
{
|
||||||
|
@ -191,17 +211,19 @@ struct ThreadedRegexVM
|
||||||
const auto res = step(i);
|
const auto res = step(i);
|
||||||
if (res == StepResult::Matched)
|
if (res == StepResult::Matched)
|
||||||
{
|
{
|
||||||
if (match)
|
if (not (flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
|
||||||
|
(flags & RegexExecFlags::NotInitialNull and m_pos == m_begin))
|
||||||
{
|
{
|
||||||
m_threads.erase(m_threads.begin() + i);
|
m_threads.erase(m_threads.begin() + i);
|
||||||
continue; // We are not at end, this is not a full match
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_captures = std::move(m_threads[i].saves);
|
m_captures = std::move(m_threads[i].saves);
|
||||||
|
if (flags & RegexExecFlags::AnyMatch)
|
||||||
|
return true;
|
||||||
|
|
||||||
found_match = true;
|
found_match = true;
|
||||||
m_threads.resize(i); // remove this and lower priority threads
|
m_threads.resize(i); // remove this and lower priority threads
|
||||||
if (not longest)
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
else if (res == StepResult::Failed)
|
else if (res == StepResult::Failed)
|
||||||
m_threads.erase(m_threads.begin() + i);
|
m_threads.erase(m_threads.begin() + i);
|
||||||
|
@ -227,10 +249,11 @@ struct ThreadedRegexVM
|
||||||
if (step(i) == StepResult::Matched)
|
if (step(i) == StepResult::Matched)
|
||||||
{
|
{
|
||||||
m_captures = std::move(m_threads[i].saves);
|
m_captures = std::move(m_threads[i].saves);
|
||||||
|
if (flags & RegexExecFlags::AnyMatch)
|
||||||
|
return true;
|
||||||
|
|
||||||
found_match = true;
|
found_match = true;
|
||||||
m_threads.resize(i); // remove this and lower priority threads
|
m_threads.resize(i); // remove this and lower priority threads
|
||||||
if (not longest)
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return found_match;
|
return found_match;
|
||||||
|
@ -246,17 +269,20 @@ struct ThreadedRegexVM
|
||||||
|
|
||||||
bool is_line_start() const
|
bool is_line_start() const
|
||||||
{
|
{
|
||||||
return m_pos == m_begin or *(m_pos-1) == '\n';
|
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
|
||||||
|
*(m_pos-1) == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_line_end() const
|
bool is_line_end() const
|
||||||
{
|
{
|
||||||
return m_pos == m_end or *m_pos == '\n';
|
return (m_pos == m_end and not (m_flags & RegexExecFlags::NotEndOfLine)) or
|
||||||
|
*m_pos == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_word_boundary() const
|
bool is_word_boundary() const
|
||||||
{
|
{
|
||||||
return m_pos == m_begin or m_pos == m_end or
|
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfWord)) or
|
||||||
|
(m_pos == m_end and not (m_flags & RegexExecFlags::NotEndOfWord)) or
|
||||||
is_word(*(m_pos-1)) != is_word(*m_pos);
|
is_word(*(m_pos-1)) != is_word(*m_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -268,22 +294,24 @@ struct ThreadedRegexVM
|
||||||
Iterator m_begin;
|
Iterator m_begin;
|
||||||
Iterator m_end;
|
Iterator m_end;
|
||||||
Utf8It m_pos;
|
Utf8It m_pos;
|
||||||
|
RegexExecFlags m_flags;
|
||||||
|
|
||||||
Vector<Iterator> m_captures;
|
Vector<Iterator> m_captures;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename It>
|
template<typename It>
|
||||||
bool regex_match(It begin, It end, const CompiledRegex& re)
|
bool regex_match(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It> vm{re};
|
||||||
return vm.exec(begin, end, true, false);
|
return vm.exec(begin, end, (RegexExecFlags)(flags & ~(RegexExecFlags::Search)) | RegexExecFlags::AnyMatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename It>
|
template<typename It>
|
||||||
bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re)
|
bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
||||||
|
RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It> vm{re};
|
||||||
if (vm.exec(begin, end, true, true))
|
if (vm.exec(begin, end, flags & ~(RegexExecFlags::Search)))
|
||||||
{
|
{
|
||||||
captures = std::move(vm.m_captures);
|
captures = std::move(vm.m_captures);
|
||||||
return true;
|
return true;
|
||||||
|
@ -292,17 +320,19 @@ bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename It>
|
template<typename It>
|
||||||
bool regex_search(It begin, It end, const CompiledRegex& re)
|
bool regex_search(It begin, It end, const CompiledRegex& re,
|
||||||
|
RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It> vm{re};
|
||||||
return vm.exec(begin, end, false, false);
|
return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename It>
|
template<typename It>
|
||||||
bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re)
|
bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
||||||
|
RegexExecFlags flags = RegexExecFlags::None)
|
||||||
{
|
{
|
||||||
ThreadedRegexVM<It> vm{re};
|
ThreadedRegexVM<It> vm{re};
|
||||||
if (vm.exec(begin, end, false, true))
|
if (vm.exec(begin, end, flags | RegexExecFlags::Search))
|
||||||
{
|
{
|
||||||
captures = std::move(vm.m_captures);
|
captures = std::move(vm.m_captures);
|
||||||
return true;
|
return true;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user