2017-09-17 10:50:53 +02:00
|
|
|
#ifndef regex_impl_hh_INCLUDED
|
|
|
|
#define regex_impl_hh_INCLUDED
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
#include "exception.hh"
|
2017-10-07 04:43:21 +02:00
|
|
|
#include "flags.hh"
|
|
|
|
#include "ref_ptr.hh"
|
2017-10-02 08:59:04 +02:00
|
|
|
#include "unicode.hh"
|
|
|
|
#include "utf8.hh"
|
|
|
|
#include "utf8_iterator.hh"
|
|
|
|
#include "vector.hh"
|
|
|
|
|
2017-09-26 09:44:30 +02:00
|
|
|
namespace Kakoune
|
|
|
|
{
|
|
|
|
|
2017-10-09 08:04:14 +02:00
|
|
|
struct regex_error : runtime_error
|
|
|
|
{
|
|
|
|
using runtime_error::runtime_error;
|
|
|
|
};
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
// Direction in which a compiled program walks its subject. ThreadedRegexVM is
// templated on this value and asserts that it equals CompiledRegex::direction.
enum class MatchDirection
{
    Forward,  // ThreadedRegexVM::exec starts at `begin` and advances towards `end`
    Backward  // ThreadedRegexVM::exec starts at `end` and walks back towards `begin`
};
|
|
|
|
|
2017-10-06 07:40:27 +02:00
|
|
|
struct CompiledRegex : RefCountable
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
|
|
|
enum Op : char
|
|
|
|
{
|
|
|
|
Match,
|
|
|
|
Literal,
|
|
|
|
LiteralIgnoreCase,
|
|
|
|
AnyChar,
|
|
|
|
Matcher,
|
|
|
|
Jump,
|
|
|
|
Split_PrioritizeParent,
|
|
|
|
Split_PrioritizeChild,
|
|
|
|
Save,
|
|
|
|
LineStart,
|
|
|
|
LineEnd,
|
|
|
|
WordBoundary,
|
|
|
|
NotWordBoundary,
|
|
|
|
SubjectBegin,
|
|
|
|
SubjectEnd,
|
|
|
|
LookAhead,
|
|
|
|
NegativeLookAhead,
|
2017-10-04 17:00:19 +02:00
|
|
|
LookBehind,
|
2017-10-02 08:59:04 +02:00
|
|
|
NegativeLookBehind,
|
|
|
|
};
|
|
|
|
|
2017-10-07 12:51:32 +02:00
|
|
|
struct Instruction
|
|
|
|
{
|
|
|
|
Op op;
|
2017-10-07 13:08:14 +02:00
|
|
|
mutable bool processed;
|
2017-10-07 13:58:10 +02:00
|
|
|
mutable bool scheduled;
|
2017-10-07 12:51:32 +02:00
|
|
|
uint32_t param;
|
|
|
|
};
|
2017-10-07 13:08:14 +02:00
|
|
|
static_assert(sizeof(Instruction) == 8, "");
|
2017-10-07 12:51:32 +02:00
|
|
|
|
|
|
|
explicit operator bool() const { return not instructions.empty(); }
|
2017-10-02 08:59:04 +02:00
|
|
|
|
2017-10-07 12:51:32 +02:00
|
|
|
Vector<Instruction> instructions;
|
2017-10-02 08:59:04 +02:00
|
|
|
Vector<std::function<bool (Codepoint)>> matchers;
|
2017-10-07 12:51:32 +02:00
|
|
|
Vector<Codepoint> lookarounds;
|
2017-10-07 06:46:27 +02:00
|
|
|
MatchDirection direction;
|
2017-10-02 08:59:04 +02:00
|
|
|
size_t save_count;
|
2017-10-06 07:40:27 +02:00
|
|
|
|
2017-10-09 12:19:36 +02:00
|
|
|
struct StartChars
|
|
|
|
{
|
|
|
|
static constexpr size_t count = 256;
|
|
|
|
bool map[count];
|
|
|
|
};
|
2017-10-06 07:40:27 +02:00
|
|
|
std::unique_ptr<StartChars> start_chars;
|
2017-10-02 08:59:04 +02:00
|
|
|
};
|
|
|
|
|
2017-10-09 08:04:14 +02:00
|
|
|
// Bit flags accepted by compile_regex.
// NOTE(review): what NoSubs and Optimize do is decided inside compile_regex,
// which is not defined in this header — confirm their semantics there.
enum RegexCompileFlags
{
    None     = 0,
    NoSubs   = 1 << 0,
    Optimize = 1 << 1
};
|
|
|
|
// Overload that answers true for RegexCompileFlags.
// Presumably this is the hook flags.hh looks up to enable the bitwise
// operators on the enum — confirm against that header.
constexpr bool with_bit_ops(Meta::Type<RegexCompileFlags>)
{
    return true;
}
|
|
|
|
|
|
|
|
// Compiles the regex source `re` into a CompiledRegex (defined elsewhere).
// `direction` defaults to forward matching.
CompiledRegex compile_regex(StringView re,
                            RegexCompileFlags flags,
                            MatchDirection direction = MatchDirection::Forward);
|
2017-10-02 08:59:04 +02:00
|
|
|
|
2017-10-02 16:34:57 +02:00
|
|
|
// Bit flags controlling a single ThreadedRegexVM::exec call.
enum class RegexExecFlags
{
    None              = 0,
    Search            = 1 << 0, // try successive start positions; matches need not reach the end
    NotBeginOfLine    = 1 << 1, // the start of the subject does not count as a line start
    NotEndOfLine      = 1 << 2, // the end of the subject does not count as a line end
    NotBeginOfWord    = 1 << 3, // the start of the subject does not count as a word boundary
    NotEndOfWord      = 1 << 4, // the end of the subject does not count as a word boundary
    NotBeginOfSubject = 1 << 5, // the SubjectBegin assertion always fails
    NotInitialNull    = 1 << 6, // reject matches reported at the very start of the subject
    AnyMatch          = 1 << 7, // return as soon as any thread matches
    NoSaves           = 1 << 8, // run without allocating capture storage
    PrevAvailable     = 1 << 9, // the position just before `begin` may be dereferenced
};
|
|
|
|
|
|
|
|
// Overload that answers true for RegexExecFlags.
// Presumably this is the hook flags.hh looks up to enable the bitwise
// operators on the enum — confirm against that header.
constexpr bool with_bit_ops(Meta::Type<RegexExecFlags>)
{
    return true;
}
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
template<typename Iterator, MatchDirection direction>
|
2017-10-06 13:30:46 +02:00
|
|
|
class ThreadedRegexVM
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-06 13:30:46 +02:00
|
|
|
public:
|
2017-10-02 08:59:04 +02:00
|
|
|
ThreadedRegexVM(const CompiledRegex& program)
|
2017-10-07 06:46:27 +02:00
|
|
|
: m_program{program}
|
2017-10-09 15:56:48 +02:00
|
|
|
{
|
|
|
|
kak_assert(m_program and direction == m_program.direction);
|
|
|
|
}
|
2017-10-02 08:59:04 +02:00
|
|
|
|
2017-10-04 14:11:15 +02:00
|
|
|
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
|
2017-10-06 13:30:46 +02:00
|
|
|
ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete;
|
2017-10-04 14:11:15 +02:00
|
|
|
|
2017-10-04 05:14:24 +02:00
|
|
|
// Destroys and frees every Saves buffer recorded in m_saves.
~ThreadedRegexVM()
{
    for (auto* saves : m_saves)
    {
        // Slot 0 is a real member of Saves and is destroyed by ~Saves() below;
        // only the trailing slots need an explicit destructor call.
        // The index counts down from save_count (not save_count-1) so that the
        // unsigned value cannot wrap around when save_count is 0.
        for (size_t i = m_program.save_count; i > 1; --i)
            saves->pos[i-1].~Iterator();
        saves->~Saves();
        ::operator delete(saves);
    }
}
|
|
|
|
|
2017-10-06 13:30:46 +02:00
|
|
|
// Runs the program over [begin, end).
//
// Without RegexExecFlags::Search a single attempt is made from the start of
// the subject. With it, every following start position is tried in turn
// (skipping ahead with the program's start_chars table when there is one)
// until an attempt succeeds or the end of the subject has been tried.
//
// With RegexExecFlags::PrevAvailable, `begin-1` is handed to the utf8 iterator
// as its lower bound, so the caller must guarantee it can be dereferenced.
//
// Returns true on a match; the capture positions are then available through
// captures(), unless RegexExecFlags::NoSaves was given.
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
{
    const bool forward = direction == MatchDirection::Forward;
    const bool prev_avail = flags & RegexExecFlags::PrevAvailable;
    const bool search = flags & RegexExecFlags::Search;

    // In backward mode Utf8It is a reverse iterator, so the walk starts from `end`.
    m_begin = Utf8It{utf8::iterator<Iterator>{forward ? begin : end,
                                              prev_avail ? begin-1 : begin, end}};
    m_end = Utf8It{utf8::iterator<Iterator>{forward ? end : begin,
                                            prev_avail ? begin-1 : begin, end}};
    m_flags = flags;

    // An empty subject can only produce an empty match at its start.
    if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
        return false;

    // Shared by every attempt so their storage is reused.
    Vector<Thread> current_threads, next_threads;

    const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
    Utf8It start{m_begin};

    const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;

    if (search)
        to_next_start(start, m_end, start_chars);

    if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
                  current_threads, next_threads))
        return true;

    if (not search)
        return false;

    // The condition is tested before advancing: `start` can already be at
    // m_end here (empty subject, or to_next_start skipped everything), and
    // incrementing it again would step past the end of the subject.
    // The final iteration still runs an attempt at m_end itself.
    while (start != m_end)
    {
        to_next_start(++start, m_end, start_chars);
        if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
                      current_threads, next_threads))
            return true;
    }

    return false;
}
|
|
|
|
|
|
|
|
// View over the save_count capture positions of the recorded match, or an
// empty view when no capture buffer has been recorded.
ArrayView<const Iterator> captures() const
{
    if (m_captures == nullptr)
        return {};
    return { m_captures->pos, m_program.save_count };
}
|
|
|
|
|
|
|
|
private:
|
2017-10-03 04:54:43 +02:00
|
|
|
struct Saves
|
|
|
|
{
|
|
|
|
int refcount;
|
2017-10-04 05:14:24 +02:00
|
|
|
Iterator pos[1];
|
2017-10-03 04:54:43 +02:00
|
|
|
};
|
|
|
|
|
2017-10-04 13:49:16 +02:00
|
|
|
template<bool copy>
|
|
|
|
Saves* new_saves(Iterator* pos)
|
2017-10-03 04:54:43 +02:00
|
|
|
{
|
2017-10-04 13:49:16 +02:00
|
|
|
kak_assert(not copy or pos != nullptr);
|
|
|
|
const auto count = m_program.save_count;
|
2017-10-03 04:54:43 +02:00
|
|
|
if (not m_free_saves.empty())
|
|
|
|
{
|
2017-10-03 12:23:31 +02:00
|
|
|
Saves* res = m_free_saves.back();
|
2017-10-03 04:54:43 +02:00
|
|
|
m_free_saves.pop_back();
|
2017-10-03 12:23:31 +02:00
|
|
|
res->refcount = 1;
|
2017-10-04 13:49:16 +02:00
|
|
|
if (copy)
|
|
|
|
std::copy(pos, pos + count, res->pos);
|
|
|
|
else
|
|
|
|
std::fill(res->pos, res->pos + count, Iterator{});
|
|
|
|
|
2017-10-03 12:23:31 +02:00
|
|
|
return res;
|
2017-10-03 04:54:43 +02:00
|
|
|
}
|
|
|
|
|
2017-10-04 13:49:16 +02:00
|
|
|
void* ptr = ::operator new (sizeof(Saves) + (count-1) * sizeof(Iterator));
|
2017-10-08 06:19:14 +02:00
|
|
|
Saves* saves = new (ptr) Saves{1, {copy ? pos[0] : Iterator{}}};
|
2017-10-04 13:49:16 +02:00
|
|
|
for (size_t i = 1; i < count; ++i)
|
|
|
|
new (&saves->pos[i]) Iterator{copy ? pos[i] : Iterator{}};
|
|
|
|
m_saves.push_back(saves);
|
|
|
|
return saves;
|
2017-10-03 04:54:43 +02:00
|
|
|
}
|
|
|
|
|
2017-10-04 04:49:40 +02:00
|
|
|
// Drops one reference to `saves` (null is accepted and ignored). When the
// count reaches zero the block goes onto the free list for new_saves to reuse.
void release_saves(Saves* saves)
{
    if (saves == nullptr)
        return;

    if (--saves->refcount == 0)
        m_free_saves.push_back(saves);
}
|
|
|
|
|
2017-10-02 08:59:04 +02:00
|
|
|
struct Thread
|
|
|
|
{
|
2017-10-07 12:51:32 +02:00
|
|
|
uint32_t inst;
|
2017-10-03 04:54:43 +02:00
|
|
|
Saves* saves;
|
2017-10-02 08:59:04 +02:00
|
|
|
};
|
|
|
|
|
2017-10-08 13:56:03 +02:00
|
|
|
using Utf8It = std::conditional_t<direction == MatchDirection::Forward,
|
|
|
|
utf8::iterator<Iterator>,
|
|
|
|
std::reverse_iterator<utf8::iterator<Iterator>>>;
|
2017-10-04 06:16:52 +02:00
|
|
|
|
2017-10-02 08:59:04 +02:00
|
|
|
// Outcome of running one thread at the current position (see step()).
enum class StepResult
{
    Consumed, // the thread consumed the current codepoint
    Matched,  // the thread reached the Match instruction
    Failed    // the thread died
};
|
2017-10-07 10:36:53 +02:00
|
|
|
|
|
|
|
// Steps a thread until it consumes the current character, matches or fail
|
2017-10-07 13:08:14 +02:00
|
|
|
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads)
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
|
|
|
while (true)
|
|
|
|
{
|
2017-10-07 12:51:32 +02:00
|
|
|
auto& inst = m_program.instructions[thread.inst++];
|
2017-10-07 13:08:14 +02:00
|
|
|
if (inst.processed)
|
|
|
|
return StepResult::Failed;
|
|
|
|
inst.processed = true;
|
2017-10-07 08:25:14 +02:00
|
|
|
|
2017-10-07 12:51:32 +02:00
|
|
|
switch (inst.op)
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
|
|
|
case CompiledRegex::Literal:
|
2017-10-07 13:58:10 +02:00
|
|
|
if (pos != m_end and inst.param == *pos)
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Consumed;
|
|
|
|
return StepResult::Failed;
|
|
|
|
case CompiledRegex::LiteralIgnoreCase:
|
2017-10-07 13:58:10 +02:00
|
|
|
if (pos != m_end and inst.param == to_lower(*pos))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Consumed;
|
|
|
|
return StepResult::Failed;
|
|
|
|
case CompiledRegex::AnyChar:
|
|
|
|
return StepResult::Consumed;
|
|
|
|
case CompiledRegex::Jump:
|
2017-10-07 12:51:32 +02:00
|
|
|
thread.inst = inst.param;
|
2017-10-02 08:59:04 +02:00
|
|
|
break;
|
|
|
|
case CompiledRegex::Split_PrioritizeParent:
|
|
|
|
{
|
2017-10-03 13:07:44 +02:00
|
|
|
if (thread.saves)
|
|
|
|
++thread.saves->refcount;
|
2017-10-07 12:51:32 +02:00
|
|
|
threads.push_back({inst.param, thread.saves});
|
2017-10-02 08:59:04 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case CompiledRegex::Split_PrioritizeChild:
|
|
|
|
{
|
2017-10-03 13:07:44 +02:00
|
|
|
if (thread.saves)
|
|
|
|
++thread.saves->refcount;
|
2017-10-07 12:51:32 +02:00
|
|
|
threads.push_back({thread.inst, thread.saves});
|
|
|
|
thread.inst = inst.param;
|
2017-10-02 08:59:04 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case CompiledRegex::Save:
|
|
|
|
{
|
2017-10-03 13:07:44 +02:00
|
|
|
if (thread.saves == nullptr)
|
|
|
|
break;
|
2017-10-03 04:54:43 +02:00
|
|
|
if (thread.saves->refcount > 1)
|
|
|
|
{
|
|
|
|
--thread.saves->refcount;
|
2017-10-04 13:49:16 +02:00
|
|
|
thread.saves = new_saves<true>(thread.saves->pos);
|
2017-10-03 04:54:43 +02:00
|
|
|
}
|
2017-10-07 12:51:32 +02:00
|
|
|
thread.saves->pos[inst.param] = get_base(pos);
|
2017-10-02 08:59:04 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case CompiledRegex::Matcher:
|
2017-10-07 13:58:10 +02:00
|
|
|
if (pos == m_end)
|
|
|
|
return StepResult::Failed;
|
|
|
|
return m_program.matchers[inst.param](*pos) ?
|
2017-10-02 08:59:04 +02:00
|
|
|
StepResult::Consumed : StepResult::Failed;
|
|
|
|
case CompiledRegex::LineStart:
|
2017-10-04 14:11:15 +02:00
|
|
|
if (not is_line_start(pos))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LineEnd:
|
2017-10-04 14:11:15 +02:00
|
|
|
if (not is_line_end(pos))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::WordBoundary:
|
2017-10-04 14:11:15 +02:00
|
|
|
if (not is_word_boundary(pos))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::NotWordBoundary:
|
2017-10-04 14:11:15 +02:00
|
|
|
if (is_word_boundary(pos))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::SubjectBegin:
|
2017-10-04 14:11:15 +02:00
|
|
|
if (pos != m_begin or (m_flags & RegexExecFlags::NotBeginOfSubject))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::SubjectEnd:
|
2017-10-04 14:11:15 +02:00
|
|
|
if (pos != m_end)
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LookAhead:
|
|
|
|
case CompiledRegex::NegativeLookAhead:
|
2017-10-09 05:12:42 +02:00
|
|
|
if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LookBehind:
|
|
|
|
case CompiledRegex::NegativeLookBehind:
|
2017-10-09 05:12:42 +02:00
|
|
|
if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind))
|
2017-10-02 08:59:04 +02:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::Match:
|
|
|
|
return StepResult::Matched;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return StepResult::Failed;
|
|
|
|
}
|
|
|
|
|
2017-10-07 13:27:06 +02:00
|
|
|
// Runs the program with a single initial thread starting at `pos`, advancing
// one codepoint at a time until the subject end is processed or no thread
// survives. Returns whether a match was recorded into m_captures.
//
// `current_threads` is used as a stack (the back runs first); `next_threads`
// collects the threads that consumed the current codepoint, highest priority
// first, and is cleared on entry.
bool exec_from(Utf8It pos, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads)
{
    current_threads.push_back({0, initial_saves});
    next_threads.clear();

    const bool searching = m_flags & RegexExecFlags::Search;
    const bool reject_initial_null = m_flags & RegexExecFlags::NotInitialNull;
    const bool any_match = m_flags & RegexExecFlags::AnyMatch;

    bool matched = false;
    for (;; ++pos) // visits every codepoint, plus once at the end position
    {
        // The per-instruction markers are only meaningful for one position.
        for (auto& inst : m_program.instructions)
        {
            inst.processed = false;
            inst.scheduled = false;
        }

        while (not current_threads.empty())
        {
            Thread thread = current_threads.back();
            current_threads.pop_back();

            const StepResult result = step(pos, thread, current_threads);
            if (result == StepResult::Failed)
            {
                release_saves(thread.saves);
            }
            else if (result == StepResult::Consumed)
            {
                // Keep only the first thread that resumes at a given instruction.
                auto& resume_at = m_program.instructions[thread.inst];
                if (resume_at.scheduled)
                    release_saves(thread.saves);
                else
                {
                    resume_at.scheduled = true;
                    next_threads.push_back(thread);
                }
            }
            else // StepResult::Matched
            {
                // Unless searching, a match has to reach the subject end.
                const bool stops_short = pos != m_end and not searching;
                const bool initial_null = reject_initial_null and pos == m_begin;
                if (stops_short or initial_null)
                {
                    release_saves(thread.saves);
                    continue;
                }

                release_saves(m_captures);
                m_captures = thread.saves;
                if (pos == m_end or any_match)
                    return true;

                matched = true;
                current_threads.clear(); // remove this and lower priority threads
            }
        }

        if (pos == m_end or next_threads.empty())
            return matched;

        // next_threads was filled highest priority first; current_threads is
        // popped from the back, hence the reversal.
        std::swap(current_threads, next_threads);
        std::reverse(current_threads.begin(), current_threads.end());
    }
}
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
// Advances `start` up to the first position worth attempting a match from.
// Does nothing when there is no `start_chars` table. Otherwise it stops at
// `end`, at a codepoint outside the table's range (256 entries, the same
// value as CompiledRegex::StartChars::count), or at a codepoint whose table
// entry is true.
void to_next_start(Utf8It& start, const Utf8It& end, const bool* start_chars)
{
    if (start_chars == nullptr)
        return;

    for (; start != end; ++start)
    {
        const auto cp = *start;
        const bool in_table = cp >= 0 and cp < 256;
        if (not in_table or start_chars[cp])
            return;
    }
}
|
|
|
|
|
2017-10-09 05:12:42 +02:00
|
|
|
template<MatchDirection look_direction>
|
|
|
|
bool lookaround(uint32_t index, Utf8It pos) const
|
|
|
|
{
|
|
|
|
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
|
|
|
|
{
|
|
|
|
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
|
|
|
|
return false;
|
|
|
|
auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
|
|
|
|
if (ref == 0xF000)
|
|
|
|
{} // any character matches
|
2017-10-09 05:20:05 +02:00
|
|
|
else if (ref > 0xF0000 and ref <= 0xFFFFD)
|
|
|
|
{
|
|
|
|
if (not m_program.matchers[ref - 0xF0001](cp))
|
|
|
|
return false;
|
|
|
|
}
|
2017-10-09 05:12:42 +02:00
|
|
|
else if (ref != cp)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-10-04 14:11:15 +02:00
|
|
|
bool is_line_start(const Utf8It& pos) const
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-08 03:22:24 +02:00
|
|
|
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)
|
|
|
|
return not (m_flags & RegexExecFlags::NotBeginOfLine);
|
|
|
|
return *(pos-1) == '\n';
|
2017-10-02 08:59:04 +02:00
|
|
|
}
|
|
|
|
|
2017-10-04 14:11:15 +02:00
|
|
|
bool is_line_end(const Utf8It& pos) const
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-08 03:22:24 +02:00
|
|
|
if (pos == m_end)
|
|
|
|
return not (m_flags & RegexExecFlags::NotEndOfLine);
|
|
|
|
return *pos == '\n';
|
2017-10-02 08:59:04 +02:00
|
|
|
}
|
|
|
|
|
2017-10-04 14:11:15 +02:00
|
|
|
bool is_word_boundary(const Utf8It& pos) const
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-08 03:22:24 +02:00
|
|
|
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)
|
|
|
|
return not (m_flags & RegexExecFlags::NotBeginOfWord);
|
|
|
|
if (pos == m_end)
|
|
|
|
return not (m_flags & RegexExecFlags::NotEndOfWord);
|
|
|
|
return is_word(*(pos-1)) != is_word(*pos);
|
2017-10-02 08:59:04 +02:00
|
|
|
}
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
// Recovers the underlying Iterator from either flavour of Utf8It.
// Forward case: the utf8 iterator's own base.
static const Iterator& get_base(const utf8::iterator<Iterator>& it)
{
    return it.base();
}

// Backward case: unwrap the reverse_iterator first, then the utf8 iterator.
static Iterator get_base(const std::reverse_iterator<utf8::iterator<Iterator>>& it)
{
    return it.base().base();
}
|
2017-10-07 06:46:27 +02:00
|
|
|
|
2017-10-02 08:59:04 +02:00
|
|
|
const CompiledRegex& m_program;
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
Utf8It m_begin;
|
|
|
|
Utf8It m_end;
|
2017-10-02 16:34:57 +02:00
|
|
|
RegexExecFlags m_flags;
|
2017-10-02 08:59:04 +02:00
|
|
|
|
2017-10-04 05:14:24 +02:00
|
|
|
Vector<Saves*> m_saves;
|
2017-10-03 04:54:43 +02:00
|
|
|
Vector<Saves*> m_free_saves;
|
|
|
|
|
2017-10-04 05:14:24 +02:00
|
|
|
Saves* m_captures = nullptr;
|
2017-10-02 08:59:04 +02:00
|
|
|
};
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
2017-10-02 16:34:57 +02:00
|
|
|
bool regex_match(It begin, It end, const CompiledRegex& re, RegexExecFlags flags = RegexExecFlags::None)
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-07 06:46:27 +02:00
|
|
|
ThreadedRegexVM<It, direction> vm{re};
|
2017-10-03 13:07:44 +02:00
|
|
|
return vm.exec(begin, end, (RegexExecFlags)(flags & ~(RegexExecFlags::Search)) |
|
|
|
|
RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
|
2017-10-02 08:59:04 +02:00
|
|
|
}
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
2017-10-02 16:34:57 +02:00
|
|
|
bool regex_match(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
|
|
|
RegexExecFlags flags = RegexExecFlags::None)
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-07 06:46:27 +02:00
|
|
|
ThreadedRegexVM<It, direction> vm{re};
|
2017-10-02 16:34:57 +02:00
|
|
|
if (vm.exec(begin, end, flags & ~(RegexExecFlags::Search)))
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-06 13:30:46 +02:00
|
|
|
std::copy(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
|
2017-10-02 08:59:04 +02:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
2017-10-02 16:34:57 +02:00
|
|
|
bool regex_search(It begin, It end, const CompiledRegex& re,
|
|
|
|
RegexExecFlags flags = RegexExecFlags::None)
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-07 06:46:27 +02:00
|
|
|
ThreadedRegexVM<It, direction> vm{re};
|
2017-10-03 13:07:44 +02:00
|
|
|
return vm.exec(begin, end, flags | RegexExecFlags::Search | RegexExecFlags::AnyMatch | RegexExecFlags::NoSaves);
|
2017-10-02 08:59:04 +02:00
|
|
|
}
|
2017-09-26 09:44:30 +02:00
|
|
|
|
2017-10-07 06:46:27 +02:00
|
|
|
template<typename It, MatchDirection direction = MatchDirection::Forward>
|
2017-10-02 16:34:57 +02:00
|
|
|
bool regex_search(It begin, It end, Vector<It>& captures, const CompiledRegex& re,
|
|
|
|
RegexExecFlags flags = RegexExecFlags::None)
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-07 06:46:27 +02:00
|
|
|
ThreadedRegexVM<It, direction> vm{re};
|
2017-10-02 16:34:57 +02:00
|
|
|
if (vm.exec(begin, end, flags | RegexExecFlags::Search))
|
2017-10-02 08:59:04 +02:00
|
|
|
{
|
2017-10-09 08:04:14 +02:00
|
|
|
std::move(vm.captures().begin(), vm.captures().end(), std::back_inserter(captures));
|
2017-10-02 08:59:04 +02:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2017-09-26 09:44:30 +02:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2017-09-17 10:50:53 +02:00
|
|
|
#endif // regex_impl_hh_INCLUDED
|