Regex: More code tweaks and cleanups in ThreadedRegexVM
This commit is contained in:
parent
5f54e0de0e
commit
5f6e71c4dc
|
@ -67,6 +67,8 @@ struct ThreadedRegexVM
|
||||||
ThreadedRegexVM(const CompiledRegex& program)
|
ThreadedRegexVM(const CompiledRegex& program)
|
||||||
: m_program{program} { kak_assert(m_program); }
|
: m_program{program} { kak_assert(m_program); }
|
||||||
|
|
||||||
|
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
|
||||||
|
|
||||||
~ThreadedRegexVM()
|
~ThreadedRegexVM()
|
||||||
{
|
{
|
||||||
for (auto* saves : m_saves)
|
for (auto* saves : m_saves)
|
||||||
|
@ -124,13 +126,13 @@ struct ThreadedRegexVM
|
||||||
using Utf8It = utf8::iterator<Iterator>;
|
using Utf8It = utf8::iterator<Iterator>;
|
||||||
|
|
||||||
enum class StepResult { Consumed, Matched, Failed };
|
enum class StepResult { Consumed, Matched, Failed };
|
||||||
StepResult step(Thread& thread, Vector<Thread>& threads)
|
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads)
|
||||||
{
|
{
|
||||||
const auto prog_start = m_program.bytecode.data();
|
const auto prog_start = m_program.bytecode.data();
|
||||||
const auto prog_end = prog_start + m_program.bytecode.size();
|
const auto prog_end = prog_start + m_program.bytecode.size();
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
const Codepoint cp = m_pos == m_end ? 0 : *m_pos;
|
const Codepoint cp = pos == m_end ? 0 : *pos;
|
||||||
const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
|
const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
|
||||||
switch (op)
|
switch (op)
|
||||||
{
|
{
|
||||||
|
@ -177,44 +179,44 @@ struct ThreadedRegexVM
|
||||||
thread.saves = new_saves<true>(thread.saves->pos);
|
thread.saves = new_saves<true>(thread.saves->pos);
|
||||||
}
|
}
|
||||||
const size_t index = *thread.inst++;
|
const size_t index = *thread.inst++;
|
||||||
thread.saves->pos[index] = m_pos.base();
|
thread.saves->pos[index] = pos.base();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Matcher:
|
case CompiledRegex::Matcher:
|
||||||
{
|
{
|
||||||
const int matcher_id = *thread.inst++;
|
const int matcher_id = *thread.inst++;
|
||||||
return m_program.matchers[matcher_id](*m_pos) ?
|
return m_program.matchers[matcher_id](cp) ?
|
||||||
StepResult::Consumed : StepResult::Failed;
|
StepResult::Consumed : StepResult::Failed;
|
||||||
}
|
}
|
||||||
case CompiledRegex::LineStart:
|
case CompiledRegex::LineStart:
|
||||||
if (not is_line_start())
|
if (not is_line_start(pos))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LineEnd:
|
case CompiledRegex::LineEnd:
|
||||||
if (not is_line_end())
|
if (not is_line_end(pos))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::WordBoundary:
|
case CompiledRegex::WordBoundary:
|
||||||
if (not is_word_boundary())
|
if (not is_word_boundary(pos))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::NotWordBoundary:
|
case CompiledRegex::NotWordBoundary:
|
||||||
if (is_word_boundary())
|
if (is_word_boundary(pos))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::SubjectBegin:
|
case CompiledRegex::SubjectBegin:
|
||||||
if (m_pos != m_begin or m_flags & RegexExecFlags::NotBeginOfSubject)
|
if (pos != m_begin or (m_flags & RegexExecFlags::NotBeginOfSubject))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::SubjectEnd:
|
case CompiledRegex::SubjectEnd:
|
||||||
if (m_pos != m_end)
|
if (pos != m_end)
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LookAhead:
|
case CompiledRegex::LookAhead:
|
||||||
case CompiledRegex::NegativeLookAhead:
|
case CompiledRegex::NegativeLookAhead:
|
||||||
{
|
{
|
||||||
int count = *thread.inst++;
|
int count = *thread.inst++;
|
||||||
for (auto it = m_pos; count and it != m_end; ++it, --count)
|
for (auto it = pos; count and it != m_end; ++it, --count)
|
||||||
if (*it != utf8::read(thread.inst))
|
if (*it != utf8::read(thread.inst))
|
||||||
break;
|
break;
|
||||||
if ((op == CompiledRegex::LookAhead and count != 0) or
|
if ((op == CompiledRegex::LookAhead and count != 0) or
|
||||||
|
@ -227,7 +229,7 @@ struct ThreadedRegexVM
|
||||||
case CompiledRegex::NegativeLookBehind:
|
case CompiledRegex::NegativeLookBehind:
|
||||||
{
|
{
|
||||||
int count = *thread.inst++;
|
int count = *thread.inst++;
|
||||||
for (auto it = m_pos-1; count and it >= m_begin; --it, --count)
|
for (auto it = pos-1; count and it >= m_begin; --it, --count)
|
||||||
if (*it != utf8::read(thread.inst))
|
if (*it != utf8::read(thread.inst))
|
||||||
break;
|
break;
|
||||||
if ((op == CompiledRegex::LookBehind and count != 0) or
|
if ((op == CompiledRegex::LookBehind and count != 0) or
|
||||||
|
@ -243,28 +245,29 @@ struct ThreadedRegexVM
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_from(Utf8It start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads)
|
bool exec_from(const Utf8It& start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads)
|
||||||
{
|
{
|
||||||
current_threads.push_back({m_program.bytecode.data(), initial_saves});
|
current_threads.push_back({m_program.bytecode.data(), initial_saves});
|
||||||
next_threads.clear();
|
next_threads.clear();
|
||||||
|
|
||||||
bool found_match = false;
|
bool found_match = false;
|
||||||
for (m_pos = start; m_pos != m_end; ++m_pos)
|
for (Utf8It pos = start; pos != m_end; ++pos)
|
||||||
{
|
{
|
||||||
while (not current_threads.empty())
|
while (not current_threads.empty())
|
||||||
{
|
{
|
||||||
auto thread = current_threads.back();
|
auto thread = current_threads.back();
|
||||||
current_threads.pop_back();
|
current_threads.pop_back();
|
||||||
switch (step(thread, current_threads))
|
switch (step(pos, thread, current_threads))
|
||||||
{
|
{
|
||||||
case StepResult::Matched:
|
case StepResult::Matched:
|
||||||
if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
|
if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
|
||||||
(m_flags & RegexExecFlags::NotInitialNull and m_pos == m_begin))
|
(m_flags & RegexExecFlags::NotInitialNull and pos == m_begin))
|
||||||
{
|
{
|
||||||
release_saves(thread.saves);
|
release_saves(thread.saves);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
release_saves(m_captures);
|
||||||
m_captures = thread.saves;
|
m_captures = thread.saves;
|
||||||
if (m_flags & RegexExecFlags::AnyMatch)
|
if (m_flags & RegexExecFlags::AnyMatch)
|
||||||
return true;
|
return true;
|
||||||
|
@ -293,12 +296,14 @@ struct ThreadedRegexVM
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
// Step remaining threads to see if they match without consuming anything else
|
// Step remaining threads to see if they match without consuming anything else
|
||||||
|
const Utf8It end{m_end, m_begin, m_end};
|
||||||
while (not current_threads.empty())
|
while (not current_threads.empty())
|
||||||
{
|
{
|
||||||
auto thread = current_threads.back();
|
auto thread = current_threads.back();
|
||||||
current_threads.pop_back();
|
current_threads.pop_back();
|
||||||
if (step(thread, current_threads) == StepResult::Matched)
|
if (step(end, thread, current_threads) == StepResult::Matched)
|
||||||
{
|
{
|
||||||
|
release_saves(m_captures);
|
||||||
m_captures = thread.saves;
|
m_captures = thread.saves;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -335,30 +340,29 @@ struct ThreadedRegexVM
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_line_start() const
|
bool is_line_start(const Utf8It& pos) const
|
||||||
{
|
{
|
||||||
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
|
return (pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
|
||||||
*(m_pos-1) == '\n';
|
*(pos-1) == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_line_end() const
|
bool is_line_end(const Utf8It& pos) const
|
||||||
{
|
{
|
||||||
return (m_pos == m_end and not (m_flags & RegexExecFlags::NotEndOfLine)) or
|
return (pos == m_end and not (m_flags & RegexExecFlags::NotEndOfLine)) or
|
||||||
*m_pos == '\n';
|
*pos == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_word_boundary() const
|
bool is_word_boundary(const Utf8It& pos) const
|
||||||
{
|
{
|
||||||
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfWord)) or
|
return (pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfWord)) or
|
||||||
(m_pos == m_end and not (m_flags & RegexExecFlags::NotEndOfWord)) or
|
(pos == m_end and not (m_flags & RegexExecFlags::NotEndOfWord)) or
|
||||||
is_word(*(m_pos-1)) != is_word(*m_pos);
|
is_word(*(pos-1)) != is_word(*pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
const CompiledRegex& m_program;
|
const CompiledRegex& m_program;
|
||||||
|
|
||||||
Iterator m_begin;
|
Iterator m_begin;
|
||||||
Iterator m_end;
|
Iterator m_end;
|
||||||
Utf8It m_pos;
|
|
||||||
RegexExecFlags m_flags;
|
RegexExecFlags m_flags;
|
||||||
|
|
||||||
Vector<Saves*> m_saves;
|
Vector<Saves*> m_saves;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user