Remove use of utf8::iterator in regex execution
This avoids having two copies of the subject string bounds, one in the ExecConfig and one in the utf8 iterator.
This commit is contained in:
parent
b4571bd172
commit
7463a0d449
|
@ -6,7 +6,6 @@
|
|||
#include "ref_ptr.hh"
|
||||
#include "unicode.hh"
|
||||
#include "utf8.hh"
|
||||
#include "utf8_iterator.hh"
|
||||
#include "vector.hh"
|
||||
#include "utils.hh"
|
||||
|
||||
|
@ -167,7 +166,7 @@ public:
|
|||
ThreadedRegexVM(const CompiledRegex& program)
|
||||
: m_program{program}
|
||||
{
|
||||
kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or
|
||||
kak_assert((forward and program.first_backward_inst != 0) or
|
||||
(direction == MatchDirection::Backward and program.first_backward_inst != -1));
|
||||
}
|
||||
|
||||
|
@ -195,15 +194,13 @@ public:
|
|||
const bool search = (flags & RegexExecFlags::Search);
|
||||
|
||||
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
|
||||
if (direction == MatchDirection::Forward)
|
||||
if (forward)
|
||||
instructions = instructions.subrange(0, m_program.first_backward_inst);
|
||||
else
|
||||
instructions = instructions.subrange(m_program.first_backward_inst);
|
||||
if (not search)
|
||||
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
|
||||
|
||||
constexpr bool forward = direction == MatchDirection::Forward;
|
||||
|
||||
const ExecConfig config{
|
||||
Sentinel{forward ? begin : end},
|
||||
Sentinel{forward ? end : begin},
|
||||
|
@ -213,24 +210,22 @@ public:
|
|||
instructions
|
||||
};
|
||||
|
||||
Utf8It start{Utf8It{
|
||||
forward ? begin : end,
|
||||
Sentinel{subject_begin},
|
||||
Sentinel{subject_end}
|
||||
}};
|
||||
|
||||
Iterator start = forward ? begin : end;
|
||||
if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc)
|
||||
{
|
||||
if (search)
|
||||
{
|
||||
to_next_start(start, config.end, *start_desc);
|
||||
to_next_start(start, config, *start_desc);
|
||||
if (start == config.end) // If start_desc is not null, it means we consume at least one char
|
||||
return false;
|
||||
}
|
||||
else if (start != config.end and
|
||||
not start_desc->map[codepoint(start) < StartDesc::count ? codepoint(start) : StartDesc::other])
|
||||
else if (start != config.end)
|
||||
{
|
||||
const Codepoint cp = codepoint(start, config);
|
||||
if (not start_desc->map[cp < StartDesc::count ? cp : StartDesc::other])
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return exec_program(std::move(start), config);
|
||||
}
|
||||
|
@ -297,7 +292,6 @@ private:
|
|||
|
||||
using StartDesc = CompiledRegex::StartDesc;
|
||||
using Sentinel = typename SentinelType<Iterator>::Type;
|
||||
using Utf8It = utf8::iterator<Iterator, Sentinel>;
|
||||
struct ExecConfig
|
||||
{
|
||||
const Sentinel begin;
|
||||
|
@ -311,7 +305,7 @@ private:
|
|||
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
|
||||
|
||||
// Steps a thread until it consumes the current character, matches or fail
|
||||
StepResult step_thread(const Utf8It& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
|
||||
StepResult step_thread(const Iterator& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
|
||||
{
|
||||
const bool no_saves = (config.flags & RegexExecFlags::NoSaves);
|
||||
auto* instructions = m_program.instructions.data();
|
||||
|
@ -327,17 +321,17 @@ private:
|
|||
switch (inst.op)
|
||||
{
|
||||
case CompiledRegex::Literal:
|
||||
if (pos != config.end and inst.param == codepoint(pos))
|
||||
if (pos != config.end and inst.param == codepoint(pos, config))
|
||||
return StepResult::Consumed;
|
||||
return StepResult::Failed;
|
||||
case CompiledRegex::Literal_IgnoreCase:
|
||||
if (pos != config.end and inst.param == to_lower(codepoint(pos)))
|
||||
if (pos != config.end and inst.param == to_lower(codepoint(pos, config)))
|
||||
return StepResult::Consumed;
|
||||
return StepResult::Failed;
|
||||
case CompiledRegex::AnyChar:
|
||||
return StepResult::Consumed;
|
||||
case CompiledRegex::AnyCharExceptNewLine:
|
||||
if (pos != config.end and codepoint(pos) != '\n')
|
||||
if (pos != config.end and codepoint(pos, config) != '\n')
|
||||
return StepResult::Consumed;
|
||||
return StepResult::Failed;
|
||||
case CompiledRegex::Jump:
|
||||
|
@ -369,18 +363,18 @@ private:
|
|||
--m_saves[thread.saves]->refcount;
|
||||
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
|
||||
}
|
||||
m_saves[thread.saves]->pos[inst.param] = pos.base();
|
||||
m_saves[thread.saves]->pos[inst.param] = pos;
|
||||
break;
|
||||
}
|
||||
case CompiledRegex::Class:
|
||||
if (pos == config.end)
|
||||
return StepResult::Failed;
|
||||
return is_character_class(m_program.character_classes[inst.param], codepoint(pos)) ?
|
||||
return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ?
|
||||
StepResult::Consumed : StepResult::Failed;
|
||||
case CompiledRegex::CharacterType:
|
||||
if (pos == config.end)
|
||||
return StepResult::Failed;
|
||||
return is_ctype((CharacterType)inst.param, codepoint(pos)) ?
|
||||
return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ?
|
||||
StepResult::Consumed : StepResult::Failed;
|
||||
case CompiledRegex::LineStart:
|
||||
if (not is_line_start(pos, config))
|
||||
|
@ -442,15 +436,14 @@ private:
|
|||
return StepResult::Failed;
|
||||
}
|
||||
|
||||
bool exec_program(Utf8It pos, const ExecConfig& config)
|
||||
bool exec_program(Iterator pos, const ExecConfig& config)
|
||||
{
|
||||
kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty());
|
||||
release_saves(m_captures);
|
||||
m_captures = -1;
|
||||
m_threads.push_current({static_cast<int16_t>(&config.instructions[0] - &m_program.instructions[0]), -1});
|
||||
|
||||
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
|
||||
: m_program.backward_start_desc;
|
||||
const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc;
|
||||
|
||||
uint16_t current_step = -1;
|
||||
bool found_match = false;
|
||||
|
@ -517,28 +510,30 @@ private:
|
|||
}
|
||||
|
||||
m_threads.swap_next();
|
||||
(direction == MatchDirection::Forward) ? ++pos : --pos;
|
||||
forward ? utf8::to_next(pos, config.subject_end)
|
||||
: utf8::to_previous(pos, config.subject_begin);
|
||||
|
||||
if (find_next_start and start_desc)
|
||||
to_next_start(pos, config.end, *start_desc);
|
||||
to_next_start(pos, config, *start_desc);
|
||||
}
|
||||
}
|
||||
|
||||
void to_next_start(Utf8It& start, const Sentinel& end, const StartDesc& start_desc)
|
||||
void to_next_start(Iterator& start, const ExecConfig& config, const StartDesc& start_desc)
|
||||
{
|
||||
while (start != end)
|
||||
while (start != config.end)
|
||||
{
|
||||
const Codepoint cp = read_codepoint(start);
|
||||
const Codepoint cp = read_codepoint(start, config);
|
||||
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
|
||||
{
|
||||
(direction == MatchDirection::Forward) ? --start : ++start;
|
||||
forward ? utf8::to_previous(start, config.subject_begin)
|
||||
: utf8::to_next(start, config.subject_end);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<MatchDirection look_direction, bool ignore_case>
|
||||
bool lookaround(uint32_t index, Utf8It pos, const ExecConfig& config) const
|
||||
bool lookaround(uint32_t index, Iterator pos, const ExecConfig& config) const
|
||||
{
|
||||
using Lookaround = CompiledRegex::Lookaround;
|
||||
|
||||
|
@ -546,7 +541,7 @@ private:
|
|||
{
|
||||
if (pos == config.subject_begin)
|
||||
return m_program.lookarounds[index] == Lookaround::EndOfLookaround;
|
||||
--pos;
|
||||
utf8::to_previous(pos, config.subject_begin);
|
||||
}
|
||||
|
||||
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
|
||||
|
@ -554,7 +549,7 @@ private:
|
|||
if (look_direction == MatchDirection::Forward and pos == config.subject_end)
|
||||
return false;
|
||||
|
||||
Codepoint cp = *pos;
|
||||
Codepoint cp = utf8::codepoint(pos, config.subject_end);
|
||||
if (ignore_case)
|
||||
cp = to_lower(cp);
|
||||
|
||||
|
@ -584,43 +579,52 @@ private:
|
|||
if (look_direction == MatchDirection::Backward and pos == config.subject_begin)
|
||||
return *++it == Lookaround::EndOfLookaround;
|
||||
|
||||
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
||||
(look_direction == MatchDirection::Forward) ? utf8::to_next(pos, config.subject_end)
|
||||
: utf8::to_previous(pos, config.subject_begin);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool is_line_start(const Utf8It& pos, const ExecConfig& config)
|
||||
static bool is_line_start(const Iterator& pos, const ExecConfig& config)
|
||||
{
|
||||
if (pos == config.subject_begin)
|
||||
return not (config.flags & RegexExecFlags::NotBeginOfLine);
|
||||
return *(pos-1) == '\n';
|
||||
return utf8::codepoint(utf8::previous(pos, config.subject_begin), config.subject_end) == '\n';
|
||||
}
|
||||
|
||||
static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
|
||||
static bool is_line_end(const Iterator& pos, const ExecConfig& config)
|
||||
{
|
||||
if (pos == config.subject_end)
|
||||
return not (config.flags & RegexExecFlags::NotEndOfLine);
|
||||
return *pos == '\n';
|
||||
return utf8::codepoint(pos, config.subject_end) == '\n';
|
||||
}
|
||||
|
||||
static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
|
||||
static bool is_word_boundary(const Iterator& pos, const ExecConfig& config)
|
||||
{
|
||||
if (pos == config.subject_begin)
|
||||
return not (config.flags & RegexExecFlags::NotBeginOfWord);
|
||||
if (pos == config.subject_end)
|
||||
return not (config.flags & RegexExecFlags::NotEndOfWord);
|
||||
return is_word(*(pos-1)) != is_word(*pos);
|
||||
return is_word(utf8::codepoint(utf8::previous(pos, config.subject_begin), config.subject_end)) !=
|
||||
is_word(utf8::codepoint(pos, config.subject_end));
|
||||
}
|
||||
|
||||
static Codepoint read_codepoint(Utf8It& it)
|
||||
static Codepoint read_codepoint(Iterator& it, const ExecConfig& config)
|
||||
{
|
||||
if (direction == MatchDirection::Forward)
|
||||
return it.read();
|
||||
if (forward)
|
||||
return utf8::read_codepoint(it, config.subject_end);
|
||||
else
|
||||
return *--it;
|
||||
{
|
||||
utf8::to_previous(it, config.subject_begin);
|
||||
return utf8::codepoint(it, config.subject_end);
|
||||
}
|
||||
}
|
||||
|
||||
static Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
|
||||
static Codepoint codepoint(const Iterator& it, const ExecConfig& config)
|
||||
{
|
||||
return utf8::codepoint(forward ? it : utf8::previous(it, config.subject_begin),
|
||||
config.subject_end);
|
||||
}
|
||||
|
||||
const CompiledRegex& m_program;
|
||||
|
||||
|
@ -664,6 +668,8 @@ private:
|
|||
int32_t m_next = 0;
|
||||
};
|
||||
|
||||
static constexpr bool forward = direction == MatchDirection::Forward;
|
||||
|
||||
DualThreadStack m_threads;
|
||||
Vector<Saves*, MemoryDomain::Regex> m_saves;
|
||||
int16_t m_first_free = -1;
|
||||
|
|
Loading…
Reference in New Issue
Block a user