Use custom code instead of reverse_iterator in Regex VM
This commit is contained in:
parent
6fce8050ee
commit
ee74c2c2df
|
@ -220,7 +220,7 @@ public:
|
||||||
instructions
|
instructions
|
||||||
};
|
};
|
||||||
|
|
||||||
EffectiveIt start{Utf8It{
|
Utf8It start{Utf8It{
|
||||||
forward ? begin : end,
|
forward ? begin : end,
|
||||||
Sentinel{subject_begin},
|
Sentinel{subject_begin},
|
||||||
Sentinel{subject_end}
|
Sentinel{subject_end}
|
||||||
|
@ -232,11 +232,11 @@ public:
|
||||||
if (search)
|
if (search)
|
||||||
{
|
{
|
||||||
to_next_start(start, config.end, *start_desc);
|
to_next_start(start, config.end, *start_desc);
|
||||||
if (base(start) == config.end) // If start_desc is not null, it means we consume at least one char
|
if (start == config.end) // If start_desc is not null, it means we consume at least one char
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (base(start) != config.end and
|
else if (start != config.end and
|
||||||
not start_desc->map[*start < StartDesc::count ? *start : StartDesc::other])
|
not start_desc->map[codepoint(start) < StartDesc::count ? codepoint(start) : StartDesc::other])
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -306,9 +306,6 @@ private:
|
||||||
using StartDesc = CompiledRegex::StartDesc;
|
using StartDesc = CompiledRegex::StartDesc;
|
||||||
using Sentinel = typename SentinelType<Iterator>::Type;
|
using Sentinel = typename SentinelType<Iterator>::Type;
|
||||||
using Utf8It = utf8::iterator<Iterator, Sentinel>;
|
using Utf8It = utf8::iterator<Iterator, Sentinel>;
|
||||||
using EffectiveIt = std::conditional_t<direction == MatchDirection::Forward,
|
|
||||||
Utf8It, std::reverse_iterator<Utf8It>>;
|
|
||||||
|
|
||||||
struct ExecConfig
|
struct ExecConfig
|
||||||
{
|
{
|
||||||
const Sentinel begin;
|
const Sentinel begin;
|
||||||
|
@ -322,7 +319,7 @@ private:
|
||||||
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
|
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
|
||||||
|
|
||||||
// Steps a thread until it consumes the current character, matches or fails
|
// Steps a thread until it consumes the current character, matches or fails
|
||||||
StepResult step(EffectiveIt& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
|
StepResult step_thread(const Utf8It& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
|
||||||
{
|
{
|
||||||
const bool no_saves = (config.flags & RegexExecFlags::NoSaves);
|
const bool no_saves = (config.flags & RegexExecFlags::NoSaves);
|
||||||
auto* instructions = m_program.instructions.data();
|
auto* instructions = m_program.instructions.data();
|
||||||
|
@ -338,17 +335,17 @@ private:
|
||||||
switch (inst.op)
|
switch (inst.op)
|
||||||
{
|
{
|
||||||
case CompiledRegex::Literal:
|
case CompiledRegex::Literal:
|
||||||
if (base(pos) != config.end and inst.param == *pos)
|
if (pos != config.end and inst.param == codepoint(pos))
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
case CompiledRegex::Literal_IgnoreCase:
|
case CompiledRegex::Literal_IgnoreCase:
|
||||||
if (base(pos) != config.end and inst.param == to_lower(*pos))
|
if (pos != config.end and inst.param == to_lower(codepoint(pos)))
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
case CompiledRegex::AnyChar:
|
case CompiledRegex::AnyChar:
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
case CompiledRegex::AnyCharExceptNewLine:
|
case CompiledRegex::AnyCharExceptNewLine:
|
||||||
if (base(pos) != config.end and *pos != '\n')
|
if (pos != config.end and codepoint(pos) != '\n')
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
case CompiledRegex::Jump:
|
case CompiledRegex::Jump:
|
||||||
|
@ -380,18 +377,18 @@ private:
|
||||||
--m_saves[thread.saves]->refcount;
|
--m_saves[thread.saves]->refcount;
|
||||||
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
|
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
|
||||||
}
|
}
|
||||||
m_saves[thread.saves]->pos[inst.param] = base(pos);
|
m_saves[thread.saves]->pos[inst.param] = pos.base();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Class:
|
case CompiledRegex::Class:
|
||||||
if (base(pos) == config.end)
|
if (pos == config.end)
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
return is_character_class(m_program.character_classes[inst.param], *pos) ?
|
return is_character_class(m_program.character_classes[inst.param], codepoint(pos)) ?
|
||||||
StepResult::Consumed : StepResult::Failed;
|
StepResult::Consumed : StepResult::Failed;
|
||||||
case CompiledRegex::CharacterType:
|
case CompiledRegex::CharacterType:
|
||||||
if (base(pos) == config.end)
|
if (pos == config.end)
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
return is_ctype((CharacterType)inst.param, *pos) ?
|
return is_ctype((CharacterType)inst.param, codepoint(pos)) ?
|
||||||
StepResult::Consumed : StepResult::Failed;
|
StepResult::Consumed : StepResult::Failed;
|
||||||
case CompiledRegex::LineStart:
|
case CompiledRegex::LineStart:
|
||||||
if (not is_line_start(pos, config))
|
if (not is_line_start(pos, config))
|
||||||
|
@ -410,11 +407,11 @@ private:
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::SubjectBegin:
|
case CompiledRegex::SubjectBegin:
|
||||||
if (base(pos) != config.subject_begin)
|
if (pos != config.subject_begin)
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::SubjectEnd:
|
case CompiledRegex::SubjectEnd:
|
||||||
if (base(pos) != config.subject_end)
|
if (pos != config.subject_end)
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LookAhead:
|
case CompiledRegex::LookAhead:
|
||||||
|
@ -453,7 +450,7 @@ private:
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_program(EffectiveIt pos, const ExecConfig& config)
|
bool exec_program(Utf8It pos, const ExecConfig& config)
|
||||||
{
|
{
|
||||||
kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty());
|
kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty());
|
||||||
release_saves(m_captures);
|
release_saves(m_captures);
|
||||||
|
@ -479,11 +476,11 @@ private:
|
||||||
while (not m_threads.current_is_empty())
|
while (not m_threads.current_is_empty())
|
||||||
{
|
{
|
||||||
auto thread = m_threads.pop_current();
|
auto thread = m_threads.pop_current();
|
||||||
switch (step(pos, current_step, thread, config))
|
switch (step_thread(pos, current_step, thread, config))
|
||||||
{
|
{
|
||||||
case StepResult::Matched:
|
case StepResult::Matched:
|
||||||
if ((base(pos) != config.end and not (config.flags & RegexExecFlags::Search)) or
|
if ((pos != config.end and not (config.flags & RegexExecFlags::Search)) or
|
||||||
(config.flags & RegexExecFlags::NotInitialNull and base(pos) == config.begin))
|
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
|
||||||
{
|
{
|
||||||
release_saves(thread.saves);
|
release_saves(thread.saves);
|
||||||
continue;
|
continue;
|
||||||
|
@ -518,7 +515,7 @@ private:
|
||||||
for (auto& thread : m_threads.next_threads())
|
for (auto& thread : m_threads.next_threads())
|
||||||
m_program.instructions[thread.inst].scheduled = false;
|
m_program.instructions[thread.inst].scheduled = false;
|
||||||
|
|
||||||
if (base(pos) == config.end or m_threads.next_is_empty() or
|
if (pos == config.end or m_threads.next_is_empty() or
|
||||||
(found_match and (config.flags & RegexExecFlags::AnyMatch)))
|
(found_match and (config.flags & RegexExecFlags::AnyMatch)))
|
||||||
{
|
{
|
||||||
for (auto& t : m_threads.next_threads())
|
for (auto& t : m_threads.next_threads())
|
||||||
|
@ -528,37 +525,37 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
m_threads.swap_next();
|
m_threads.swap_next();
|
||||||
++pos;
|
next(pos);
|
||||||
|
|
||||||
if (find_next_start and start_desc)
|
if (find_next_start and start_desc)
|
||||||
to_next_start(pos, config.end, *start_desc);
|
to_next_start(pos, config.end, *start_desc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void to_next_start(EffectiveIt& start, const Sentinel& end, const StartDesc& start_desc)
|
void to_next_start(Utf8It& start, const Sentinel& end, const StartDesc& start_desc)
|
||||||
{
|
{
|
||||||
while (base(start) != end)
|
while (start != end)
|
||||||
{
|
{
|
||||||
const Codepoint cp = read(start);
|
const Codepoint cp = read(start);
|
||||||
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
|
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
|
||||||
{
|
{
|
||||||
--start;
|
prev(start);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<MatchDirection look_direction, bool ignore_case>
|
template<MatchDirection look_direction, bool ignore_case>
|
||||||
bool lookaround(uint32_t index, EffectiveIt pos, const ExecConfig& config) const
|
bool lookaround(uint32_t index, Utf8It pos, const ExecConfig& config) const
|
||||||
{
|
{
|
||||||
using Lookaround = CompiledRegex::Lookaround;
|
using Lookaround = CompiledRegex::Lookaround;
|
||||||
|
|
||||||
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
|
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
|
||||||
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
|
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
|
||||||
{
|
{
|
||||||
if (base(pos) == end)
|
if (pos == end)
|
||||||
return false;
|
return false;
|
||||||
Codepoint cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1));
|
Codepoint cp = (look_direction == MatchDirection::Forward ? codepoint(pos) : prev_codepoint(pos));
|
||||||
if (ignore_case)
|
if (ignore_case)
|
||||||
cp = to_lower(cp);
|
cp = to_lower(cp);
|
||||||
|
|
||||||
|
@ -585,39 +582,46 @@ private:
|
||||||
else if (static_cast<Codepoint>(op) != cp)
|
else if (static_cast<Codepoint>(op) != cp)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
(look_direction == MatchDirection::Forward) ? next(pos) : prev(pos);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool is_line_start(const EffectiveIt& pos, const ExecConfig& config)
|
static bool is_line_start(const Utf8It& pos, const ExecConfig& config)
|
||||||
{
|
{
|
||||||
if (base(pos) == config.subject_begin)
|
if (pos == config.subject_begin)
|
||||||
return not (config.flags & RegexExecFlags::NotBeginOfLine);
|
return not (config.flags & RegexExecFlags::NotBeginOfLine);
|
||||||
return *(pos-1) == '\n';
|
return prev_codepoint(pos) == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool is_line_end(const EffectiveIt& pos, const ExecConfig& config)
|
static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
|
||||||
{
|
{
|
||||||
if (base(pos) == config.subject_end)
|
if (pos == config.subject_end)
|
||||||
return not (config.flags & RegexExecFlags::NotEndOfLine);
|
return not (config.flags & RegexExecFlags::NotEndOfLine);
|
||||||
return *pos == '\n';
|
return codepoint(pos) == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool is_word_boundary(const EffectiveIt& pos, const ExecConfig& config)
|
static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
|
||||||
{
|
{
|
||||||
if (base(pos) == config.subject_begin)
|
if (pos == config.subject_begin)
|
||||||
return not (config.flags & RegexExecFlags::NotBeginOfWord);
|
return not (config.flags & RegexExecFlags::NotBeginOfWord);
|
||||||
if (base(pos) == config.subject_end)
|
if (pos == config.subject_end)
|
||||||
return not (config.flags & RegexExecFlags::NotEndOfWord);
|
return not (config.flags & RegexExecFlags::NotEndOfWord);
|
||||||
return is_word(*(pos-1)) != is_word(*pos);
|
return is_word(prev_codepoint(pos)) != is_word(codepoint(pos));
|
||||||
}
|
}
|
||||||
|
|
||||||
static Codepoint read(Utf8It& it) { return it.read(); }
|
static Codepoint read(Utf8It& it)
|
||||||
static Codepoint read(std::reverse_iterator<Utf8It>& it) { Codepoint cp = *it; ++it; return cp; }
|
{
|
||||||
|
if (direction == MatchDirection::Forward)
|
||||||
|
return it.read();
|
||||||
|
else
|
||||||
|
return *--it;
|
||||||
|
}
|
||||||
|
|
||||||
static const Iterator& base(const Utf8It& it) { return it.base(); }
|
static constexpr Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
|
||||||
static Iterator base(const std::reverse_iterator<Utf8It>& it) { return it.base().base(); }
|
static constexpr Utf8It& next(Utf8It& it) { return (direction == MatchDirection::Forward) ? ++it : --it; }
|
||||||
|
static constexpr Utf8It& prev(Utf8It& it) { return (direction == MatchDirection::Forward) ? --it : ++it; }
|
||||||
|
static constexpr Codepoint prev_codepoint(Utf8It it) { return codepoint(prev(it)); }
|
||||||
|
|
||||||
const CompiledRegex& m_program;
|
const CompiledRegex& m_program;
|
||||||
|
|
||||||
|
|
|
@ -133,7 +133,7 @@ public:
|
||||||
return (CodepointType)utf8::read_codepoint<InvalidPolicy>(m_it, m_end);
|
return (CodepointType)utf8::read_codepoint<InvalidPolicy>(m_it, m_end);
|
||||||
}
|
}
|
||||||
|
|
||||||
const BaseIt& base() const noexcept(noexcept_policy) { return m_it; }
|
const BaseIt& base() const noexcept { return m_it; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
BaseIt m_it;
|
BaseIt m_it;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user