Use custom code instead of reverse_iterator in Regex VM

This commit is contained in:
Maxime Coste 2018-11-02 08:23:39 +11:00
parent 6fce8050ee
commit ee74c2c2df
2 changed files with 50 additions and 46 deletions

View File

@ -220,7 +220,7 @@ public:
instructions instructions
}; };
EffectiveIt start{Utf8It{ Utf8It start{Utf8It{
forward ? begin : end, forward ? begin : end,
Sentinel{subject_begin}, Sentinel{subject_begin},
Sentinel{subject_end} Sentinel{subject_end}
@ -232,11 +232,11 @@ public:
if (search) if (search)
{ {
to_next_start(start, config.end, *start_desc); to_next_start(start, config.end, *start_desc);
if (base(start) == config.end) // If start_desc is not null, it means we consume at least one char if (start == config.end) // If start_desc is not null, it means we consume at least one char
return false; return false;
} }
else if (base(start) != config.end and else if (start != config.end and
not start_desc->map[*start < StartDesc::count ? *start : StartDesc::other]) not start_desc->map[codepoint(start) < StartDesc::count ? codepoint(start) : StartDesc::other])
return false; return false;
} }
@ -306,9 +306,6 @@ private:
using StartDesc = CompiledRegex::StartDesc; using StartDesc = CompiledRegex::StartDesc;
using Sentinel = typename SentinelType<Iterator>::Type; using Sentinel = typename SentinelType<Iterator>::Type;
using Utf8It = utf8::iterator<Iterator, Sentinel>; using Utf8It = utf8::iterator<Iterator, Sentinel>;
using EffectiveIt = std::conditional_t<direction == MatchDirection::Forward,
Utf8It, std::reverse_iterator<Utf8It>>;
struct ExecConfig struct ExecConfig
{ {
const Sentinel begin; const Sentinel begin;
@ -322,7 +319,7 @@ private:
enum class StepResult { Consumed, Matched, Failed, FindNextStart }; enum class StepResult { Consumed, Matched, Failed, FindNextStart };
// Steps a thread until it consumes the current character, matches or fails // Steps a thread until it consumes the current character, matches or fails
StepResult step(EffectiveIt& pos, uint16_t current_step, Thread& thread, const ExecConfig& config) StepResult step_thread(const Utf8It& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
{ {
const bool no_saves = (config.flags & RegexExecFlags::NoSaves); const bool no_saves = (config.flags & RegexExecFlags::NoSaves);
auto* instructions = m_program.instructions.data(); auto* instructions = m_program.instructions.data();
@ -338,17 +335,17 @@ private:
switch (inst.op) switch (inst.op)
{ {
case CompiledRegex::Literal: case CompiledRegex::Literal:
if (base(pos) != config.end and inst.param == *pos) if (pos != config.end and inst.param == codepoint(pos))
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::Literal_IgnoreCase: case CompiledRegex::Literal_IgnoreCase:
if (base(pos) != config.end and inst.param == to_lower(*pos)) if (pos != config.end and inst.param == to_lower(codepoint(pos)))
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
return StepResult::Consumed; return StepResult::Consumed;
case CompiledRegex::AnyCharExceptNewLine: case CompiledRegex::AnyCharExceptNewLine:
if (base(pos) != config.end and *pos != '\n') if (pos != config.end and codepoint(pos) != '\n')
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::Jump: case CompiledRegex::Jump:
@ -380,18 +377,18 @@ private:
--m_saves[thread.saves]->refcount; --m_saves[thread.saves]->refcount;
thread.saves = new_saves<true>(m_saves[thread.saves]->pos); thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
} }
m_saves[thread.saves]->pos[inst.param] = base(pos); m_saves[thread.saves]->pos[inst.param] = pos.base();
break; break;
} }
case CompiledRegex::Class: case CompiledRegex::Class:
if (base(pos) == config.end) if (pos == config.end)
return StepResult::Failed; return StepResult::Failed;
return is_character_class(m_program.character_classes[inst.param], *pos) ? return is_character_class(m_program.character_classes[inst.param], codepoint(pos)) ?
StepResult::Consumed : StepResult::Failed; StepResult::Consumed : StepResult::Failed;
case CompiledRegex::CharacterType: case CompiledRegex::CharacterType:
if (base(pos) == config.end) if (pos == config.end)
return StepResult::Failed; return StepResult::Failed;
return is_ctype((CharacterType)inst.param, *pos) ? return is_ctype((CharacterType)inst.param, codepoint(pos)) ?
StepResult::Consumed : StepResult::Failed; StepResult::Consumed : StepResult::Failed;
case CompiledRegex::LineStart: case CompiledRegex::LineStart:
if (not is_line_start(pos, config)) if (not is_line_start(pos, config))
@ -410,11 +407,11 @@ private:
return StepResult::Failed; return StepResult::Failed;
break; break;
case CompiledRegex::SubjectBegin: case CompiledRegex::SubjectBegin:
if (base(pos) != config.subject_begin) if (pos != config.subject_begin)
return StepResult::Failed; return StepResult::Failed;
break; break;
case CompiledRegex::SubjectEnd: case CompiledRegex::SubjectEnd:
if (base(pos) != config.subject_end) if (pos != config.subject_end)
return StepResult::Failed; return StepResult::Failed;
break; break;
case CompiledRegex::LookAhead: case CompiledRegex::LookAhead:
@ -453,7 +450,7 @@ private:
return StepResult::Failed; return StepResult::Failed;
} }
bool exec_program(EffectiveIt pos, const ExecConfig& config) bool exec_program(Utf8It pos, const ExecConfig& config)
{ {
kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty()); kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty());
release_saves(m_captures); release_saves(m_captures);
@ -479,11 +476,11 @@ private:
while (not m_threads.current_is_empty()) while (not m_threads.current_is_empty())
{ {
auto thread = m_threads.pop_current(); auto thread = m_threads.pop_current();
switch (step(pos, current_step, thread, config)) switch (step_thread(pos, current_step, thread, config))
{ {
case StepResult::Matched: case StepResult::Matched:
if ((base(pos) != config.end and not (config.flags & RegexExecFlags::Search)) or if ((pos != config.end and not (config.flags & RegexExecFlags::Search)) or
(config.flags & RegexExecFlags::NotInitialNull and base(pos) == config.begin)) (config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
{ {
release_saves(thread.saves); release_saves(thread.saves);
continue; continue;
@ -518,7 +515,7 @@ private:
for (auto& thread : m_threads.next_threads()) for (auto& thread : m_threads.next_threads())
m_program.instructions[thread.inst].scheduled = false; m_program.instructions[thread.inst].scheduled = false;
if (base(pos) == config.end or m_threads.next_is_empty() or if (pos == config.end or m_threads.next_is_empty() or
(found_match and (config.flags & RegexExecFlags::AnyMatch))) (found_match and (config.flags & RegexExecFlags::AnyMatch)))
{ {
for (auto& t : m_threads.next_threads()) for (auto& t : m_threads.next_threads())
@ -528,37 +525,37 @@ private:
} }
m_threads.swap_next(); m_threads.swap_next();
++pos; next(pos);
if (find_next_start and start_desc) if (find_next_start and start_desc)
to_next_start(pos, config.end, *start_desc); to_next_start(pos, config.end, *start_desc);
} }
} }
void to_next_start(EffectiveIt& start, const Sentinel& end, const StartDesc& start_desc) void to_next_start(Utf8It& start, const Sentinel& end, const StartDesc& start_desc)
{ {
while (base(start) != end) while (start != end)
{ {
const Codepoint cp = read(start); const Codepoint cp = read(start);
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other]) if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
{ {
--start; prev(start);
return; return;
} }
} }
} }
template<MatchDirection look_direction, bool ignore_case> template<MatchDirection look_direction, bool ignore_case>
bool lookaround(uint32_t index, EffectiveIt pos, const ExecConfig& config) const bool lookaround(uint32_t index, Utf8It pos, const ExecConfig& config) const
{ {
using Lookaround = CompiledRegex::Lookaround; using Lookaround = CompiledRegex::Lookaround;
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin); const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it) for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
{ {
if (base(pos) == end) if (pos == end)
return false; return false;
Codepoint cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)); Codepoint cp = (look_direction == MatchDirection::Forward ? codepoint(pos) : prev_codepoint(pos));
if (ignore_case) if (ignore_case)
cp = to_lower(cp); cp = to_lower(cp);
@ -585,39 +582,46 @@ private:
else if (static_cast<Codepoint>(op) != cp) else if (static_cast<Codepoint>(op) != cp)
return false; return false;
(look_direction == MatchDirection::Forward) ? ++pos : --pos; (look_direction == MatchDirection::Forward) ? next(pos) : prev(pos);
} }
return true; return true;
} }
static bool is_line_start(const EffectiveIt& pos, const ExecConfig& config) static bool is_line_start(const Utf8It& pos, const ExecConfig& config)
{ {
if (base(pos) == config.subject_begin) if (pos == config.subject_begin)
return not (config.flags & RegexExecFlags::NotBeginOfLine); return not (config.flags & RegexExecFlags::NotBeginOfLine);
return *(pos-1) == '\n'; return prev_codepoint(pos) == '\n';
} }
static bool is_line_end(const EffectiveIt& pos, const ExecConfig& config) static bool is_line_end(const Utf8It& pos, const ExecConfig& config)
{ {
if (base(pos) == config.subject_end) if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfLine); return not (config.flags & RegexExecFlags::NotEndOfLine);
return *pos == '\n'; return codepoint(pos) == '\n';
} }
static bool is_word_boundary(const EffectiveIt& pos, const ExecConfig& config) static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config)
{ {
if (base(pos) == config.subject_begin) if (pos == config.subject_begin)
return not (config.flags & RegexExecFlags::NotBeginOfWord); return not (config.flags & RegexExecFlags::NotBeginOfWord);
if (base(pos) == config.subject_end) if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfWord); return not (config.flags & RegexExecFlags::NotEndOfWord);
return is_word(*(pos-1)) != is_word(*pos); return is_word(prev_codepoint(pos)) != is_word(codepoint(pos));
} }
static Codepoint read(Utf8It& it) { return it.read(); } static Codepoint read(Utf8It& it)
static Codepoint read(std::reverse_iterator<Utf8It>& it) { Codepoint cp = *it; ++it; return cp; } {
if (direction == MatchDirection::Forward)
return it.read();
else
return *--it;
}
static const Iterator& base(const Utf8It& it) { return it.base(); } static constexpr Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); }
static Iterator base(const std::reverse_iterator<Utf8It>& it) { return it.base().base(); } static constexpr Utf8It& next(Utf8It& it) { return (direction == MatchDirection::Forward) ? ++it : --it; }
static constexpr Utf8It& prev(Utf8It& it) { return (direction == MatchDirection::Forward) ? --it : ++it; }
static constexpr Codepoint prev_codepoint(Utf8It it) { return codepoint(prev(it)); }
const CompiledRegex& m_program; const CompiledRegex& m_program;

View File

@ -133,7 +133,7 @@ public:
return (CodepointType)utf8::read_codepoint<InvalidPolicy>(m_it, m_end); return (CodepointType)utf8::read_codepoint<InvalidPolicy>(m_it, m_end);
} }
const BaseIt& base() const noexcept(noexcept_policy) { return m_it; } const BaseIt& base() const noexcept { return m_it; }
private: private:
BaseIt m_it; BaseIt m_it;