Remove use of utf8::iterator in regex execution

This avoids having two copies of the subject string bounds, one
in the ExecConfig and one in the utf8 iterator.
This commit is contained in:
Maxime Coste 2018-11-05 08:17:50 +11:00
parent b4571bd172
commit 7463a0d449

View File

@ -6,7 +6,6 @@
#include "ref_ptr.hh" #include "ref_ptr.hh"
#include "unicode.hh" #include "unicode.hh"
#include "utf8.hh" #include "utf8.hh"
#include "utf8_iterator.hh"
#include "vector.hh" #include "vector.hh"
#include "utils.hh" #include "utils.hh"
@ -167,7 +166,7 @@ public:
ThreadedRegexVM(const CompiledRegex& program) ThreadedRegexVM(const CompiledRegex& program)
: m_program{program} : m_program{program}
{ {
kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or kak_assert((forward and program.first_backward_inst != 0) or
(direction == MatchDirection::Backward and program.first_backward_inst != -1)); (direction == MatchDirection::Backward and program.first_backward_inst != -1));
} }
@ -195,15 +194,13 @@ public:
const bool search = (flags & RegexExecFlags::Search); const bool search = (flags & RegexExecFlags::Search);
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions}; ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
if (direction == MatchDirection::Forward) if (forward)
instructions = instructions.subrange(0, m_program.first_backward_inst); instructions = instructions.subrange(0, m_program.first_backward_inst);
else else
instructions = instructions.subrange(m_program.first_backward_inst); instructions = instructions.subrange(m_program.first_backward_inst);
if (not search) if (not search)
instructions = instructions.subrange(CompiledRegex::search_prefix_size); instructions = instructions.subrange(CompiledRegex::search_prefix_size);
constexpr bool forward = direction == MatchDirection::Forward;
const ExecConfig config{ const ExecConfig config{
Sentinel{forward ? begin : end}, Sentinel{forward ? begin : end},
Sentinel{forward ? end : begin}, Sentinel{forward ? end : begin},
@ -213,23 +210,21 @@ public:
instructions instructions
}; };
Utf8It start{Utf8It{ Iterator start = forward ? begin : end;
forward ? begin : end,
Sentinel{subject_begin},
Sentinel{subject_end}
}};
if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc) if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc)
{ {
if (search) if (search)
{ {
to_next_start(start, config.end, *start_desc); to_next_start(start, config, *start_desc);
if (start == config.end) // If start_desc is not null, it means we consume at least one char if (start == config.end) // If start_desc is not null, it means we consume at least one char
return false; return false;
} }
else if (start != config.end and else if (start != config.end)
not start_desc->map[codepoint(start) < StartDesc::count ? codepoint(start) : StartDesc::other]) {
return false; const Codepoint cp = codepoint(start, config);
if (not start_desc->map[cp < StartDesc::count ? cp : StartDesc::other])
return false;
}
} }
return exec_program(std::move(start), config); return exec_program(std::move(start), config);
@ -297,7 +292,6 @@ private:
using StartDesc = CompiledRegex::StartDesc; using StartDesc = CompiledRegex::StartDesc;
using Sentinel = typename SentinelType<Iterator>::Type; using Sentinel = typename SentinelType<Iterator>::Type;
using Utf8It = utf8::iterator<Iterator, Sentinel>;
struct ExecConfig struct ExecConfig
{ {
const Sentinel begin; const Sentinel begin;
@ -311,7 +305,7 @@ private:
enum class StepResult { Consumed, Matched, Failed, FindNextStart }; enum class StepResult { Consumed, Matched, Failed, FindNextStart };
// Steps a thread until it consumes the current character, matches or fails // Steps a thread until it consumes the current character, matches or fails
StepResult step_thread(const Utf8It& pos, uint16_t current_step, Thread& thread, const ExecConfig& config) StepResult step_thread(const Iterator& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
{ {
const bool no_saves = (config.flags & RegexExecFlags::NoSaves); const bool no_saves = (config.flags & RegexExecFlags::NoSaves);
auto* instructions = m_program.instructions.data(); auto* instructions = m_program.instructions.data();
@ -327,17 +321,17 @@ private:
switch (inst.op) switch (inst.op)
{ {
case CompiledRegex::Literal: case CompiledRegex::Literal:
if (pos != config.end and inst.param == codepoint(pos)) if (pos != config.end and inst.param == codepoint(pos, config))
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::Literal_IgnoreCase: case CompiledRegex::Literal_IgnoreCase:
if (pos != config.end and inst.param == to_lower(codepoint(pos))) if (pos != config.end and inst.param == to_lower(codepoint(pos, config)))
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
return StepResult::Consumed; return StepResult::Consumed;
case CompiledRegex::AnyCharExceptNewLine: case CompiledRegex::AnyCharExceptNewLine:
if (pos != config.end and codepoint(pos) != '\n') if (pos != config.end and codepoint(pos, config) != '\n')
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::Jump: case CompiledRegex::Jump:
@ -369,18 +363,18 @@ private:
--m_saves[thread.saves]->refcount; --m_saves[thread.saves]->refcount;
thread.saves = new_saves<true>(m_saves[thread.saves]->pos); thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
} }
m_saves[thread.saves]->pos[inst.param] = pos.base(); m_saves[thread.saves]->pos[inst.param] = pos;
break; break;
} }
case CompiledRegex::Class: case CompiledRegex::Class:
if (pos == config.end) if (pos == config.end)
return StepResult::Failed; return StepResult::Failed;
return is_character_class(m_program.character_classes[inst.param], codepoint(pos)) ? return is_character_class(m_program.character_classes[inst.param], codepoint(pos, config)) ?
StepResult::Consumed : StepResult::Failed; StepResult::Consumed : StepResult::Failed;
case CompiledRegex::CharacterType: case CompiledRegex::CharacterType:
if (pos == config.end) if (pos == config.end)
return StepResult::Failed; return StepResult::Failed;
return is_ctype((CharacterType)inst.param, codepoint(pos)) ? return is_ctype((CharacterType)inst.param, codepoint(pos, config)) ?
StepResult::Consumed : StepResult::Failed; StepResult::Consumed : StepResult::Failed;
case CompiledRegex::LineStart: case CompiledRegex::LineStart:
if (not is_line_start(pos, config)) if (not is_line_start(pos, config))
@ -442,15 +436,14 @@ private:
return StepResult::Failed; return StepResult::Failed;
} }
bool exec_program(Utf8It pos, const ExecConfig& config) bool exec_program(Iterator pos, const ExecConfig& config)
{ {
kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty()); kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty());
release_saves(m_captures); release_saves(m_captures);
m_captures = -1; m_captures = -1;
m_threads.push_current({static_cast<int16_t>(&config.instructions[0] - &m_program.instructions[0]), -1}); m_threads.push_current({static_cast<int16_t>(&config.instructions[0] - &m_program.instructions[0]), -1});
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc;
: m_program.backward_start_desc;
uint16_t current_step = -1; uint16_t current_step = -1;
bool found_match = false; bool found_match = false;
@ -517,28 +510,30 @@ private:
} }
m_threads.swap_next(); m_threads.swap_next();
(direction == MatchDirection::Forward) ? ++pos : --pos; forward ? utf8::to_next(pos, config.subject_end)
: utf8::to_previous(pos, config.subject_begin);
if (find_next_start and start_desc) if (find_next_start and start_desc)
to_next_start(pos, config.end, *start_desc); to_next_start(pos, config, *start_desc);
} }
} }
void to_next_start(Utf8It& start, const Sentinel& end, const StartDesc& start_desc) void to_next_start(Iterator& start, const ExecConfig& config, const StartDesc& start_desc)
{ {
while (start != end) while (start != config.end)
{ {
const Codepoint cp = read_codepoint(start); const Codepoint cp = read_codepoint(start, config);
if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other]) if (start_desc.map[(cp >= 0 and cp < StartDesc::count) ? cp : StartDesc::other])
{ {
(direction == MatchDirection::Forward) ? --start : ++start; forward ? utf8::to_previous(start, config.subject_begin)
: utf8::to_next(start, config.subject_end);
return; return;
} }
} }
} }
template<MatchDirection look_direction, bool ignore_case> template<MatchDirection look_direction, bool ignore_case>
bool lookaround(uint32_t index, Utf8It pos, const ExecConfig& config) const bool lookaround(uint32_t index, Iterator pos, const ExecConfig& config) const
{ {
using Lookaround = CompiledRegex::Lookaround; using Lookaround = CompiledRegex::Lookaround;
@ -546,7 +541,7 @@ private:
{ {
if (pos == config.subject_begin) if (pos == config.subject_begin)
return m_program.lookarounds[index] == Lookaround::EndOfLookaround; return m_program.lookarounds[index] == Lookaround::EndOfLookaround;
--pos; utf8::to_previous(pos, config.subject_begin);
} }
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it) for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
@ -554,7 +549,7 @@ private:
if (look_direction == MatchDirection::Forward and pos == config.subject_end) if (look_direction == MatchDirection::Forward and pos == config.subject_end)
return false; return false;
Codepoint cp = *pos; Codepoint cp = utf8::codepoint(pos, config.subject_end);
if (ignore_case) if (ignore_case)
cp = to_lower(cp); cp = to_lower(cp);
@ -584,43 +579,52 @@ private:
if (look_direction == MatchDirection::Backward and pos == config.subject_begin) if (look_direction == MatchDirection::Backward and pos == config.subject_begin)
return *++it == Lookaround::EndOfLookaround; return *++it == Lookaround::EndOfLookaround;
(look_direction == MatchDirection::Forward) ? ++pos : --pos; (look_direction == MatchDirection::Forward) ? utf8::to_next(pos, config.subject_end)
: utf8::to_previous(pos, config.subject_begin);
} }
return true; return true;
} }
static bool is_line_start(const Utf8It& pos, const ExecConfig& config) static bool is_line_start(const Iterator& pos, const ExecConfig& config)
{ {
if (pos == config.subject_begin) if (pos == config.subject_begin)
return not (config.flags & RegexExecFlags::NotBeginOfLine); return not (config.flags & RegexExecFlags::NotBeginOfLine);
return *(pos-1) == '\n'; return utf8::codepoint(utf8::previous(pos, config.subject_begin), config.subject_end) == '\n';
} }
static bool is_line_end(const Utf8It& pos, const ExecConfig& config) static bool is_line_end(const Iterator& pos, const ExecConfig& config)
{ {
if (pos == config.subject_end) if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfLine); return not (config.flags & RegexExecFlags::NotEndOfLine);
return *pos == '\n'; return utf8::codepoint(pos, config.subject_end) == '\n';
} }
static bool is_word_boundary(const Utf8It& pos, const ExecConfig& config) static bool is_word_boundary(const Iterator& pos, const ExecConfig& config)
{ {
if (pos == config.subject_begin) if (pos == config.subject_begin)
return not (config.flags & RegexExecFlags::NotBeginOfWord); return not (config.flags & RegexExecFlags::NotBeginOfWord);
if (pos == config.subject_end) if (pos == config.subject_end)
return not (config.flags & RegexExecFlags::NotEndOfWord); return not (config.flags & RegexExecFlags::NotEndOfWord);
return is_word(*(pos-1)) != is_word(*pos); return is_word(utf8::codepoint(utf8::previous(pos, config.subject_begin), config.subject_end)) !=
is_word(utf8::codepoint(pos, config.subject_end));
} }
static Codepoint read_codepoint(Utf8It& it) static Codepoint read_codepoint(Iterator& it, const ExecConfig& config)
{ {
if (direction == MatchDirection::Forward) if (forward)
return it.read(); return utf8::read_codepoint(it, config.subject_end);
else else
return *--it; {
utf8::to_previous(it, config.subject_begin);
return utf8::codepoint(it, config.subject_end);
}
} }
static Codepoint codepoint(const Utf8It& it) { return (direction == MatchDirection::Forward) ? *it : *(it - 1); } static Codepoint codepoint(const Iterator& it, const ExecConfig& config)
{
return utf8::codepoint(forward ? it : utf8::previous(it, config.subject_begin),
config.subject_end);
}
const CompiledRegex& m_program; const CompiledRegex& m_program;
@ -664,6 +668,8 @@ private:
int32_t m_next = 0; int32_t m_next = 0;
}; };
static constexpr bool forward = direction == MatchDirection::Forward;
DualThreadStack m_threads; DualThreadStack m_threads;
Vector<Saves*, MemoryDomain::Regex> m_saves; Vector<Saves*, MemoryDomain::Regex> m_saves;
int16_t m_first_free = -1; int16_t m_first_free = -1;