Regex: do not write the search prefix inside the program bytecode
It's faster to have specialized code in the VM directly
This commit is contained in:
parent
cf5055f68b
commit
dbb175841b
|
@ -502,7 +502,6 @@ struct RegexCompiler
|
||||||
RegexCompiler(const ParsedRegex& parsed_regex)
|
RegexCompiler(const ParsedRegex& parsed_regex)
|
||||||
: m_parsed_regex{parsed_regex}
|
: m_parsed_regex{parsed_regex}
|
||||||
{
|
{
|
||||||
write_search_prefix();
|
|
||||||
compile_node(m_parsed_regex.ast);
|
compile_node(m_parsed_regex.ast);
|
||||||
push_op(CompiledRegex::Match);
|
push_op(CompiledRegex::Match);
|
||||||
m_program.matchers = m_parsed_regex.matchers;
|
m_program.matchers = m_parsed_regex.matchers;
|
||||||
|
@ -659,17 +658,6 @@ private:
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a '.*' as the first instructions for the search use case
|
|
||||||
void write_search_prefix()
|
|
||||||
{
|
|
||||||
kak_assert(m_program.bytecode.empty());
|
|
||||||
push_op(CompiledRegex::Split_PrioritizeChild);
|
|
||||||
get_offset(alloc_offset()) = CompiledRegex::search_prefix_size;
|
|
||||||
push_op(CompiledRegex::AnyChar);
|
|
||||||
push_op(CompiledRegex::Split_PrioritizeParent);
|
|
||||||
get_offset(alloc_offset()) = 1 + sizeof(Offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
Offset alloc_offset()
|
Offset alloc_offset()
|
||||||
{
|
{
|
||||||
auto pos = m_program.bytecode.size();
|
auto pos = m_program.bytecode.size();
|
||||||
|
|
|
@ -36,8 +36,6 @@ struct CompiledRegex
|
||||||
};
|
};
|
||||||
|
|
||||||
using Offset = unsigned;
|
using Offset = unsigned;
|
||||||
static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
|
|
||||||
|
|
||||||
explicit operator bool() const { return not bytecode.empty(); }
|
explicit operator bool() const { return not bytecode.empty(); }
|
||||||
|
|
||||||
Vector<char> bytecode;
|
Vector<char> bytecode;
|
||||||
|
@ -103,18 +101,18 @@ struct ThreadedRegexVM
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Saves* clone_saves(Saves* saves)
|
Saves* clone_saves(Iterator* pos)
|
||||||
{
|
{
|
||||||
if (not m_free_saves.empty())
|
if (not m_free_saves.empty())
|
||||||
{
|
{
|
||||||
Saves* res = m_free_saves.back();
|
Saves* res = m_free_saves.back();
|
||||||
m_free_saves.pop_back();
|
m_free_saves.pop_back();
|
||||||
res->refcount = 1;
|
res->refcount = 1;
|
||||||
std::copy(saves->pos, saves->pos + m_program.save_count, res->pos);
|
std::copy(pos, pos + m_program.save_count, res->pos);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_saves.push_back(Saves::allocate(m_program.save_count, saves->pos));
|
m_saves.push_back(Saves::allocate(m_program.save_count, pos));
|
||||||
return m_saves.back();
|
return m_saves.back();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -130,6 +128,8 @@ struct ThreadedRegexVM
|
||||||
Saves* saves;
|
Saves* saves;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
using Utf8It = utf8::iterator<Iterator>;
|
||||||
|
|
||||||
enum class StepResult { Consumed, Matched, Failed };
|
enum class StepResult { Consumed, Matched, Failed };
|
||||||
StepResult step(Thread& thread, Vector<Thread>& threads)
|
StepResult step(Thread& thread, Vector<Thread>& threads)
|
||||||
{
|
{
|
||||||
|
@ -181,7 +181,7 @@ struct ThreadedRegexVM
|
||||||
if (thread.saves->refcount > 1)
|
if (thread.saves->refcount > 1)
|
||||||
{
|
{
|
||||||
--thread.saves->refcount;
|
--thread.saves->refcount;
|
||||||
thread.saves = clone_saves(thread.saves);
|
thread.saves = clone_saves(thread.saves->pos);
|
||||||
}
|
}
|
||||||
const size_t index = *thread.inst++;
|
const size_t index = *thread.inst++;
|
||||||
thread.saves->pos[index] = m_pos.base();
|
thread.saves->pos[index] = m_pos.base();
|
||||||
|
@ -250,30 +250,13 @@ struct ThreadedRegexVM
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
|
bool exec_from(Utf8It start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads)
|
||||||
{
|
{
|
||||||
m_begin = begin;
|
current_threads.push_back({m_program.bytecode.data(), initial_saves});
|
||||||
m_end = end;
|
next_threads.clear();
|
||||||
m_flags = flags;
|
|
||||||
|
|
||||||
bool found_match = false;
|
bool found_match = false;
|
||||||
|
for (m_pos = start; m_pos != m_end; ++m_pos)
|
||||||
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
Saves* initial_saves = nullptr;
|
|
||||||
if (not (m_flags & RegexExecFlags::NoSaves))
|
|
||||||
{
|
|
||||||
m_saves.push_back(Saves::allocate(m_program.save_count));
|
|
||||||
initial_saves = m_saves.back();
|
|
||||||
}
|
|
||||||
|
|
||||||
const bool search = (flags & RegexExecFlags::Search);
|
|
||||||
|
|
||||||
const auto start_offset = search ? 0 : CompiledRegex::search_prefix_size;
|
|
||||||
Vector<Thread> current_threads{Thread{m_program.bytecode.data() + start_offset, initial_saves}};
|
|
||||||
Vector<Thread> next_threads;
|
|
||||||
for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
|
|
||||||
{
|
{
|
||||||
while (not current_threads.empty())
|
while (not current_threads.empty())
|
||||||
{
|
{
|
||||||
|
@ -282,15 +265,15 @@ struct ThreadedRegexVM
|
||||||
switch (step(thread, current_threads))
|
switch (step(thread, current_threads))
|
||||||
{
|
{
|
||||||
case StepResult::Matched:
|
case StepResult::Matched:
|
||||||
if (not search or // We are not at end, this is not a full match
|
if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
|
||||||
(flags & RegexExecFlags::NotInitialNull and m_pos == m_begin))
|
(m_flags & RegexExecFlags::NotInitialNull and m_pos == m_begin))
|
||||||
{
|
{
|
||||||
release_saves(thread.saves);
|
release_saves(thread.saves);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_captures = thread.saves;
|
m_captures = thread.saves;
|
||||||
if (flags & RegexExecFlags::AnyMatch)
|
if (m_flags & RegexExecFlags::AnyMatch)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
found_match = true;
|
found_match = true;
|
||||||
|
@ -330,6 +313,37 @@ struct ThreadedRegexVM
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
|
||||||
|
{
|
||||||
|
m_begin = begin;
|
||||||
|
m_end = end;
|
||||||
|
m_flags = flags;
|
||||||
|
|
||||||
|
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Vector<Thread> current_threads, next_threads;
|
||||||
|
|
||||||
|
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
|
||||||
|
Vector<Iterator> empty_saves(m_program.save_count, Iterator{});
|
||||||
|
|
||||||
|
Utf8It start{m_begin, m_begin, m_end};
|
||||||
|
if (exec_from(start, no_saves ? nullptr : clone_saves(empty_saves.data()),
|
||||||
|
current_threads, next_threads))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (not (flags & RegexExecFlags::Search))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
while (start != end)
|
||||||
|
{
|
||||||
|
if (exec_from(++start, no_saves ? nullptr : clone_saves(empty_saves.data()),
|
||||||
|
current_threads, next_threads))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
bool is_line_start() const
|
bool is_line_start() const
|
||||||
{
|
{
|
||||||
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
|
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
|
||||||
|
@ -351,8 +365,6 @@ struct ThreadedRegexVM
|
||||||
|
|
||||||
const CompiledRegex& m_program;
|
const CompiledRegex& m_program;
|
||||||
|
|
||||||
using Utf8It = utf8::iterator<Iterator>;
|
|
||||||
|
|
||||||
Iterator m_begin;
|
Iterator m_begin;
|
||||||
Iterator m_end;
|
Iterator m_end;
|
||||||
Utf8It m_pos;
|
Utf8It m_pos;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user