Regex: do not write the search prefix inside the program bytecode

It's faster to have specialized code in the VM directly.
This commit is contained in:
Maxime Coste 2017-10-04 12:16:52 +08:00
parent cf5055f68b
commit dbb175841b
2 changed files with 44 additions and 44 deletions

View File

@ -502,7 +502,6 @@ struct RegexCompiler
RegexCompiler(const ParsedRegex& parsed_regex)
: m_parsed_regex{parsed_regex}
{
write_search_prefix();
compile_node(m_parsed_regex.ast);
push_op(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
@ -659,17 +658,6 @@ private:
return pos;
}
// Add a '.*' as the first instructions for the search use case
//
// Must run before anything else has been emitted: the assertion below
// requires the bytecode to still be empty. The sequence written here is
// three opcodes and two offset operands, which is the shape that
// CompiledRegex::search_prefix_size (3 + 2 * sizeof(Offset)) describes.
void write_search_prefix()
{
kak_assert(m_program.bytecode.empty());
// First split; its offset operand is set to the size of the whole prefix.
// NOTE(review): presumably this is the jump past the prefix into the
// actual pattern - confirm how the VM interprets split offsets.
push_op(CompiledRegex::Split_PrioritizeChild);
get_offset(alloc_offset()) = CompiledRegex::search_prefix_size;
// The '.' of the '.*'.
push_op(CompiledRegex::AnyChar);
// Second split; its offset operand is 1 + sizeof(Offset).
// NOTE(review): looks like the loop back onto the AnyChar above - confirm
// whether offsets are absolute or relative before relying on this.
push_op(CompiledRegex::Split_PrioritizeParent);
get_offset(alloc_offset()) = 1 + sizeof(Offset);
}
Offset alloc_offset()
{
auto pos = m_program.bytecode.size();

View File

@ -36,8 +36,6 @@ struct CompiledRegex
};
using Offset = unsigned;
static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
explicit operator bool() const { return not bytecode.empty(); }
Vector<char> bytecode;
@ -103,18 +101,18 @@ struct ThreadedRegexVM
}
};
Saves* clone_saves(Saves* saves)
Saves* clone_saves(Iterator* pos)
{
if (not m_free_saves.empty())
{
Saves* res = m_free_saves.back();
m_free_saves.pop_back();
res->refcount = 1;
std::copy(saves->pos, saves->pos + m_program.save_count, res->pos);
std::copy(pos, pos + m_program.save_count, res->pos);
return res;
}
m_saves.push_back(Saves::allocate(m_program.save_count, saves->pos));
m_saves.push_back(Saves::allocate(m_program.save_count, pos));
return m_saves.back();
}
@ -130,6 +128,8 @@ struct ThreadedRegexVM
Saves* saves;
};
using Utf8It = utf8::iterator<Iterator>;
enum class StepResult { Consumed, Matched, Failed };
StepResult step(Thread& thread, Vector<Thread>& threads)
{
@ -181,7 +181,7 @@ struct ThreadedRegexVM
if (thread.saves->refcount > 1)
{
--thread.saves->refcount;
thread.saves = clone_saves(thread.saves);
thread.saves = clone_saves(thread.saves->pos);
}
const size_t index = *thread.inst++;
thread.saves->pos[index] = m_pos.base();
@ -250,30 +250,13 @@ struct ThreadedRegexVM
return StepResult::Failed;
}
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
bool exec_from(Utf8It start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads)
{
m_begin = begin;
m_end = end;
m_flags = flags;
current_threads.push_back({m_program.bytecode.data(), initial_saves});
next_threads.clear();
bool found_match = false;
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
return false;
Saves* initial_saves = nullptr;
if (not (m_flags & RegexExecFlags::NoSaves))
{
m_saves.push_back(Saves::allocate(m_program.save_count));
initial_saves = m_saves.back();
}
const bool search = (flags & RegexExecFlags::Search);
const auto start_offset = search ? 0 : CompiledRegex::search_prefix_size;
Vector<Thread> current_threads{Thread{m_program.bytecode.data() + start_offset, initial_saves}};
Vector<Thread> next_threads;
for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
for (m_pos = start; m_pos != m_end; ++m_pos)
{
while (not current_threads.empty())
{
@ -282,15 +265,15 @@ struct ThreadedRegexVM
switch (step(thread, current_threads))
{
case StepResult::Matched:
if (not search or // We are not at end, this is not a full match
(flags & RegexExecFlags::NotInitialNull and m_pos == m_begin))
if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
(m_flags & RegexExecFlags::NotInitialNull and m_pos == m_begin))
{
release_saves(thread.saves);
continue;
}
m_captures = thread.saves;
if (flags & RegexExecFlags::AnyMatch)
if (m_flags & RegexExecFlags::AnyMatch)
return true;
found_match = true;
@ -330,6 +313,37 @@ struct ThreadedRegexVM
return false;
}
// Run the compiled program against [begin, end) using the given flags.
//
// A first attempt is made starting at begin. If it does not succeed and
// RegexExecFlags::Search is set, further attempts are made from every
// following position, the last one starting at end itself. Returns true as
// soon as one attempt reports success, false otherwise.
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
{
    m_begin = begin;
    m_end = end;
    m_flags = flags;

    // Nothing to do on an empty subject when NotInitialNull is requested.
    if ((flags & RegexExecFlags::NotInitialNull) and m_begin == m_end)
        return false;

    // Thread lists are shared by every attempt below.
    Vector<Thread> current_threads;
    Vector<Thread> next_threads;

    const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
    // Default-constructed positions used to seed the saves of each attempt.
    Vector<Iterator> blank_positions(m_program.save_count, Iterator{});

    // Each attempt gets its own initial saves, or none when NoSaves is set.
    auto make_initial_saves = [&]() -> Saves* {
        if (no_saves)
            return nullptr;
        return clone_saves(blank_positions.data());
    };

    Utf8It attempt_pos{m_begin, m_begin, m_end};
    if (exec_from(attempt_pos, make_initial_saves(), current_threads, next_threads))
        return true;

    if (flags & RegexExecFlags::Search)
    {
        // Retry from each subsequent position until the end has been tried.
        while (attempt_pos != end)
        {
            ++attempt_pos;
            if (exec_from(attempt_pos, make_initial_saves(), current_threads, next_threads))
                return true;
        }
    }
    return false;
}
bool is_line_start() const
{
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
@ -351,8 +365,6 @@ struct ThreadedRegexVM
const CompiledRegex& m_program;
using Utf8It = utf8::iterator<Iterator>;
Iterator m_begin;
Iterator m_end;
Utf8It m_pos;