Regex: do not write the search prefix inside the program bytecode

It's faster to have specialized code directly in the VM.
This commit is contained in:
Maxime Coste 2017-10-04 12:16:52 +08:00
parent cf5055f68b
commit dbb175841b
2 changed files with 44 additions and 44 deletions

View File

@ -502,7 +502,6 @@ struct RegexCompiler
RegexCompiler(const ParsedRegex& parsed_regex) RegexCompiler(const ParsedRegex& parsed_regex)
: m_parsed_regex{parsed_regex} : m_parsed_regex{parsed_regex}
{ {
write_search_prefix();
compile_node(m_parsed_regex.ast); compile_node(m_parsed_regex.ast);
push_op(CompiledRegex::Match); push_op(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers; m_program.matchers = m_parsed_regex.matchers;
@ -659,17 +658,6 @@ private:
return pos; return pos;
} }
// Emit the bytecode equivalent of a leading '.*' so the program can be used for searching.
// Written in order: Split_PrioritizeChild, offset, AnyChar, Split_PrioritizeParent, offset.
void write_search_prefix()
{
    // The prefix must be the very first thing in the program.
    kak_assert(m_program.bytecode.empty());

    push_op(CompiledRegex::Split_PrioritizeChild);
    const auto first_slot = alloc_offset();
    get_offset(first_slot) = CompiledRegex::search_prefix_size;

    push_op(CompiledRegex::AnyChar);

    push_op(CompiledRegex::Split_PrioritizeParent);
    const auto second_slot = alloc_offset();
    // NOTE(review): presumably the position of the AnyChar op above
    // (one opcode plus one offset into the program) -- confirm against push_op/alloc_offset.
    get_offset(second_slot) = 1 + sizeof(Offset);
}
Offset alloc_offset() Offset alloc_offset()
{ {
auto pos = m_program.bytecode.size(); auto pos = m_program.bytecode.size();

View File

@ -36,8 +36,6 @@ struct CompiledRegex
}; };
using Offset = unsigned; using Offset = unsigned;
static constexpr Offset search_prefix_size = 3 + 2 * sizeof(Offset);
explicit operator bool() const { return not bytecode.empty(); } explicit operator bool() const { return not bytecode.empty(); }
Vector<char> bytecode; Vector<char> bytecode;
@ -103,18 +101,18 @@ struct ThreadedRegexVM
} }
}; };
Saves* clone_saves(Saves* saves) Saves* clone_saves(Iterator* pos)
{ {
if (not m_free_saves.empty()) if (not m_free_saves.empty())
{ {
Saves* res = m_free_saves.back(); Saves* res = m_free_saves.back();
m_free_saves.pop_back(); m_free_saves.pop_back();
res->refcount = 1; res->refcount = 1;
std::copy(saves->pos, saves->pos + m_program.save_count, res->pos); std::copy(pos, pos + m_program.save_count, res->pos);
return res; return res;
} }
m_saves.push_back(Saves::allocate(m_program.save_count, saves->pos)); m_saves.push_back(Saves::allocate(m_program.save_count, pos));
return m_saves.back(); return m_saves.back();
} }
@ -130,6 +128,8 @@ struct ThreadedRegexVM
Saves* saves; Saves* saves;
}; };
using Utf8It = utf8::iterator<Iterator>;
enum class StepResult { Consumed, Matched, Failed }; enum class StepResult { Consumed, Matched, Failed };
StepResult step(Thread& thread, Vector<Thread>& threads) StepResult step(Thread& thread, Vector<Thread>& threads)
{ {
@ -181,7 +181,7 @@ struct ThreadedRegexVM
if (thread.saves->refcount > 1) if (thread.saves->refcount > 1)
{ {
--thread.saves->refcount; --thread.saves->refcount;
thread.saves = clone_saves(thread.saves); thread.saves = clone_saves(thread.saves->pos);
} }
const size_t index = *thread.inst++; const size_t index = *thread.inst++;
thread.saves->pos[index] = m_pos.base(); thread.saves->pos[index] = m_pos.base();
@ -250,30 +250,13 @@ struct ThreadedRegexVM
return StepResult::Failed; return StepResult::Failed;
} }
bool exec(Iterator begin, Iterator end, RegexExecFlags flags) bool exec_from(Utf8It start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads)
{ {
m_begin = begin; current_threads.push_back({m_program.bytecode.data(), initial_saves});
m_end = end; next_threads.clear();
m_flags = flags;
bool found_match = false; bool found_match = false;
for (m_pos = start; m_pos != m_end; ++m_pos)
if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
return false;
Saves* initial_saves = nullptr;
if (not (m_flags & RegexExecFlags::NoSaves))
{
m_saves.push_back(Saves::allocate(m_program.save_count));
initial_saves = m_saves.back();
}
const bool search = (flags & RegexExecFlags::Search);
const auto start_offset = search ? 0 : CompiledRegex::search_prefix_size;
Vector<Thread> current_threads{Thread{m_program.bytecode.data() + start_offset, initial_saves}};
Vector<Thread> next_threads;
for (m_pos = Utf8It{m_begin, m_begin, m_end}; m_pos != m_end; ++m_pos)
{ {
while (not current_threads.empty()) while (not current_threads.empty())
{ {
@ -282,15 +265,15 @@ struct ThreadedRegexVM
switch (step(thread, current_threads)) switch (step(thread, current_threads))
{ {
case StepResult::Matched: case StepResult::Matched:
if (not search or // We are not at end, this is not a full match if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
(flags & RegexExecFlags::NotInitialNull and m_pos == m_begin)) (m_flags & RegexExecFlags::NotInitialNull and m_pos == m_begin))
{ {
release_saves(thread.saves); release_saves(thread.saves);
continue; continue;
} }
m_captures = thread.saves; m_captures = thread.saves;
if (flags & RegexExecFlags::AnyMatch) if (m_flags & RegexExecFlags::AnyMatch)
return true; return true;
found_match = true; found_match = true;
@ -330,6 +313,37 @@ struct ThreadedRegexVM
return false; return false;
} }
// Run the program on [begin, end) with the given flags.
// The first attempt starts at begin; when RegexExecFlags::Search is set, further
// attempts start at each following position, the last one being at end itself.
// Returns true as soon as an exec_from call succeeds, false otherwise.
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
{
    m_begin = begin;
    m_end = end;
    m_flags = flags;

    if (flags & RegexExecFlags::NotInitialNull and m_begin == m_end)
        return false;

    Vector<Thread> current_threads, next_threads;

    const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
    const bool search = (flags & RegexExecFlags::Search);
    Vector<Iterator> empty_saves(m_program.save_count, Iterator{});

    // Each attempt gets its own copy of the blank saves, or none at all with NoSaves.
    auto make_initial_saves = [&]() -> Saves* {
        return no_saves ? nullptr : clone_saves(empty_saves.data());
    };

    for (Utf8It start{m_begin, m_begin, m_end}; ; ++start)
    {
        if (exec_from(start, make_initial_saves(), current_threads, next_threads))
            return true;

        // Without Search only the first position is tried; with it, stop once
        // the attempt starting at end has failed as well.
        const bool can_retry = search and start != end;
        if (not can_retry)
            return false;
    }
}
bool is_line_start() const bool is_line_start() const
{ {
return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or return (m_pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
@ -351,8 +365,6 @@ struct ThreadedRegexVM
const CompiledRegex& m_program; const CompiledRegex& m_program;
using Utf8It = utf8::iterator<Iterator>;
Iterator m_begin; Iterator m_begin;
Iterator m_end; Iterator m_end;
Utf8It m_pos; Utf8It m_pos;