Regex: abandon bytecode and just use a simple list of instructions

Makes the code simpler.
This commit is contained in:
Maxime Coste 2017-10-07 18:51:32 +08:00
parent 6434bca325
commit 732b8bc2a4
2 changed files with 129 additions and 179 deletions

View File

@ -505,7 +505,7 @@ struct RegexCompiler
: m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward}
{
compile_node(m_parsed_regex.ast);
push_op(CompiledRegex::Match);
push_inst(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
m_program.save_count = m_parsed_regex.capture_count * 2;
m_program.direction = direction;
@ -515,34 +515,30 @@ struct RegexCompiler
CompiledRegex get_compiled_regex() { return std::move(m_program); }
private:
using Offset = CompiledRegex::Offset;
Offset compile_node_inner(const ParsedRegex::AstNodePtr& node)
uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node)
{
const auto start_pos = m_program.bytecode.size();
const auto start_pos = m_program.instructions.size();
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
if (capture != -1)
{
push_op(CompiledRegex::Save);
push_byte(capture * 2 + (m_forward ? 0 : 1));
}
push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 0 : 1));
Vector<Offset> goto_inner_end_offsets;
Vector<uint32_t> goto_inner_end_offsets;
switch (node->op)
{
case ParsedRegex::Literal:
push_op(node->ignore_case ? CompiledRegex::LiteralIgnoreCase
: CompiledRegex::Literal);
push_codepoint(node->ignore_case ? to_lower(node->value)
: node->value);
if (node->ignore_case)
push_inst(CompiledRegex::LiteralIgnoreCase, to_lower(node->value));
else
push_inst(CompiledRegex::Literal, node->value);
break;
case ParsedRegex::AnyChar:
push_op(CompiledRegex::AnyChar);
push_inst(CompiledRegex::AnyChar);
break;
case ParsedRegex::Matcher:
push_op(CompiledRegex::Matcher);
push_byte(node->value);
push_inst(CompiledRegex::Matcher, node->value);
break;
case ParsedRegex::Sequence:
{
if (m_forward)
@ -558,82 +554,77 @@ private:
auto& children = node->children;
kak_assert(children.size() == 2);
push_op(CompiledRegex::Split_PrioritizeParent);
auto offset = alloc_offset();
auto split_pos = push_inst(CompiledRegex::Split_PrioritizeParent);
compile_node(children[m_forward ? 0 : 1]);
push_op(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(alloc_offset());
auto left_pos = push_inst(CompiledRegex::Jump);
goto_inner_end_offsets.push_back(left_pos);
auto right_pos = compile_node(children[m_forward ? 1 : 0]);
set_offset(offset, right_pos);
m_program.instructions[split_pos].param = right_pos;
break;
}
case ParsedRegex::LookAhead:
push_op(m_forward ? CompiledRegex::LookAhead
: CompiledRegex::LookBehind);
push_string(node->children, false);
push_inst(m_forward ? CompiledRegex::LookAhead
: CompiledRegex::LookBehind,
push_lookaround(node->children, false));
break;
case ParsedRegex::NegativeLookAhead:
push_op(m_forward ? CompiledRegex::NegativeLookAhead
: CompiledRegex::NegativeLookBehind);
push_string(node->children, false);
push_inst(m_forward ? CompiledRegex::NegativeLookAhead
: CompiledRegex::NegativeLookBehind,
push_lookaround(node->children, false));
break;
case ParsedRegex::LookBehind:
push_op(m_forward ? CompiledRegex::LookBehind
: CompiledRegex::LookAhead);
push_string(node->children, true);
push_inst(m_forward ? CompiledRegex::LookBehind
: CompiledRegex::LookAhead,
push_lookaround(node->children, true));
break;
case ParsedRegex::NegativeLookBehind:
push_op(m_forward ? CompiledRegex::NegativeLookBehind
: CompiledRegex::NegativeLookAhead);
push_string(node->children, true);
push_inst(m_forward ? CompiledRegex::NegativeLookBehind
: CompiledRegex::NegativeLookAhead,
push_lookaround(node->children, true));
break;
case ParsedRegex::LineStart:
push_op(m_forward ? CompiledRegex::LineStart
: CompiledRegex::LineEnd);
push_inst(m_forward ? CompiledRegex::LineStart
: CompiledRegex::LineEnd);
break;
case ParsedRegex::LineEnd:
push_op(m_forward ? CompiledRegex::LineEnd
: CompiledRegex::LineStart);
push_inst(m_forward ? CompiledRegex::LineEnd
: CompiledRegex::LineStart);
break;
case ParsedRegex::WordBoundary:
push_op(CompiledRegex::WordBoundary);
push_inst(CompiledRegex::WordBoundary);
break;
case ParsedRegex::NotWordBoundary:
push_op(CompiledRegex::NotWordBoundary);
push_inst(CompiledRegex::NotWordBoundary);
break;
case ParsedRegex::SubjectBegin:
push_op(m_forward ? CompiledRegex::SubjectBegin
: CompiledRegex::SubjectEnd);
push_inst(m_forward ? CompiledRegex::SubjectBegin
: CompiledRegex::SubjectEnd);
break;
case ParsedRegex::SubjectEnd:
push_op(m_forward ? CompiledRegex::SubjectEnd
: CompiledRegex::SubjectBegin);
push_inst(m_forward ? CompiledRegex::SubjectEnd
: CompiledRegex::SubjectBegin);
break;
case ParsedRegex::ResetStart:
push_op(CompiledRegex::Save);
push_byte(0);
push_inst(CompiledRegex::Save, 0);
break;
}
for (auto& offset : goto_inner_end_offsets)
set_offset(offset, m_program.bytecode.size());
m_program.instructions[offset].param = m_program.instructions.size();
if (capture != -1)
{
push_op(CompiledRegex::Save);
push_byte(capture * 2 + (m_forward ? 1 : 0));
}
push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 1 : 0));
return start_pos;
}
Offset compile_node(const ParsedRegex::AstNodePtr& node)
uint32_t compile_node(const ParsedRegex::AstNodePtr& node)
{
Offset pos = m_program.bytecode.size();
Vector<Offset> goto_end_offsets;
uint32_t pos = m_program.instructions.size();
Vector<uint32_t> goto_ends;
auto& quantifier = node->quantifier;
@ -641,9 +632,9 @@ private:
if (quantifier.allows_none())
{
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
: CompiledRegex::Split_PrioritizeChild);
goto_end_offsets.push_back(alloc_offset());
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
: CompiledRegex::Split_PrioritizeChild);
goto_ends.push_back(split_pos);
}
auto inner_pos = compile_node_inner(node);
@ -652,66 +643,45 @@ private:
inner_pos = compile_node_inner(node);
if (quantifier.allows_infinite_repeat())
{
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
: CompiledRegex::Split_PrioritizeParent);
set_offset(alloc_offset(), inner_pos);
}
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
: CompiledRegex::Split_PrioritizeParent,
inner_pos);
// Write the node as an optional match for the min -> max counts
else for (int i = std::max(1, quantifier.min); // STILL UGLY !
i < quantifier.max; ++i)
{
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
: CompiledRegex::Split_PrioritizeChild);
goto_end_offsets.push_back(alloc_offset());
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
: CompiledRegex::Split_PrioritizeChild);
goto_ends.push_back(split_pos);
compile_node_inner(node);
}
for (auto offset : goto_end_offsets)
set_offset(offset, m_program.bytecode.size());
for (auto offset : goto_ends)
m_program.instructions[offset].param = m_program.instructions.size();
return pos;
}
Offset alloc_offset()
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
{
auto pos = m_program.bytecode.size();
m_program.bytecode.resize(pos + sizeof(Offset));
return pos;
uint32_t res = m_program.instructions.size();
m_program.instructions.push_back({ op, param });
return res;
}
void set_offset(Offset pos, Offset value)
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& literals, bool reversed = false)
{
memcpy(&m_program.bytecode[pos], &value, sizeof(Offset));
}
void push_op(CompiledRegex::Op op)
{
m_program.bytecode.push_back(op);
}
void push_byte(char byte)
{
m_program.bytecode.push_back(byte);
}
void push_codepoint(Codepoint cp)
{
utf8::dump(std::back_inserter(m_program.bytecode), cp);
}
void push_string(const Vector<ParsedRegex::AstNodePtr>& codepoints, bool reversed = false)
{
if (codepoints.size() > 127)
throw runtime_error{"Too long literal string"};
push_byte(codepoints.size());
uint32_t res = m_program.lookarounds.size();
if (reversed)
for (auto& cp : codepoints | reverse())
push_codepoint(cp->value);
for (auto& literal : literals | reverse())
m_program.lookarounds.push_back(literal->value);
else
for (auto& cp : codepoints)
push_codepoint(cp->value);
for (auto& literal : literals)
m_program.lookarounds.push_back(literal->value);
m_program.lookarounds.push_back((Codepoint)-1);
return res;
}
// Fills accepted and rejected according to which chars can start the given node,
@ -804,40 +774,35 @@ private:
void dump_regex(const CompiledRegex& program)
{
for (auto pos = program.bytecode.data(), end = program.bytecode.data() + program.bytecode.size();
pos < end; )
for (auto& inst : program.instructions)
{
printf("%4zd ", pos - program.bytecode.data());
const auto op = (CompiledRegex::Op)*pos++;
switch (op)
switch (inst.op)
{
case CompiledRegex::Literal:
printf("literal %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
printf("literal %lc\n", inst.param);
break;
case CompiledRegex::LiteralIgnoreCase:
printf("literal (ignore case) %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
printf("literal (ignore case) %lc\n", inst.param);
break;
case CompiledRegex::AnyChar:
printf("any char\n");
break;
case CompiledRegex::Jump:
printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
pos += sizeof(CompiledRegex::Offset);
printf("jump %u\n", inst.param);
break;
case CompiledRegex::Split_PrioritizeParent:
case CompiledRegex::Split_PrioritizeChild:
{
printf("split (prioritize %s) %u\n",
op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
*reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
pos += sizeof(CompiledRegex::Offset);
inst.op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
inst.param);
break;
}
case CompiledRegex::Save:
printf("save %d\n", *pos++);
printf("save %d\n", inst.param);
break;
case CompiledRegex::Matcher:
printf("matcher %d\n", *pos++);
printf("matcher %d\n", inst.param);
break;
case CompiledRegex::LineStart:
printf("line start\n");
@ -862,20 +827,20 @@ void dump_regex(const CompiledRegex& program)
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
{
int count = *pos++;
StringView str{pos, pos + count};
const char* name = nullptr;
if (op == CompiledRegex::LookAhead)
if (inst.op == CompiledRegex::LookAhead)
name = "look ahead";
if (op == CompiledRegex::NegativeLookAhead)
if (inst.op == CompiledRegex::NegativeLookAhead)
name = "negative look ahead";
if (op == CompiledRegex::LookBehind)
if (inst.op == CompiledRegex::LookBehind)
name = "look behind";
if (op == CompiledRegex::NegativeLookBehind)
if (inst.op == CompiledRegex::NegativeLookBehind)
name = "negative look behind";
printf("%s (%s)\n", name, (const char*)str.zstr());
pos += count;
String str;
for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
utf8::dump(std::back_inserter(str), *it);
printf("%s (%s)\n", name, str.c_str());
break;
}
case CompiledRegex::Match:

View File

@ -45,11 +45,17 @@ struct CompiledRegex : RefCountable
NegativeLookBehind,
};
using Offset = unsigned;
explicit operator bool() const { return not bytecode.empty(); }
struct Instruction
{
Op op;
uint32_t param;
};
Vector<char> bytecode;
explicit operator bool() const { return not instructions.empty(); }
Vector<Instruction> instructions;
Vector<std::function<bool (Codepoint)>> matchers;
Vector<Codepoint> lookarounds;
MatchDirection direction;
size_t save_count;
@ -123,7 +129,7 @@ public:
return false;
Vector<Thread> current_threads, next_threads;
std::unique_ptr<bool[]> inst_processed{new bool[m_program.bytecode.size()]};
std::unique_ptr<bool[]> processed_inst{new bool[m_program.instructions.size()]};
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
Utf8It start{m_begin};
@ -134,7 +140,7 @@ public:
to_next_start(start, m_end, start_chars);
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads, inst_processed.get()))
current_threads, next_threads, processed_inst.get()))
return true;
if (not (flags & RegexExecFlags::Search))
@ -144,7 +150,7 @@ public:
{
to_next_start(++start, m_end, start_chars);
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads, inst_processed.get()))
current_threads, next_threads, processed_inst.get()))
return true;
}
while (start != m_end);
@ -200,7 +206,7 @@ private:
struct Thread
{
const char* inst;
uint32_t inst;
Saves* saves;
};
@ -209,58 +215,49 @@ private:
enum class StepResult { Consumed, Matched, Failed };
// Steps a thread until it consumes the current character, matches or fail
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads, bool* inst_processed)
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads, bool* processed_inst)
{
const auto prog_start = m_program.bytecode.data();
const auto prog_end = prog_start + m_program.bytecode.size();
while (true)
{
// If we have hit this instruction on this character, in this thread or another, do not try again
const auto inst_offset = thread.inst - prog_start;
if (inst_processed[inst_offset])
if (processed_inst[thread.inst])
return StepResult::Failed;
inst_processed[inst_offset] = true;
processed_inst[thread.inst] = true;
auto& inst = m_program.instructions[thread.inst++];
const Codepoint cp = pos == m_end ? 0 : *pos;
const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
switch (op)
switch (inst.op)
{
case CompiledRegex::Literal:
if (utf8::read_codepoint(thread.inst, prog_end) == cp)
if (inst.param == cp)
return StepResult::Consumed;
return StepResult::Failed;
case CompiledRegex::LiteralIgnoreCase:
if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
if (inst.param == to_lower(cp))
return StepResult::Consumed;
return StepResult::Failed;
case CompiledRegex::AnyChar:
return StepResult::Consumed;
case CompiledRegex::Jump:
thread.inst = prog_start + get_offset(thread.inst);
thread.inst = inst.param;
break;
case CompiledRegex::Split_PrioritizeParent:
{
auto parent = thread.inst + sizeof(CompiledRegex::Offset);
auto child = prog_start + get_offset(thread.inst);
thread.inst = parent;
if (thread.saves)
++thread.saves->refcount;
threads.push_back({child, thread.saves});
threads.push_back({inst.param, thread.saves});
break;
}
case CompiledRegex::Split_PrioritizeChild:
{
auto parent = thread.inst + sizeof(CompiledRegex::Offset);
auto child = prog_start + get_offset(thread.inst);
thread.inst = child;
if (thread.saves)
++thread.saves->refcount;
threads.push_back({parent, thread.saves});
threads.push_back({thread.inst, thread.saves});
thread.inst = inst.param;
break;
}
case CompiledRegex::Save:
{
const size_t index = *thread.inst++;
if (thread.saves == nullptr)
break;
if (thread.saves->refcount > 1)
@ -268,15 +265,12 @@ private:
--thread.saves->refcount;
thread.saves = new_saves<true>(thread.saves->pos);
}
thread.saves->pos[index] = get_base(pos);
thread.saves->pos[inst.param] = get_base(pos);
break;
}
case CompiledRegex::Matcher:
{
const int matcher_id = *thread.inst++;
return m_program.matchers[matcher_id](cp) ?
return m_program.matchers[inst.param](cp) ?
StepResult::Consumed : StepResult::Failed;
}
case CompiledRegex::LineStart:
if (not is_line_start(pos))
return StepResult::Failed;
@ -304,27 +298,25 @@ private:
case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead:
{
int count = *thread.inst++;
for (auto it = pos; count and it != m_end; ++it, --count)
if (*it != utf8::read(thread.inst))
auto ref = m_program.lookarounds.begin() + inst.param;
for (auto it = pos; *ref != -1 and it != m_end; ++it, ++ref)
if (*it != *ref)
break;
if ((op == CompiledRegex::LookAhead and count != 0) or
(op == CompiledRegex::NegativeLookAhead and count == 0))
if ((inst.op == CompiledRegex::LookAhead and *ref != -1) or
(inst.op == CompiledRegex::NegativeLookAhead and *ref == -1))
return StepResult::Failed;
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
break;
}
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
{
int count = *thread.inst++;
for (auto it = pos-1; count and it >= m_begin; --it, --count)
if (*it != utf8::read(thread.inst))
auto ref = m_program.lookarounds.begin() + inst.param;
for (auto it = pos-1; *ref != -1 and it >= m_begin; --it, ++ref)
if (*it != *ref)
break;
if ((op == CompiledRegex::LookBehind and count != 0) or
(op == CompiledRegex::NegativeLookBehind and count == 0))
if ((inst.op == CompiledRegex::LookBehind and *ref != -1) or
(inst.op == CompiledRegex::NegativeLookBehind and *ref == -1))
return StepResult::Failed;
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
break;
}
case CompiledRegex::Match:
@ -334,20 +326,20 @@ private:
return StepResult::Failed;
}
bool exec_from(const Utf8It& start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads, bool* inst_processed)
bool exec_from(const Utf8It& start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads, bool* processed_inst)
{
current_threads.push_back({m_program.bytecode.data(), initial_saves});
current_threads.push_back({0, initial_saves});
next_threads.clear();
bool found_match = false;
for (Utf8It pos = start; pos != m_end; ++pos)
{
memset(inst_processed, 0, m_program.bytecode.size() * sizeof(bool));
memset(processed_inst, 0, sizeof(bool) * m_program.instructions.size());
while (not current_threads.empty())
{
auto thread = current_threads.back();
current_threads.pop_back();
switch (step(pos, thread, current_threads, inst_processed))
switch (step(pos, thread, current_threads, processed_inst))
{
case StepResult::Matched:
if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
@ -385,13 +377,13 @@ private:
if (found_match)
return true;
memset(inst_processed, 0, m_program.bytecode.size() * sizeof(bool));
memset(processed_inst, 0, sizeof(bool) * m_program.instructions.size());
// Step remaining threads to see if they match without consuming anything else
while (not current_threads.empty())
{
auto thread = current_threads.back();
current_threads.pop_back();
if (step(m_end, thread, current_threads, inst_processed) == StepResult::Matched)
if (step(m_end, thread, current_threads, processed_inst) == StepResult::Matched)
{
release_saves(m_captures);
m_captures = thread.saves;
@ -411,13 +403,6 @@ private:
++start;
}
static CompiledRegex::Offset get_offset(const char* ptr)
{
CompiledRegex::Offset res;
memcpy(&res, ptr, sizeof(CompiledRegex::Offset));
return res;
}
bool is_line_start(const Utf8It& pos) const
{
return (pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or