Regex: abandon bytecode and just use a simple list of instructions
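Makes the code simpler: each instruction is now a fixed-size {op, param} pair, and lookaround literals are stored in a separate codepoint list, so neither the compiler nor the matcher has to encode or decode variable-length bytecode operands.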
This commit is contained in:
parent
6434bca325
commit
732b8bc2a4
|
@ -505,7 +505,7 @@ struct RegexCompiler
|
||||||
: m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward}
|
: m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward}
|
||||||
{
|
{
|
||||||
compile_node(m_parsed_regex.ast);
|
compile_node(m_parsed_regex.ast);
|
||||||
push_op(CompiledRegex::Match);
|
push_inst(CompiledRegex::Match);
|
||||||
m_program.matchers = m_parsed_regex.matchers;
|
m_program.matchers = m_parsed_regex.matchers;
|
||||||
m_program.save_count = m_parsed_regex.capture_count * 2;
|
m_program.save_count = m_parsed_regex.capture_count * 2;
|
||||||
m_program.direction = direction;
|
m_program.direction = direction;
|
||||||
|
@ -515,34 +515,30 @@ struct RegexCompiler
|
||||||
CompiledRegex get_compiled_regex() { return std::move(m_program); }
|
CompiledRegex get_compiled_regex() { return std::move(m_program); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
using Offset = CompiledRegex::Offset;
|
|
||||||
|
|
||||||
Offset compile_node_inner(const ParsedRegex::AstNodePtr& node)
|
uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node)
|
||||||
{
|
{
|
||||||
const auto start_pos = m_program.bytecode.size();
|
const auto start_pos = m_program.instructions.size();
|
||||||
|
|
||||||
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
|
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
|
||||||
if (capture != -1)
|
if (capture != -1)
|
||||||
{
|
push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 0 : 1));
|
||||||
push_op(CompiledRegex::Save);
|
|
||||||
push_byte(capture * 2 + (m_forward ? 0 : 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
Vector<Offset> goto_inner_end_offsets;
|
Vector<uint32_t> goto_inner_end_offsets;
|
||||||
switch (node->op)
|
switch (node->op)
|
||||||
{
|
{
|
||||||
case ParsedRegex::Literal:
|
case ParsedRegex::Literal:
|
||||||
push_op(node->ignore_case ? CompiledRegex::LiteralIgnoreCase
|
if (node->ignore_case)
|
||||||
: CompiledRegex::Literal);
|
push_inst(CompiledRegex::LiteralIgnoreCase, to_lower(node->value));
|
||||||
push_codepoint(node->ignore_case ? to_lower(node->value)
|
else
|
||||||
: node->value);
|
push_inst(CompiledRegex::Literal, node->value);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::AnyChar:
|
case ParsedRegex::AnyChar:
|
||||||
push_op(CompiledRegex::AnyChar);
|
push_inst(CompiledRegex::AnyChar);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::Matcher:
|
case ParsedRegex::Matcher:
|
||||||
push_op(CompiledRegex::Matcher);
|
push_inst(CompiledRegex::Matcher, node->value);
|
||||||
push_byte(node->value);
|
break;
|
||||||
case ParsedRegex::Sequence:
|
case ParsedRegex::Sequence:
|
||||||
{
|
{
|
||||||
if (m_forward)
|
if (m_forward)
|
||||||
|
@ -558,82 +554,77 @@ private:
|
||||||
auto& children = node->children;
|
auto& children = node->children;
|
||||||
kak_assert(children.size() == 2);
|
kak_assert(children.size() == 2);
|
||||||
|
|
||||||
push_op(CompiledRegex::Split_PrioritizeParent);
|
auto split_pos = push_inst(CompiledRegex::Split_PrioritizeParent);
|
||||||
auto offset = alloc_offset();
|
|
||||||
|
|
||||||
compile_node(children[m_forward ? 0 : 1]);
|
compile_node(children[m_forward ? 0 : 1]);
|
||||||
push_op(CompiledRegex::Jump);
|
auto left_pos = push_inst(CompiledRegex::Jump);
|
||||||
goto_inner_end_offsets.push_back(alloc_offset());
|
goto_inner_end_offsets.push_back(left_pos);
|
||||||
|
|
||||||
auto right_pos = compile_node(children[m_forward ? 1 : 0]);
|
auto right_pos = compile_node(children[m_forward ? 1 : 0]);
|
||||||
set_offset(offset, right_pos);
|
m_program.instructions[split_pos].param = right_pos;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ParsedRegex::LookAhead:
|
case ParsedRegex::LookAhead:
|
||||||
push_op(m_forward ? CompiledRegex::LookAhead
|
push_inst(m_forward ? CompiledRegex::LookAhead
|
||||||
: CompiledRegex::LookBehind);
|
: CompiledRegex::LookBehind,
|
||||||
push_string(node->children, false);
|
push_lookaround(node->children, false));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookAhead:
|
case ParsedRegex::NegativeLookAhead:
|
||||||
push_op(m_forward ? CompiledRegex::NegativeLookAhead
|
push_inst(m_forward ? CompiledRegex::NegativeLookAhead
|
||||||
: CompiledRegex::NegativeLookBehind);
|
: CompiledRegex::NegativeLookBehind,
|
||||||
push_string(node->children, false);
|
push_lookaround(node->children, false));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LookBehind:
|
case ParsedRegex::LookBehind:
|
||||||
push_op(m_forward ? CompiledRegex::LookBehind
|
push_inst(m_forward ? CompiledRegex::LookBehind
|
||||||
: CompiledRegex::LookAhead);
|
: CompiledRegex::LookAhead,
|
||||||
push_string(node->children, true);
|
push_lookaround(node->children, true));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookBehind:
|
case ParsedRegex::NegativeLookBehind:
|
||||||
push_op(m_forward ? CompiledRegex::NegativeLookBehind
|
push_inst(m_forward ? CompiledRegex::NegativeLookBehind
|
||||||
: CompiledRegex::NegativeLookAhead);
|
: CompiledRegex::NegativeLookAhead,
|
||||||
push_string(node->children, true);
|
push_lookaround(node->children, true));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineStart:
|
case ParsedRegex::LineStart:
|
||||||
push_op(m_forward ? CompiledRegex::LineStart
|
push_inst(m_forward ? CompiledRegex::LineStart
|
||||||
: CompiledRegex::LineEnd);
|
: CompiledRegex::LineEnd);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineEnd:
|
case ParsedRegex::LineEnd:
|
||||||
push_op(m_forward ? CompiledRegex::LineEnd
|
push_inst(m_forward ? CompiledRegex::LineEnd
|
||||||
: CompiledRegex::LineStart);
|
: CompiledRegex::LineStart);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::WordBoundary:
|
case ParsedRegex::WordBoundary:
|
||||||
push_op(CompiledRegex::WordBoundary);
|
push_inst(CompiledRegex::WordBoundary);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NotWordBoundary:
|
case ParsedRegex::NotWordBoundary:
|
||||||
push_op(CompiledRegex::NotWordBoundary);
|
push_inst(CompiledRegex::NotWordBoundary);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectBegin:
|
case ParsedRegex::SubjectBegin:
|
||||||
push_op(m_forward ? CompiledRegex::SubjectBegin
|
push_inst(m_forward ? CompiledRegex::SubjectBegin
|
||||||
: CompiledRegex::SubjectEnd);
|
: CompiledRegex::SubjectEnd);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::SubjectEnd:
|
case ParsedRegex::SubjectEnd:
|
||||||
push_op(m_forward ? CompiledRegex::SubjectEnd
|
push_inst(m_forward ? CompiledRegex::SubjectEnd
|
||||||
: CompiledRegex::SubjectBegin);
|
: CompiledRegex::SubjectBegin);
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::ResetStart:
|
case ParsedRegex::ResetStart:
|
||||||
push_op(CompiledRegex::Save);
|
push_inst(CompiledRegex::Save, 0);
|
||||||
push_byte(0);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto& offset : goto_inner_end_offsets)
|
for (auto& offset : goto_inner_end_offsets)
|
||||||
set_offset(offset, m_program.bytecode.size());
|
m_program.instructions[offset].param = m_program.instructions.size();
|
||||||
|
|
||||||
if (capture != -1)
|
if (capture != -1)
|
||||||
{
|
push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 1 : 0));
|
||||||
push_op(CompiledRegex::Save);
|
|
||||||
push_byte(capture * 2 + (m_forward ? 1 : 0));
|
|
||||||
}
|
|
||||||
|
|
||||||
return start_pos;
|
return start_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
Offset compile_node(const ParsedRegex::AstNodePtr& node)
|
uint32_t compile_node(const ParsedRegex::AstNodePtr& node)
|
||||||
{
|
{
|
||||||
Offset pos = m_program.bytecode.size();
|
uint32_t pos = m_program.instructions.size();
|
||||||
Vector<Offset> goto_end_offsets;
|
Vector<uint32_t> goto_ends;
|
||||||
|
|
||||||
auto& quantifier = node->quantifier;
|
auto& quantifier = node->quantifier;
|
||||||
|
|
||||||
|
@ -641,9 +632,9 @@ private:
|
||||||
|
|
||||||
if (quantifier.allows_none())
|
if (quantifier.allows_none())
|
||||||
{
|
{
|
||||||
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
||||||
: CompiledRegex::Split_PrioritizeChild);
|
: CompiledRegex::Split_PrioritizeChild);
|
||||||
goto_end_offsets.push_back(alloc_offset());
|
goto_ends.push_back(split_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto inner_pos = compile_node_inner(node);
|
auto inner_pos = compile_node_inner(node);
|
||||||
|
@ -652,66 +643,45 @@ private:
|
||||||
inner_pos = compile_node_inner(node);
|
inner_pos = compile_node_inner(node);
|
||||||
|
|
||||||
if (quantifier.allows_infinite_repeat())
|
if (quantifier.allows_infinite_repeat())
|
||||||
{
|
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
|
||||||
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
|
: CompiledRegex::Split_PrioritizeParent,
|
||||||
: CompiledRegex::Split_PrioritizeParent);
|
inner_pos);
|
||||||
set_offset(alloc_offset(), inner_pos);
|
|
||||||
}
|
|
||||||
// Write the node as an optional match for the min -> max counts
|
// Write the node as an optional match for the min -> max counts
|
||||||
else for (int i = std::max(1, quantifier.min); // STILL UGLY !
|
else for (int i = std::max(1, quantifier.min); // STILL UGLY !
|
||||||
i < quantifier.max; ++i)
|
i < quantifier.max; ++i)
|
||||||
{
|
{
|
||||||
push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
||||||
: CompiledRegex::Split_PrioritizeChild);
|
: CompiledRegex::Split_PrioritizeChild);
|
||||||
goto_end_offsets.push_back(alloc_offset());
|
goto_ends.push_back(split_pos);
|
||||||
compile_node_inner(node);
|
compile_node_inner(node);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto offset : goto_end_offsets)
|
for (auto offset : goto_ends)
|
||||||
set_offset(offset, m_program.bytecode.size());
|
m_program.instructions[offset].param = m_program.instructions.size();
|
||||||
|
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
Offset alloc_offset()
|
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
|
||||||
{
|
{
|
||||||
auto pos = m_program.bytecode.size();
|
uint32_t res = m_program.instructions.size();
|
||||||
m_program.bytecode.resize(pos + sizeof(Offset));
|
m_program.instructions.push_back({ op, param });
|
||||||
return pos;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_offset(Offset pos, Offset value)
|
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& literals, bool reversed = false)
|
||||||
{
|
{
|
||||||
memcpy(&m_program.bytecode[pos], &value, sizeof(Offset));
|
uint32_t res = m_program.lookarounds.size();
|
||||||
}
|
|
||||||
|
|
||||||
void push_op(CompiledRegex::Op op)
|
|
||||||
{
|
|
||||||
m_program.bytecode.push_back(op);
|
|
||||||
}
|
|
||||||
|
|
||||||
void push_byte(char byte)
|
|
||||||
{
|
|
||||||
m_program.bytecode.push_back(byte);
|
|
||||||
}
|
|
||||||
|
|
||||||
void push_codepoint(Codepoint cp)
|
|
||||||
{
|
|
||||||
utf8::dump(std::back_inserter(m_program.bytecode), cp);
|
|
||||||
}
|
|
||||||
|
|
||||||
void push_string(const Vector<ParsedRegex::AstNodePtr>& codepoints, bool reversed = false)
|
|
||||||
{
|
|
||||||
if (codepoints.size() > 127)
|
|
||||||
throw runtime_error{"Too long literal string"};
|
|
||||||
|
|
||||||
push_byte(codepoints.size());
|
|
||||||
if (reversed)
|
if (reversed)
|
||||||
for (auto& cp : codepoints | reverse())
|
for (auto& literal : literals | reverse())
|
||||||
push_codepoint(cp->value);
|
m_program.lookarounds.push_back(literal->value);
|
||||||
else
|
else
|
||||||
for (auto& cp : codepoints)
|
for (auto& literal : literals)
|
||||||
push_codepoint(cp->value);
|
m_program.lookarounds.push_back(literal->value);
|
||||||
|
|
||||||
|
m_program.lookarounds.push_back((Codepoint)-1);
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fills accepted and rejected according to which chars can start the given node,
|
// Fills accepted and rejected according to which chars can start the given node,
|
||||||
|
@ -804,40 +774,35 @@ private:
|
||||||
|
|
||||||
void dump_regex(const CompiledRegex& program)
|
void dump_regex(const CompiledRegex& program)
|
||||||
{
|
{
|
||||||
for (auto pos = program.bytecode.data(), end = program.bytecode.data() + program.bytecode.size();
|
for (auto& inst : program.instructions)
|
||||||
pos < end; )
|
|
||||||
{
|
{
|
||||||
printf("%4zd ", pos - program.bytecode.data());
|
switch (inst.op)
|
||||||
const auto op = (CompiledRegex::Op)*pos++;
|
|
||||||
switch (op)
|
|
||||||
{
|
{
|
||||||
case CompiledRegex::Literal:
|
case CompiledRegex::Literal:
|
||||||
printf("literal %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
|
printf("literal %lc\n", inst.param);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LiteralIgnoreCase:
|
case CompiledRegex::LiteralIgnoreCase:
|
||||||
printf("literal (ignore case) %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
|
printf("literal (ignore case) %lc\n", inst.param);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::AnyChar:
|
case CompiledRegex::AnyChar:
|
||||||
printf("any char\n");
|
printf("any char\n");
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Jump:
|
case CompiledRegex::Jump:
|
||||||
printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
printf("jump %u\n", inst.param);
|
||||||
pos += sizeof(CompiledRegex::Offset);
|
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Split_PrioritizeParent:
|
case CompiledRegex::Split_PrioritizeParent:
|
||||||
case CompiledRegex::Split_PrioritizeChild:
|
case CompiledRegex::Split_PrioritizeChild:
|
||||||
{
|
{
|
||||||
printf("split (prioritize %s) %u\n",
|
printf("split (prioritize %s) %u\n",
|
||||||
op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
|
inst.op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
|
||||||
*reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
|
inst.param);
|
||||||
pos += sizeof(CompiledRegex::Offset);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Save:
|
case CompiledRegex::Save:
|
||||||
printf("save %d\n", *pos++);
|
printf("save %d\n", inst.param);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Matcher:
|
case CompiledRegex::Matcher:
|
||||||
printf("matcher %d\n", *pos++);
|
printf("matcher %d\n", inst.param);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LineStart:
|
case CompiledRegex::LineStart:
|
||||||
printf("line start\n");
|
printf("line start\n");
|
||||||
|
@ -862,20 +827,20 @@ void dump_regex(const CompiledRegex& program)
|
||||||
case CompiledRegex::LookBehind:
|
case CompiledRegex::LookBehind:
|
||||||
case CompiledRegex::NegativeLookBehind:
|
case CompiledRegex::NegativeLookBehind:
|
||||||
{
|
{
|
||||||
int count = *pos++;
|
|
||||||
StringView str{pos, pos + count};
|
|
||||||
const char* name = nullptr;
|
const char* name = nullptr;
|
||||||
if (op == CompiledRegex::LookAhead)
|
if (inst.op == CompiledRegex::LookAhead)
|
||||||
name = "look ahead";
|
name = "look ahead";
|
||||||
if (op == CompiledRegex::NegativeLookAhead)
|
if (inst.op == CompiledRegex::NegativeLookAhead)
|
||||||
name = "negative look ahead";
|
name = "negative look ahead";
|
||||||
if (op == CompiledRegex::LookBehind)
|
if (inst.op == CompiledRegex::LookBehind)
|
||||||
name = "look behind";
|
name = "look behind";
|
||||||
if (op == CompiledRegex::NegativeLookBehind)
|
if (inst.op == CompiledRegex::NegativeLookBehind)
|
||||||
name = "negative look behind";
|
name = "negative look behind";
|
||||||
|
|
||||||
printf("%s (%s)\n", name, (const char*)str.zstr());
|
String str;
|
||||||
pos += count;
|
for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
|
||||||
|
utf8::dump(std::back_inserter(str), *it);
|
||||||
|
printf("%s (%s)\n", name, str.c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
|
|
|
@ -45,11 +45,17 @@ struct CompiledRegex : RefCountable
|
||||||
NegativeLookBehind,
|
NegativeLookBehind,
|
||||||
};
|
};
|
||||||
|
|
||||||
using Offset = unsigned;
|
struct Instruction
|
||||||
explicit operator bool() const { return not bytecode.empty(); }
|
{
|
||||||
|
Op op;
|
||||||
|
uint32_t param;
|
||||||
|
};
|
||||||
|
|
||||||
Vector<char> bytecode;
|
explicit operator bool() const { return not instructions.empty(); }
|
||||||
|
|
||||||
|
Vector<Instruction> instructions;
|
||||||
Vector<std::function<bool (Codepoint)>> matchers;
|
Vector<std::function<bool (Codepoint)>> matchers;
|
||||||
|
Vector<Codepoint> lookarounds;
|
||||||
MatchDirection direction;
|
MatchDirection direction;
|
||||||
size_t save_count;
|
size_t save_count;
|
||||||
|
|
||||||
|
@ -123,7 +129,7 @@ public:
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
Vector<Thread> current_threads, next_threads;
|
Vector<Thread> current_threads, next_threads;
|
||||||
std::unique_ptr<bool[]> inst_processed{new bool[m_program.bytecode.size()]};
|
std::unique_ptr<bool[]> processed_inst{new bool[m_program.instructions.size()]};
|
||||||
|
|
||||||
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
|
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
|
||||||
Utf8It start{m_begin};
|
Utf8It start{m_begin};
|
||||||
|
@ -134,7 +140,7 @@ public:
|
||||||
to_next_start(start, m_end, start_chars);
|
to_next_start(start, m_end, start_chars);
|
||||||
|
|
||||||
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
||||||
current_threads, next_threads, inst_processed.get()))
|
current_threads, next_threads, processed_inst.get()))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (not (flags & RegexExecFlags::Search))
|
if (not (flags & RegexExecFlags::Search))
|
||||||
|
@ -144,7 +150,7 @@ public:
|
||||||
{
|
{
|
||||||
to_next_start(++start, m_end, start_chars);
|
to_next_start(++start, m_end, start_chars);
|
||||||
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
||||||
current_threads, next_threads, inst_processed.get()))
|
current_threads, next_threads, processed_inst.get()))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
while (start != m_end);
|
while (start != m_end);
|
||||||
|
@ -200,7 +206,7 @@ private:
|
||||||
|
|
||||||
struct Thread
|
struct Thread
|
||||||
{
|
{
|
||||||
const char* inst;
|
uint32_t inst;
|
||||||
Saves* saves;
|
Saves* saves;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -209,58 +215,49 @@ private:
|
||||||
enum class StepResult { Consumed, Matched, Failed };
|
enum class StepResult { Consumed, Matched, Failed };
|
||||||
|
|
||||||
// Steps a thread until it consumes the current character, matches, or fails
|
// Steps a thread until it consumes the current character, matches, or fails
|
||||||
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads, bool* inst_processed)
|
StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads, bool* processed_inst)
|
||||||
{
|
{
|
||||||
const auto prog_start = m_program.bytecode.data();
|
|
||||||
const auto prog_end = prog_start + m_program.bytecode.size();
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
// If we have hit this instruction on this character, in this thread or another, do not try again
|
if (processed_inst[thread.inst])
|
||||||
const auto inst_offset = thread.inst - prog_start;
|
|
||||||
if (inst_processed[inst_offset])
|
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
inst_processed[inst_offset] = true;
|
processed_inst[thread.inst] = true;
|
||||||
|
|
||||||
|
auto& inst = m_program.instructions[thread.inst++];
|
||||||
|
|
||||||
const Codepoint cp = pos == m_end ? 0 : *pos;
|
const Codepoint cp = pos == m_end ? 0 : *pos;
|
||||||
const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
|
switch (inst.op)
|
||||||
switch (op)
|
|
||||||
{
|
{
|
||||||
case CompiledRegex::Literal:
|
case CompiledRegex::Literal:
|
||||||
if (utf8::read_codepoint(thread.inst, prog_end) == cp)
|
if (inst.param == cp)
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
case CompiledRegex::LiteralIgnoreCase:
|
case CompiledRegex::LiteralIgnoreCase:
|
||||||
if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
|
if (inst.param == to_lower(cp))
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
case CompiledRegex::AnyChar:
|
case CompiledRegex::AnyChar:
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
case CompiledRegex::Jump:
|
case CompiledRegex::Jump:
|
||||||
thread.inst = prog_start + get_offset(thread.inst);
|
thread.inst = inst.param;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Split_PrioritizeParent:
|
case CompiledRegex::Split_PrioritizeParent:
|
||||||
{
|
{
|
||||||
auto parent = thread.inst + sizeof(CompiledRegex::Offset);
|
|
||||||
auto child = prog_start + get_offset(thread.inst);
|
|
||||||
thread.inst = parent;
|
|
||||||
if (thread.saves)
|
if (thread.saves)
|
||||||
++thread.saves->refcount;
|
++thread.saves->refcount;
|
||||||
threads.push_back({child, thread.saves});
|
threads.push_back({inst.param, thread.saves});
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Split_PrioritizeChild:
|
case CompiledRegex::Split_PrioritizeChild:
|
||||||
{
|
{
|
||||||
auto parent = thread.inst + sizeof(CompiledRegex::Offset);
|
|
||||||
auto child = prog_start + get_offset(thread.inst);
|
|
||||||
thread.inst = child;
|
|
||||||
if (thread.saves)
|
if (thread.saves)
|
||||||
++thread.saves->refcount;
|
++thread.saves->refcount;
|
||||||
threads.push_back({parent, thread.saves});
|
threads.push_back({thread.inst, thread.saves});
|
||||||
|
thread.inst = inst.param;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Save:
|
case CompiledRegex::Save:
|
||||||
{
|
{
|
||||||
const size_t index = *thread.inst++;
|
|
||||||
if (thread.saves == nullptr)
|
if (thread.saves == nullptr)
|
||||||
break;
|
break;
|
||||||
if (thread.saves->refcount > 1)
|
if (thread.saves->refcount > 1)
|
||||||
|
@ -268,15 +265,12 @@ private:
|
||||||
--thread.saves->refcount;
|
--thread.saves->refcount;
|
||||||
thread.saves = new_saves<true>(thread.saves->pos);
|
thread.saves = new_saves<true>(thread.saves->pos);
|
||||||
}
|
}
|
||||||
thread.saves->pos[index] = get_base(pos);
|
thread.saves->pos[inst.param] = get_base(pos);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Matcher:
|
case CompiledRegex::Matcher:
|
||||||
{
|
return m_program.matchers[inst.param](cp) ?
|
||||||
const int matcher_id = *thread.inst++;
|
|
||||||
return m_program.matchers[matcher_id](cp) ?
|
|
||||||
StepResult::Consumed : StepResult::Failed;
|
StepResult::Consumed : StepResult::Failed;
|
||||||
}
|
|
||||||
case CompiledRegex::LineStart:
|
case CompiledRegex::LineStart:
|
||||||
if (not is_line_start(pos))
|
if (not is_line_start(pos))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
|
@ -304,27 +298,25 @@ private:
|
||||||
case CompiledRegex::LookAhead:
|
case CompiledRegex::LookAhead:
|
||||||
case CompiledRegex::NegativeLookAhead:
|
case CompiledRegex::NegativeLookAhead:
|
||||||
{
|
{
|
||||||
int count = *thread.inst++;
|
auto ref = m_program.lookarounds.begin() + inst.param;
|
||||||
for (auto it = pos; count and it != m_end; ++it, --count)
|
for (auto it = pos; *ref != -1 and it != m_end; ++it, ++ref)
|
||||||
if (*it != utf8::read(thread.inst))
|
if (*it != *ref)
|
||||||
break;
|
break;
|
||||||
if ((op == CompiledRegex::LookAhead and count != 0) or
|
if ((inst.op == CompiledRegex::LookAhead and *ref != -1) or
|
||||||
(op == CompiledRegex::NegativeLookAhead and count == 0))
|
(inst.op == CompiledRegex::NegativeLookAhead and *ref == -1))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::LookBehind:
|
case CompiledRegex::LookBehind:
|
||||||
case CompiledRegex::NegativeLookBehind:
|
case CompiledRegex::NegativeLookBehind:
|
||||||
{
|
{
|
||||||
int count = *thread.inst++;
|
auto ref = m_program.lookarounds.begin() + inst.param;
|
||||||
for (auto it = pos-1; count and it >= m_begin; --it, --count)
|
for (auto it = pos-1; *ref != -1 and it >= m_begin; --it, ++ref)
|
||||||
if (*it != utf8::read(thread.inst))
|
if (*it != *ref)
|
||||||
break;
|
break;
|
||||||
if ((op == CompiledRegex::LookBehind and count != 0) or
|
if ((inst.op == CompiledRegex::LookBehind and *ref != -1) or
|
||||||
(op == CompiledRegex::NegativeLookBehind and count == 0))
|
(inst.op == CompiledRegex::NegativeLookBehind and *ref == -1))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
|
@ -334,20 +326,20 @@ private:
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_from(const Utf8It& start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads, bool* inst_processed)
|
bool exec_from(const Utf8It& start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads, bool* processed_inst)
|
||||||
{
|
{
|
||||||
current_threads.push_back({m_program.bytecode.data(), initial_saves});
|
current_threads.push_back({0, initial_saves});
|
||||||
next_threads.clear();
|
next_threads.clear();
|
||||||
|
|
||||||
bool found_match = false;
|
bool found_match = false;
|
||||||
for (Utf8It pos = start; pos != m_end; ++pos)
|
for (Utf8It pos = start; pos != m_end; ++pos)
|
||||||
{
|
{
|
||||||
memset(inst_processed, 0, m_program.bytecode.size() * sizeof(bool));
|
memset(processed_inst, 0, sizeof(bool) * m_program.instructions.size());
|
||||||
while (not current_threads.empty())
|
while (not current_threads.empty())
|
||||||
{
|
{
|
||||||
auto thread = current_threads.back();
|
auto thread = current_threads.back();
|
||||||
current_threads.pop_back();
|
current_threads.pop_back();
|
||||||
switch (step(pos, thread, current_threads, inst_processed))
|
switch (step(pos, thread, current_threads, processed_inst))
|
||||||
{
|
{
|
||||||
case StepResult::Matched:
|
case StepResult::Matched:
|
||||||
if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
|
if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
|
||||||
|
@ -385,13 +377,13 @@ private:
|
||||||
if (found_match)
|
if (found_match)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
memset(inst_processed, 0, m_program.bytecode.size() * sizeof(bool));
|
memset(processed_inst, 0, sizeof(bool) * m_program.instructions.size());
|
||||||
// Step remaining threads to see if they match without consuming anything else
|
// Step remaining threads to see if they match without consuming anything else
|
||||||
while (not current_threads.empty())
|
while (not current_threads.empty())
|
||||||
{
|
{
|
||||||
auto thread = current_threads.back();
|
auto thread = current_threads.back();
|
||||||
current_threads.pop_back();
|
current_threads.pop_back();
|
||||||
if (step(m_end, thread, current_threads, inst_processed) == StepResult::Matched)
|
if (step(m_end, thread, current_threads, processed_inst) == StepResult::Matched)
|
||||||
{
|
{
|
||||||
release_saves(m_captures);
|
release_saves(m_captures);
|
||||||
m_captures = thread.saves;
|
m_captures = thread.saves;
|
||||||
|
@ -411,13 +403,6 @@ private:
|
||||||
++start;
|
++start;
|
||||||
}
|
}
|
||||||
|
|
||||||
static CompiledRegex::Offset get_offset(const char* ptr)
|
|
||||||
{
|
|
||||||
CompiledRegex::Offset res;
|
|
||||||
memcpy(&res, ptr, sizeof(CompiledRegex::Offset));
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool is_line_start(const Utf8It& pos) const
|
bool is_line_start(const Utf8It& pos) const
|
||||||
{
|
{
|
||||||
return (pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
|
return (pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or
|
||||||
|
|
Loading…
Reference in New Issue
Block a user