Regex: WIP support for saving captures
This commit is contained in:
parent
ad546e516a
commit
023511deff
|
@ -18,6 +18,7 @@ enum Op : char
|
||||||
AnyChar,
|
AnyChar,
|
||||||
Jump,
|
Jump,
|
||||||
Split,
|
Split,
|
||||||
|
Save,
|
||||||
LineStart,
|
LineStart,
|
||||||
LineEnd,
|
LineEnd,
|
||||||
WordBoundary,
|
WordBoundary,
|
||||||
|
@ -83,7 +84,7 @@ struct AstNode
|
||||||
|
|
||||||
using AstNodePtr = std::unique_ptr<AstNode>;
|
using AstNodePtr = std::unique_ptr<AstNode>;
|
||||||
|
|
||||||
AstNodePtr make_ast_node(Op op, char value = 0,
|
AstNodePtr make_ast_node(Op op, char value = -1,
|
||||||
Quantifier quantifier = {Quantifier::One})
|
Quantifier quantifier = {Quantifier::One})
|
||||||
{
|
{
|
||||||
return AstNodePtr{new AstNode{op, value, quantifier, {}}};
|
return AstNodePtr{new AstNode{op, value, quantifier, {}}};
|
||||||
|
@ -94,25 +95,29 @@ AstNodePtr make_ast_node(Op op, char value = 0,
|
||||||
template<typename Iterator>
|
template<typename Iterator>
|
||||||
struct Parser
|
struct Parser
|
||||||
{
|
{
|
||||||
static AstNodePtr parse(Iterator pos, Iterator end)
|
AstNodePtr parse(Iterator pos, Iterator end)
|
||||||
{
|
{
|
||||||
return disjunction(pos, end);
|
return disjunction(pos, end, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static AstNodePtr disjunction(Iterator& pos, Iterator end)
|
AstNodePtr disjunction(Iterator& pos, Iterator end, char capture = -1)
|
||||||
{
|
{
|
||||||
AstNodePtr node = alternative(pos, end);
|
AstNodePtr node = alternative(pos, end);
|
||||||
if (pos == end or *pos != '|')
|
if (pos == end or *pos != '|')
|
||||||
|
{
|
||||||
|
node->value = capture;
|
||||||
return node;
|
return node;
|
||||||
|
}
|
||||||
|
|
||||||
AstNodePtr res = make_ast_node(Op::Alternation);
|
AstNodePtr res = make_ast_node(Op::Alternation);
|
||||||
res->children.push_back(std::move(node));
|
res->children.push_back(std::move(node));
|
||||||
res->children.push_back(disjunction(++pos, end));
|
res->children.push_back(disjunction(++pos, end));
|
||||||
|
res->value = capture;
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr alternative(Iterator& pos, Iterator end)
|
AstNodePtr alternative(Iterator& pos, Iterator end)
|
||||||
{
|
{
|
||||||
AstNodePtr res = make_ast_node(Op::Sequence);
|
AstNodePtr res = make_ast_node(Op::Sequence);
|
||||||
while (auto node = term(pos, end))
|
while (auto node = term(pos, end))
|
||||||
|
@ -120,7 +125,7 @@ private:
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr term(Iterator& pos, Iterator end)
|
AstNodePtr term(Iterator& pos, Iterator end)
|
||||||
{
|
{
|
||||||
if (auto node = assertion(pos, end))
|
if (auto node = assertion(pos, end))
|
||||||
return node;
|
return node;
|
||||||
|
@ -132,7 +137,7 @@ private:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr assertion(Iterator& pos, Iterator end)
|
AstNodePtr assertion(Iterator& pos, Iterator end)
|
||||||
{
|
{
|
||||||
switch (*pos)
|
switch (*pos)
|
||||||
{
|
{
|
||||||
|
@ -154,7 +159,7 @@ private:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AstNodePtr atom(Iterator& pos, Iterator end)
|
AstNodePtr atom(Iterator& pos, Iterator end)
|
||||||
{
|
{
|
||||||
const auto c = *pos;
|
const auto c = *pos;
|
||||||
switch (c)
|
switch (c)
|
||||||
|
@ -163,7 +168,8 @@ private:
|
||||||
case '(':
|
case '(':
|
||||||
{
|
{
|
||||||
++pos;
|
++pos;
|
||||||
auto content = disjunction(pos, end);
|
auto content = disjunction(pos, end, m_next_capture++);
|
||||||
|
|
||||||
if (pos == end or *pos != ')')
|
if (pos == end or *pos != ')')
|
||||||
throw runtime_error{"Unclosed parenthesis"};
|
throw runtime_error{"Unclosed parenthesis"};
|
||||||
++pos;
|
++pos;
|
||||||
|
@ -177,7 +183,7 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static Quantifier quantifier(Iterator& pos, Iterator end)
|
Quantifier quantifier(Iterator& pos, Iterator end)
|
||||||
{
|
{
|
||||||
auto read_int = [](Iterator& pos, Iterator begin, Iterator end) {
|
auto read_int = [](Iterator& pos, Iterator begin, Iterator end) {
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
@ -214,6 +220,8 @@ private:
|
||||||
default: return {Quantifier::One};
|
default: return {Quantifier::One};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char m_next_capture = 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
RegexProgram::Offset compile_node(Vector<char>& program, const AstNodePtr& node);
|
RegexProgram::Offset compile_node(Vector<char>& program, const AstNodePtr& node);
|
||||||
|
@ -234,6 +242,13 @@ RegexProgram::Offset compile_node_inner(Vector<char>& program, const AstNodePtr&
|
||||||
{
|
{
|
||||||
const auto start_pos = program.size();
|
const auto start_pos = program.size();
|
||||||
|
|
||||||
|
const char capture = (node->op == Op::Alternation or node->op == Op::Sequence) ? node->value : -1;
|
||||||
|
if (capture >= 0)
|
||||||
|
{
|
||||||
|
program.push_back(RegexProgram::Save);
|
||||||
|
program.push_back(capture * 2);
|
||||||
|
}
|
||||||
|
|
||||||
Vector<RegexProgram::Offset> goto_inner_end_offsets;
|
Vector<RegexProgram::Offset> goto_inner_end_offsets;
|
||||||
switch (node->op)
|
switch (node->op)
|
||||||
{
|
{
|
||||||
|
@ -288,6 +303,12 @@ RegexProgram::Offset compile_node_inner(Vector<char>& program, const AstNodePtr&
|
||||||
for (auto& offset : goto_inner_end_offsets)
|
for (auto& offset : goto_inner_end_offsets)
|
||||||
get_offset(program, offset) = program.size();
|
get_offset(program, offset) = program.size();
|
||||||
|
|
||||||
|
if (capture >= 0)
|
||||||
|
{
|
||||||
|
program.push_back(RegexProgram::Save);
|
||||||
|
program.push_back(capture * 2 + 1);
|
||||||
|
}
|
||||||
|
|
||||||
return start_pos;
|
return start_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -338,7 +359,7 @@ Vector<char> compile(const AstNodePtr& node)
|
||||||
template<typename Iterator>
|
template<typename Iterator>
|
||||||
Vector<char> compile(Iterator begin, Iterator end)
|
Vector<char> compile(Iterator begin, Iterator end)
|
||||||
{
|
{
|
||||||
return compile(Parser<Iterator>::parse(begin, end));
|
return compile(Parser<Iterator>{}.parse(begin, end));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -367,6 +388,9 @@ void dump(ConstArrayView<char> program)
|
||||||
pos += sizeof(RegexProgram::Offset);
|
pos += sizeof(RegexProgram::Offset);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case RegexProgram::Save:
|
||||||
|
printf("save %d\n", program[pos++]);
|
||||||
|
break;
|
||||||
case RegexProgram::LineStart:
|
case RegexProgram::LineStart:
|
||||||
printf("line start\n");
|
printf("line start\n");
|
||||||
break;
|
break;
|
||||||
|
@ -395,72 +419,87 @@ struct ThreadedExecutor
|
||||||
{
|
{
|
||||||
ThreadedExecutor(ConstArrayView<char> program) : m_program{program} {}
|
ThreadedExecutor(ConstArrayView<char> program) : m_program{program} {}
|
||||||
|
|
||||||
struct StepResult
|
struct Thread
|
||||||
{
|
{
|
||||||
enum Result { Consumed, Matched, Failed } result;
|
const char* inst;
|
||||||
const char* next = nullptr;
|
Vector<const char*> saves = {};
|
||||||
};
|
};
|
||||||
|
|
||||||
StepResult step(const char* inst)
|
enum class StepResult { Consumed, Matched, Failed };
|
||||||
|
StepResult step(size_t thread_index)
|
||||||
{
|
{
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
|
auto& thread = m_threads[thread_index];
|
||||||
char c = m_pos == m_subject.end() ? 0 : *m_pos;
|
char c = m_pos == m_subject.end() ? 0 : *m_pos;
|
||||||
const RegexProgram::Op op = (RegexProgram::Op)*inst++;
|
const RegexProgram::Op op = (RegexProgram::Op)*thread.inst++;
|
||||||
switch (op)
|
switch (op)
|
||||||
{
|
{
|
||||||
case RegexProgram::Literal:
|
case RegexProgram::Literal:
|
||||||
if (*inst++ == c)
|
if (*thread.inst++ == c)
|
||||||
return { StepResult::Consumed, inst };
|
return StepResult::Consumed;
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
case RegexProgram::AnyChar:
|
case RegexProgram::AnyChar:
|
||||||
return { StepResult::Consumed, inst };
|
return StepResult::Consumed;
|
||||||
case RegexProgram::Jump:
|
case RegexProgram::Jump:
|
||||||
inst = m_program.begin() + *reinterpret_cast<const RegexProgram::Offset*>(inst);
|
{
|
||||||
|
auto inst = m_program.begin() + *reinterpret_cast<const RegexProgram::Offset*>(thread.inst);
|
||||||
// if instruction is already going to be executed, drop this thread
|
// if instruction is already going to be executed, drop this thread
|
||||||
if (std::find(m_threads.begin(), m_threads.end(), inst) != m_threads.end())
|
if (std::find_if(m_threads.begin(), m_threads.end(),
|
||||||
return { StepResult::Failed };
|
[inst](const Thread& t) { return t.inst == inst; }) != m_threads.end())
|
||||||
|
return StepResult::Failed;
|
||||||
|
thread.inst = inst;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case RegexProgram::Split:
|
case RegexProgram::Split:
|
||||||
{
|
{
|
||||||
add_thread(*reinterpret_cast<const RegexProgram::Offset*>(inst));
|
add_thread(*reinterpret_cast<const RegexProgram::Offset*>(thread.inst), thread.saves);
|
||||||
inst += sizeof(RegexProgram::Offset);
|
// thread is invalidated now, as we mutated the m_thread vector
|
||||||
|
m_threads[thread_index].inst += sizeof(RegexProgram::Offset);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case RegexProgram::Save:
|
||||||
|
{
|
||||||
|
const char index = *thread.inst++;
|
||||||
|
thread.saves[index] = m_pos;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case RegexProgram::LineStart:
|
case RegexProgram::LineStart:
|
||||||
if (not is_line_start())
|
if (not is_line_start())
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case RegexProgram::LineEnd:
|
case RegexProgram::LineEnd:
|
||||||
if (not is_line_end())
|
if (not is_line_end())
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case RegexProgram::WordBoundary:
|
case RegexProgram::WordBoundary:
|
||||||
if (not is_word_boundary())
|
if (not is_word_boundary())
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case RegexProgram::NotWordBoundary:
|
case RegexProgram::NotWordBoundary:
|
||||||
if (is_word_boundary())
|
if (is_word_boundary())
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case RegexProgram::SubjectBegin:
|
case RegexProgram::SubjectBegin:
|
||||||
if (m_pos != m_subject.begin())
|
if (m_pos != m_subject.begin())
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case RegexProgram::SubjectEnd:
|
case RegexProgram::SubjectEnd:
|
||||||
if (m_pos != m_subject.end())
|
if (m_pos != m_subject.end())
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case RegexProgram::Match:
|
case RegexProgram::Match:
|
||||||
return { StepResult::Matched };
|
return StepResult::Matched;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return { StepResult::Failed };
|
return StepResult::Failed;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool match(ConstArrayView<char> program, StringView data)
|
bool match(ConstArrayView<char> program, StringView data)
|
||||||
{
|
{
|
||||||
m_threads = Vector<const char*>{program.begin()};
|
m_threads.clear();
|
||||||
|
add_thread(0, Vector<const char*>(10, nullptr));
|
||||||
|
|
||||||
m_subject = data;
|
m_subject = data;
|
||||||
m_pos = data.begin();
|
m_pos = data.begin();
|
||||||
|
|
||||||
|
@ -468,30 +507,39 @@ struct ThreadedExecutor
|
||||||
{
|
{
|
||||||
for (int i = 0; i < m_threads.size(); ++i)
|
for (int i = 0; i < m_threads.size(); ++i)
|
||||||
{
|
{
|
||||||
auto res = step(m_threads[i]);
|
const auto res = step(i);
|
||||||
m_threads[i] = res.next;
|
if (res == StepResult::Matched)
|
||||||
if (res.result == StepResult::Matched)
|
{
|
||||||
|
m_captures = std::move(m_threads[i].saves);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
m_threads.erase(std::remove(m_threads.begin(), m_threads.end(), nullptr), m_threads.end());
|
else if (res == StepResult::Failed)
|
||||||
|
m_threads[i].inst = nullptr;
|
||||||
|
}
|
||||||
|
m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
|
||||||
|
[](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
|
||||||
if (m_threads.empty())
|
if (m_threads.empty())
|
||||||
break;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step remaining threads to see if they match without consuming anything else
|
// Step remaining threads to see if they match without consuming anything else
|
||||||
for (int i = 0; i < m_threads.size(); ++i)
|
for (int i = 0; i < m_threads.size(); ++i)
|
||||||
{
|
{
|
||||||
if (step(m_threads[i]).result == StepResult::Matched)
|
if (step(i) == StepResult::Matched)
|
||||||
|
{
|
||||||
|
m_captures = std::move(m_threads[i].saves);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_thread(RegexProgram::Offset pos)
|
void add_thread(RegexProgram::Offset pos, Vector<const char*> saves)
|
||||||
{
|
{
|
||||||
const char* inst = m_program.begin() + pos;
|
const char* inst = m_program.begin() + pos;
|
||||||
if (std::find(m_threads.begin(), m_threads.end(), inst) == m_threads.end())
|
if (std::find_if(m_threads.begin(), m_threads.end(),
|
||||||
m_threads.push_back(inst);
|
[inst](const Thread& t) { return t.inst == inst; }) == m_threads.end())
|
||||||
|
m_threads.push_back({inst, std::move(saves)});
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_line_start() const
|
bool is_line_start() const
|
||||||
|
@ -512,7 +560,8 @@ struct ThreadedExecutor
|
||||||
}
|
}
|
||||||
|
|
||||||
ConstArrayView<char> m_program;
|
ConstArrayView<char> m_program;
|
||||||
Vector<const char*> m_threads;
|
Vector<Thread> m_threads;
|
||||||
|
Vector<const char*> m_captures;
|
||||||
StringView m_subject;
|
StringView m_subject;
|
||||||
const char* m_pos;
|
const char* m_pos;
|
||||||
};
|
};
|
||||||
|
@ -549,6 +598,7 @@ auto test_regex = UnitTest{[]{
|
||||||
RegexProgram::dump(program);
|
RegexProgram::dump(program);
|
||||||
Exec exec{program};
|
Exec exec{program};
|
||||||
kak_assert(exec.match(program, "fooquxbarbaz"));
|
kak_assert(exec.match(program, "fooquxbarbaz"));
|
||||||
|
kak_assert(StringView{exec.m_captures[2], exec.m_captures[3]} == "qux");
|
||||||
kak_assert(not exec.match(program, "fooquxbarbaze"));
|
kak_assert(not exec.match(program, "fooquxbarbaze"));
|
||||||
kak_assert(not exec.match(program, "quxbar"));
|
kak_assert(not exec.match(program, "quxbar"));
|
||||||
kak_assert(not exec.match(program, "blahblah"));
|
kak_assert(not exec.match(program, "blahblah"));
|
||||||
|
@ -562,6 +612,7 @@ auto test_regex = UnitTest{[]{
|
||||||
RegexProgram::dump(program);
|
RegexProgram::dump(program);
|
||||||
Exec exec{program};
|
Exec exec{program};
|
||||||
kak_assert(exec.match(program, "qux foo baz"));
|
kak_assert(exec.match(program, "qux foo baz"));
|
||||||
|
kak_assert(StringView{exec.m_captures[2], exec.m_captures[3]} == "foo");
|
||||||
kak_assert(not exec.match(program, "quxfoobaz"));
|
kak_assert(not exec.match(program, "quxfoobaz"));
|
||||||
kak_assert(exec.match(program, "bar"));
|
kak_assert(exec.match(program, "bar"));
|
||||||
kak_assert(not exec.match(program, "foobar"));
|
kak_assert(not exec.match(program, "foobar"));
|
||||||
|
|
Loading…
Reference in New Issue
Block a user