Regex: Add support for curly braces count expressions
This commit is contained in:
parent
d04c60b911
commit
46a113e10a
|
@ -31,12 +31,32 @@ using Offset = size_t;
|
||||||
|
|
||||||
namespace RegexCompiler
|
namespace RegexCompiler
|
||||||
{
|
{
|
||||||
// Repetition specification attached to a regex AST node.
//
// Deliberately kept a plain aggregate (no constructors) so that it can be
// brace-initialised as {type} or {Quantifier::RepeatMinMax, min, max}.
struct Quantifier
{
    enum Type
    {
        One,              // no quantifier
        Optional,         // '?'
        RepeatZeroOrMore, // '*'
        RepeatOneOrMore,  // '+'
        RepeatMinMax,     // '{min,max}'
    };

    Type type = One;

    // Bounds used by RepeatMinMax; -1 means the bound was not specified.
    int min = -1;
    int max = -1;

    // True when zero occurrences of the node satisfy this quantifier.
    constexpr bool allows_none() const
    {
        return type == Quantifier::Optional or
               type == Quantifier::RepeatZeroOrMore or
               (type == Quantifier::RepeatMinMax and min <= 0);
    }

    // True when the number of occurrences has no upper bound.
    constexpr bool allows_infinite_repeat() const
    {
        return type == Quantifier::RepeatZeroOrMore or
               type == Quantifier::RepeatOneOrMore or
               (type == Quantifier::RepeatMinMax and max == -1);
    }
};
|
||||||
|
|
||||||
enum class Op
|
enum class Op
|
||||||
|
@ -64,7 +84,7 @@ struct AstNode
|
||||||
using AstNodePtr = std::unique_ptr<AstNode>;
|
using AstNodePtr = std::unique_ptr<AstNode>;
|
||||||
|
|
||||||
// Heap-allocates an AstNode built from op, value and quantifier (its last
// member is value-initialised) and hands ownership to the returned AstNodePtr.
AstNodePtr make_ast_node(Op op, char value = 0,
                         Quantifier quantifier = {Quantifier::One})
{
    // AstNode is brace-initialised as an aggregate here, hence the explicit new.
    auto* raw_node = new AstNode{op, value, quantifier, {}};
    return AstNodePtr{raw_node};
}
|
||||||
|
@ -157,49 +177,62 @@ private:
|
||||||
|
|
||||||
// Parses the quantifier (if any) found at pos.
//
// On success pos is advanced past the quantifier. When there is no
// quantifier at pos (including pos == end), pos is left untouched and
// {Quantifier::One} is returned.
//
// Accepted forms: '*', '+', '?', '{n}', '{n,}', '{,m}' and '{n,m}'.
// For the brace forms a missing bound is reported as -1; '{n}' yields
// min == max == n, i.e. an exact count.
//
// Throws runtime_error when a '{' expression is not closed by '}'.
static Quantifier quantifier(Iterator& pos, Iterator end)
{
    // Reads an unsigned decimal number at cur, advancing cur past its
    // digits. Returns -1, without moving cur, when there is no digit.
    auto read_int = [](Iterator& cur, Iterator end) {
        auto is_digit = [](decltype(*cur) c) { return c >= '0' and c <= '9'; };
        if (cur == end or not is_digit(*cur))
            return -1;

        int res = 0;
        for (; cur != end and is_digit(*cur); ++cur)
            res = res * 10 + (*cur - '0');
        return res;
    };

    if (pos == end)
        return {Quantifier::One};

    switch (*pos)
    {
        case '*': ++pos; return {Quantifier::RepeatZeroOrMore};
        case '+': ++pos; return {Quantifier::RepeatOneOrMore};
        case '?': ++pos; return {Quantifier::Optional};
        case '{':
        {
            // Work on a copy so pos is only committed once the whole
            // expression has been validated.
            auto it = pos+1;
            const int min = read_int(it, end);
            // Without a comma the count is exact: '{n}' == '{n,n}'.
            int max = min;
            if (it != end and *it == ',')
            {
                ++it;
                max = read_int(it, end);
            }
            if (it == end or *it++ != '}')
                throw runtime_error{"expected closing bracket"};
            pos = it;
            return {Quantifier::RepeatMinMax, min, max};
        }
        default: return {Quantifier::One};
    }
}
|
||||||
};
|
};
|
||||||
|
|
||||||
auto is_repeat = [](Quantifier quantifier) {
|
RegexProgram::Offset compile_node(Vector<char>& program, const AstNodePtr& node);
|
||||||
return quantifier == Quantifier::RepeatZeroOrMore or
|
|
||||||
quantifier == Quantifier::RepeatOneOrMore;
|
|
||||||
};
|
|
||||||
|
|
||||||
auto alloc_offset = [](Vector<char>& instructions) {
|
RegexProgram::Offset alloc_offset(Vector<char>& instructions)
|
||||||
|
{
|
||||||
auto pos = instructions.size();
|
auto pos = instructions.size();
|
||||||
instructions.resize(instructions.size() + sizeof(RegexProgram::Offset));
|
instructions.resize(instructions.size() + sizeof(RegexProgram::Offset));
|
||||||
return pos;
|
return pos;
|
||||||
};
|
|
||||||
|
|
||||||
auto get_offset = [](Vector<char>& instructions, RegexProgram::Offset base) -> RegexProgram::Offset& {
|
|
||||||
return *reinterpret_cast<RegexProgram::Offset*>(&instructions[base]);
|
|
||||||
};
|
|
||||||
|
|
||||||
RegexProgram::Offset optional_offset = -1;
|
|
||||||
if (allow_none(node->quantifier))
|
|
||||||
{
|
|
||||||
program.push_back(RegexProgram::Split);
|
|
||||||
optional_offset = alloc_offset(program);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Vector<RegexProgram::Offset> goto_end_offsets;
|
// Returns a mutable reference to the RegexProgram::Offset stored in
// instructions starting at byte index base.
// NOTE(review): this reinterprets raw char storage as an Offset, so base is
// presumably expected to come from alloc_offset; alignment is not checked here.
RegexProgram::Offset& get_offset(Vector<char>& instructions, RegexProgram::Offset base)
{
    auto* const slot = &instructions[base];
    return *reinterpret_cast<RegexProgram::Offset*>(slot);
}
|
||||||
|
|
||||||
|
RegexProgram::Offset compile_node_inner(Vector<char>& program, const AstNodePtr& node)
|
||||||
|
{
|
||||||
|
const auto start_pos = program.size();
|
||||||
|
|
||||||
|
Vector<RegexProgram::Offset> goto_inner_end_offsets;
|
||||||
switch (node->op)
|
switch (node->op)
|
||||||
{
|
{
|
||||||
case Op::Literal:
|
case Op::Literal:
|
||||||
|
@ -223,7 +256,7 @@ RegexProgram::Offset compile_node(Vector<char>& program, const AstNodePtr& node)
|
||||||
|
|
||||||
compile_node(program, children[0]);
|
compile_node(program, children[0]);
|
||||||
program.push_back(RegexProgram::Jump);
|
program.push_back(RegexProgram::Jump);
|
||||||
goto_end_offsets.push_back(alloc_offset(program));
|
goto_inner_end_offsets.push_back(alloc_offset(program));
|
||||||
|
|
||||||
auto right_pos = compile_node(program, children[1]);
|
auto right_pos = compile_node(program, children[1]);
|
||||||
get_offset(program, offset) = right_pos;
|
get_offset(program, offset) = right_pos;
|
||||||
|
@ -250,17 +283,44 @@ RegexProgram::Offset compile_node(Vector<char>& program, const AstNodePtr& node)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto& offset : goto_end_offsets)
|
for (auto& offset : goto_inner_end_offsets)
|
||||||
get_offset(program, offset) = program.size();
|
get_offset(program, offset) = program.size();
|
||||||
|
|
||||||
if (is_repeat(node->quantifier))
|
return start_pos;
|
||||||
{
|
|
||||||
program.push_back(RegexProgram::Split);
|
|
||||||
get_offset(program, alloc_offset(program)) = content_pos;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (optional_offset != -1)
|
// Emits the instructions for node, expanded according to its quantifier,
// at the end of program and returns the offset at which they start.
//
// Layout produced:
//  - when the quantifier allows zero occurrences, a leading Split whose
//    offset operand is patched to the end of everything emitted here;
//  - the node body, written max(1, min) times;
//  - for unbounded quantifiers, a trailing Split whose operand is the
//    start of the last body copy;
//  - otherwise, one (Split-to-end, body) pair per extra occurrence
//    permitted up to max.
RegexProgram::Offset compile_node(Vector<char>& program, const AstNodePtr& node)
{
    const RegexProgram::Offset pos = program.size();
    const auto& quantifier = node->quantifier;

    // A zero upper bound ('{,0}' / '{0,0}') permits no occurrence at all:
    // emit nothing so that the node cannot consume any input.
    if (quantifier.type == Quantifier::RepeatMinMax and quantifier.max == 0)
        return pos;

    // Offsets of Split operands that must point past this node once its
    // full extent is known.
    Vector<RegexProgram::Offset> goto_end_offsets;

    if (quantifier.allows_none())
    {
        program.push_back(RegexProgram::Split);
        goto_end_offsets.push_back(alloc_offset(program));
    }

    auto inner_pos = compile_node_inner(program, node);
    // Write the node multiple times when we have a min count quantifier
    for (int i = 1; i < quantifier.min; ++i)
        inner_pos = compile_node_inner(program, node);

    if (quantifier.allows_infinite_repeat())
    {
        // Loop back to the last copy of the body.
        program.push_back(RegexProgram::Split);
        get_offset(program, alloc_offset(program)) = inner_pos;
    }
    else
    {
        // Write the node as an optional match for the min -> max counts
        for (int count = std::max(1, quantifier.min); count < quantifier.max; ++count)
        {
            program.push_back(RegexProgram::Split);
            goto_end_offsets.push_back(alloc_offset(program));
            compile_node_inner(program, node);
        }
    }

    for (auto offset : goto_end_offsets)
        get_offset(program, offset) = program.size();

    return pos;
}
|
||||||
|
@ -513,6 +573,38 @@ auto test_regex = UnitTest{[]{
|
||||||
kak_assert(exec.match(program, "bar"));
|
kak_assert(exec.match(program, "bar"));
|
||||||
kak_assert(not exec.match(program, "foobar"));
|
kak_assert(not exec.match(program, "foobar"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
StringView re = R"(\`a{3,5}b\')";
|
||||||
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
|
RegexProgram::dump(program);
|
||||||
|
Exec exec{program};
|
||||||
|
kak_assert(not exec.match(program, "aab"));
|
||||||
|
kak_assert(exec.match(program, "aaab"));
|
||||||
|
kak_assert(not exec.match(program, "aaaaaab"));
|
||||||
|
kak_assert(exec.match(program, "aaaaab"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
StringView re = R"(\`a{3,}b\')";
|
||||||
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
|
RegexProgram::dump(program);
|
||||||
|
Exec exec{program};
|
||||||
|
kak_assert(not exec.match(program, "aab"));
|
||||||
|
kak_assert(exec.match(program, "aaab"));
|
||||||
|
kak_assert(exec.match(program, "aaaaab"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
StringView re = R"(\`a{,3}b\')";
|
||||||
|
auto program = RegexCompiler::compile(re.begin(), re.end());
|
||||||
|
RegexProgram::dump(program);
|
||||||
|
Exec exec{program};
|
||||||
|
kak_assert(exec.match(program, "b"));
|
||||||
|
kak_assert(exec.match(program, "ab"));
|
||||||
|
kak_assert(exec.match(program, "aaab"));
|
||||||
|
kak_assert(not exec.match(program, "aaaab"));
|
||||||
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user