Regex: Use a std::function based "Matcher" op to implement character classes

This is more extensible and should make it easier to support character
classes that are not expressed as ranges.
This commit is contained in:
Maxime Coste 2017-09-26 18:03:12 +09:00
parent eb1015cdfb
commit be157453ad

View File

@ -20,8 +20,7 @@ struct CompiledRegex
Match, Match,
Literal, Literal,
AnyChar, AnyChar,
CharRange, Matcher,
NegativeCharRange,
Jump, Jump,
Split_PrioritizeParent, Split_PrioritizeParent,
Split_PrioritizeChild, Split_PrioritizeChild,
@ -37,6 +36,7 @@ struct CompiledRegex
using Offset = unsigned; using Offset = unsigned;
Vector<char> bytecode; Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count; size_t save_count;
}; };
@ -75,8 +75,7 @@ enum class Op
{ {
Literal, Literal,
AnyChar, AnyChar,
CharRange, Matcher,
NegativeCharRange,
Sequence, Sequence,
Alternation, Alternation,
LineStart, LineStart,
@ -103,7 +102,7 @@ struct ParsedRegex
{ {
AstNodePtr ast; AstNodePtr ast;
size_t capture_count; size_t capture_count;
Vector<Vector<CharRange>> ranges; Vector<std::function<bool (Codepoint)>> matchers;
}; };
AstNodePtr make_ast_node(Op op, Codepoint value = -1, AstNodePtr make_ast_node(Op op, Codepoint value = -1,
@ -261,14 +260,14 @@ private:
const auto cp = *pos++; const auto cp = *pos++;
if (cp == '-') if (cp == '-')
{ {
ranges.push_back({ '-', 0 }); ranges.push_back({ '-', '-' });
continue; continue;
} }
if (pos == end) if (pos == end)
break; break;
CharRange range = { cp, 0 }; CharRange range = { cp, cp };
if (*pos == '-') if (*pos == '-')
{ {
if (++pos == end) if (++pos == end)
@ -283,10 +282,17 @@ private:
throw runtime_error{"Unclosed character class"}; throw runtime_error{"Unclosed character class"};
++pos; ++pos;
auto ranges_id = parsed_regex.ranges.size(); auto matcher = [negative, ranges = std::move(ranges)](Codepoint cp) {
parsed_regex.ranges.push_back(std::move(ranges)); auto found = contains_that(ranges, [cp](auto& r) {
return r.min <= cp and cp <= r.max;
});
return negative ? not found : found;
};
return make_ast_node(negative ? Op::NegativeCharRange : Op::CharRange, ranges_id); auto matcher_id = parsed_regex.matchers.size();
parsed_regex.matchers.push_back(std::move(matcher));
return make_ast_node(Op::Matcher, matcher_id);
} }
static Quantifier quantifier(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) static Quantifier quantifier(ParsedRegex& parsed_regex, Iterator& pos, Iterator end)
@ -371,32 +377,9 @@ CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedReg
case Op::AnyChar: case Op::AnyChar:
program.bytecode.push_back(CompiledRegex::AnyChar); program.bytecode.push_back(CompiledRegex::AnyChar);
break; break;
case Op::CharRange: case Op::NegativeCharRange: case Op::Matcher:
{ program.bytecode.push_back(CompiledRegex::Matcher);
auto& ranges = parsed_regex.ranges[node->value]; program.bytecode.push_back(node->value);
size_t single_count = std::count_if(ranges.begin(), ranges.end(),
[](auto& r) { return r.max == 0; });
program.bytecode.push_back(node->op == Op::CharRange ?
CompiledRegex::CharRange
: CompiledRegex::NegativeCharRange);
program.bytecode.push_back((char)single_count);
program.bytecode.push_back((char)(ranges.size() - single_count));
for (auto& r : ranges)
{
if (r.max == 0)
push_codepoint(program, r.min);
}
for (auto& r : ranges)
{
if (r.max != 0)
{
push_codepoint(program, r.min);
push_codepoint(program, r.max);
}
}
break;
}
case Op::Sequence: case Op::Sequence:
for (auto& child : node->children) for (auto& child : node->children)
compile_node(program, parsed_regex, child); compile_node(program, parsed_regex, child);
@ -505,6 +488,7 @@ CompiledRegex compile(const ParsedRegex& parsed_regex)
write_search_prefix(res); write_search_prefix(res);
compile_node(res, parsed_regex, parsed_regex.ast); compile_node(res, parsed_regex, parsed_regex.ast);
res.bytecode.push_back(CompiledRegex::Match); res.bytecode.push_back(CompiledRegex::Match);
res.matchers = parsed_regex.matchers;
res.save_count = parsed_regex.capture_count * 2; res.save_count = parsed_regex.capture_count * 2;
return res; return res;
} }
@ -547,24 +531,9 @@ void dump(const CompiledRegex& program)
case CompiledRegex::Save: case CompiledRegex::Save:
printf("save %d\n", *pos++); printf("save %d\n", *pos++);
break; break;
case CompiledRegex::CharRange: case CompiledRegex::NegativeCharRange: case CompiledRegex::Matcher:
{ printf("matcher %d\n", *pos++);
printf("%schar range, [", op == CompiledRegex::NegativeCharRange ? "negative " : "");
auto single_count = *pos++;
auto range_count = *pos++;
for (int i = 0; i < single_count; ++i)
printf("%lc", utf8::read_codepoint(pos, (const char*)nullptr));
printf("]");
for (int i = 0; i < range_count; ++i)
{
Codepoint min = utf8::read_codepoint(pos, (const char*)nullptr);
Codepoint max = utf8::read_codepoint(pos, (const char*)nullptr);
printf(" [%lc-%lc]", min, max);
}
printf("\n");
break; break;
}
case CompiledRegex::LineStart: case CompiledRegex::LineStart:
printf("line start\n"); printf("line start\n");
break; break;
@ -649,30 +618,11 @@ struct ThreadedRegexVM
thread.saves[index] = m_pos.base(); thread.saves[index] = m_pos.base();
break; break;
} }
case CompiledRegex::CharRange: case CompiledRegex::NegativeCharRange: case CompiledRegex::Matcher:
{ {
const int single_count = *thread.inst++; const int matcher_id = *thread.inst++;
const int range_count = *thread.inst++; return m_program.matchers[matcher_id](*m_pos) ?
for (int i = 0; i < single_count; ++i) StepResult::Consumed : StepResult::Failed;
{
auto candidate = utf8::read_codepoint(thread.inst, prog_end);
if (cp == candidate)
{
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{single_count - (i + 1) + range_count * 2});
return op == CompiledRegex::CharRange ? StepResult::Consumed : StepResult::Failed;
}
}
for (int i = 0; i < range_count; ++i)
{
auto min = utf8::read_codepoint(thread.inst, prog_end);
auto max = utf8::read_codepoint(thread.inst, prog_end);
if (min <= cp and cp <= max)
{
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{(range_count - (i + 1)) * 2});
return op == CompiledRegex::CharRange ? StepResult::Consumed : StepResult::Failed;
}
}
return op == CompiledRegex::CharRange ? StepResult::Failed : StepResult::Consumed;
} }
case CompiledRegex::LineStart: case CompiledRegex::LineStart:
if (not is_line_start()) if (not is_line_start())