Regex: Implement lookarounds for fixed literal strings

We do not support anything other than a plain literal string in
lookarounds.
This commit is contained in:
Maxime Coste 2017-09-28 20:43:45 +08:00
parent e96cd29f0e
commit 0bdfdac5c5

View File

@ -29,6 +29,10 @@ struct ParsedRegex
SubjectBegin, SubjectBegin,
SubjectEnd, SubjectEnd,
ResetStart, ResetStart,
LookAhead,
LookBehind,
NegativeLookAhead,
NegativeLookBehind,
}; };
struct Quantifier struct Quantifier
@ -116,9 +120,9 @@ private:
return res; return res;
} }
AstNodePtr alternative() AstNodePtr alternative(ParsedRegex::Op op = ParsedRegex::Sequence)
{ {
AstNodePtr res = new_node(ParsedRegex::Sequence); AstNodePtr res = new_node(op);
while (auto node = term()) while (auto node = term())
res->children.push_back(std::move(node)); res->children.push_back(std::move(node));
if (res->children.empty()) if (res->children.empty())
@ -175,8 +179,44 @@ private:
case '.': ++m_pos; return new_node(ParsedRegex::AnyChar); case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
case '(': case '(':
{ {
++m_pos; auto advance = [&]() {
auto content = disjunction(m_parsed_regex.capture_count++); if (++m_pos == m_regex.end())
parse_error("unclosed parenthesis");
return *m_pos;
};
AstNodePtr content;
if (advance() == '?')
{
auto c = advance();
if (c == ':')
content = disjunction(-1);
else if (contains("=!<", c))
{
bool behind = false;
if (c == '<')
{
advance();
behind = true;
}
auto type = *m_pos++;
if (type == '=')
content = alternative(behind ? ParsedRegex::LookBehind
: ParsedRegex::LookAhead);
else if (type == '!')
content = alternative(behind ? ParsedRegex::NegativeLookBehind
: ParsedRegex::NegativeLookAhead);
else
parse_error("invalid disjunction");
validate_lookaround(content);
}
else
parse_error("invalid disjunction");
}
else
content = disjunction(m_parsed_regex.capture_count++);
if (at_end() or *m_pos != ')') if (at_end() or *m_pos != ')')
parse_error("unclosed parenthesis"); parse_error("unclosed parenthesis");
@ -245,7 +285,7 @@ private:
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return new_node(ParsedRegex::Literal, cp); return new_node(ParsedRegex::Literal, cp);
parse_error("unknown atom escape"); parse_error(format("unknown atom escape '{}'", cp));
} }
AstNodePtr character_class() AstNodePtr character_class()
@ -395,6 +435,13 @@ private:
StringView{m_pos.base(), m_regex.end()})); StringView{m_pos.base(), m_regex.end()}));
} }
// Checks the direct children of a lookaround node: each one must be a
// ParsedRegex::Literal. The first child with any other op is reported
// through parse_error(). Only direct children are inspected.
void validate_lookaround(const AstNodePtr& node)
{
    for (const auto& element : node->children)
    {
        if (element->op == ParsedRegex::Literal)
            continue;
        parse_error("Lookaround can only contain literals");
    }
}
ParsedRegex m_parsed_regex; ParsedRegex m_parsed_regex;
StringView m_regex; StringView m_regex;
Iterator m_pos; Iterator m_pos;
@ -406,6 +453,11 @@ private:
bool neg; bool neg;
}; };
// Builds a view spanning from the current parse position (m_pos.base())
// to the end of the regex text, and returns substr(0, count) of it.
// Does not move m_pos.
StringView peek(ByteCount count)
{
    StringView remaining{m_pos.base(), m_regex.end()};
    return remaining.substr(0, count);
}
static const CharacterClassEscape character_class_escapes[8]; static const CharacterClassEscape character_class_escapes[8];
}; };
@ -439,6 +491,10 @@ struct CompiledRegex
NotWordBoundary, NotWordBoundary,
SubjectBegin, SubjectBegin,
SubjectEnd, SubjectEnd,
LookAhead,
LookBehind,
NegativeLookAhead,
NegativeLookBehind,
}; };
using Offset = unsigned; using Offset = unsigned;
@ -516,6 +572,22 @@ private:
break; break;
} }
case ParsedRegex::LookAhead:
push_op(CompiledRegex::LookAhead);
push_string(node->children);
break;
case ParsedRegex::LookBehind:
push_op(CompiledRegex::LookBehind);
push_string(node->children, true);
break;
case ParsedRegex::NegativeLookAhead:
push_op(CompiledRegex::NegativeLookAhead);
push_string(node->children);
break;
case ParsedRegex::NegativeLookBehind:
push_op(CompiledRegex::NegativeLookBehind);
push_string(node->children, true);
break;
case ParsedRegex::LineStart: case ParsedRegex::LineStart:
push_op(CompiledRegex::LineStart); push_op(CompiledRegex::LineStart);
break; break;
@ -631,6 +703,20 @@ private:
utf8::dump(std::back_inserter(m_program.bytecode), cp); utf8::dump(std::back_inserter(m_program.bytecode), cp);
} }
// Emits a length-prefixed literal string: one byte holding the number of
// nodes, followed by each node's `value` written with push_codepoint().
// When `reversed` is true the nodes are emitted last to first.
// Throws runtime_error if there are more than 127 nodes.
void push_string(const Vector<ParsedRegex::AstNodePtr>& codepoints, bool reversed = false)
{
    const auto length = codepoints.size();
    if (length > 127)
        throw runtime_error{"Too long literal string"};

    push_byte(length);

    if (not reversed)
    {
        for (auto& node : codepoints)
            push_codepoint(node->value);
        return;
    }

    for (auto& node : codepoints | reverse())
        push_codepoint(node->value);
}
CompiledRegex m_program; CompiledRegex m_program;
const ParsedRegex& m_parsed_regex; const ParsedRegex& m_parsed_regex;
}; };
@ -687,6 +773,27 @@ void dump_regex(const CompiledRegex& program)
case CompiledRegex::SubjectEnd: case CompiledRegex::SubjectEnd:
printf("subject end\n"); printf("subject end\n");
break; break;
case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead:
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
{
int count = *pos++;
StringView str{pos, pos + count};
const char* name = nullptr;
if (op == CompiledRegex::LookAhead)
name = "look ahead";
if (op == CompiledRegex::NegativeLookAhead)
name = "negative look ahead";
if (op == CompiledRegex::LookBehind)
name = "look behind";
if (op == CompiledRegex::NegativeLookBehind)
name = "negative look behind";
printf("%s (%s)\n", name, (const char*)str.zstr());
pos += count;
break;
}
case CompiledRegex::Match: case CompiledRegex::Match:
printf("match\n"); printf("match\n");
} }
@ -783,6 +890,32 @@ struct ThreadedRegexVM
if (m_pos != m_end) if (m_pos != m_end)
return StepResult::Failed; return StepResult::Failed;
break; break;
case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead:
{
int count = *thread.inst++;
for (auto it = m_pos; count and it != m_end; ++it, --count)
if (*it != utf8::read(thread.inst))
break;
if ((op == CompiledRegex::LookAhead and count != 0) or
(op == CompiledRegex::NegativeLookAhead and count == 0))
return StepResult::Failed;
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
break;
}
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
{
int count = *thread.inst++;
for (auto it = m_pos-1; count and it >= m_begin; --it, --count)
if (*it != utf8::read(thread.inst))
break;
if ((op == CompiledRegex::LookBehind and count != 0) or
(op == CompiledRegex::NegativeLookBehind and count == 0))
return StepResult::Failed;
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
break;
}
case CompiledRegex::Match: case CompiledRegex::Match:
thread.inst = nullptr; thread.inst = nullptr;
return StepResult::Matched; return StepResult::Matched;
@ -823,7 +956,7 @@ struct ThreadedRegexVM
m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(), m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
[](const Thread& t) { return t.inst == nullptr; }), m_threads.end()); [](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
if (m_threads.empty()) if (m_threads.empty())
return false; return found_match;
} }
// Step remaining threads to see if they match without consuming anything else // Step remaining threads to see if they match without consuming anything else
@ -1034,6 +1167,30 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("foooo", true, true)); kak_assert(vm.exec("foooo", true, true));
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "fo"); kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "fo");
} }
{
TestVM vm{R"((?=foo).)"};
kak_assert(vm.exec("barfoo", false, true));
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "f");
}
{
TestVM vm{R"((?!foo)...)"};
kak_assert(not vm.exec("foo"));
kak_assert(vm.exec("qux"));
}
{
TestVM vm{R"(...(?<=foo))"};
kak_assert(vm.exec("foo"));
kak_assert(not vm.exec("qux"));
}
{
TestVM vm{R"(...(?<!foo))"};
kak_assert(not vm.exec("foo"));
kak_assert(vm.exec("qux"));
}
}}; }};
} }