Regex: Implement lookarounds for fixed literal strings
We do not support anything else than a plain literal string for lookarounds.
This commit is contained in:
parent
e96cd29f0e
commit
0bdfdac5c5
|
@ -29,6 +29,10 @@ struct ParsedRegex
|
||||||
SubjectBegin,
|
SubjectBegin,
|
||||||
SubjectEnd,
|
SubjectEnd,
|
||||||
ResetStart,
|
ResetStart,
|
||||||
|
LookAhead,
|
||||||
|
LookBehind,
|
||||||
|
NegativeLookAhead,
|
||||||
|
NegativeLookBehind,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Quantifier
|
struct Quantifier
|
||||||
|
@ -116,9 +120,9 @@ private:
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
AstNodePtr alternative()
|
AstNodePtr alternative(ParsedRegex::Op op = ParsedRegex::Sequence)
|
||||||
{
|
{
|
||||||
AstNodePtr res = new_node(ParsedRegex::Sequence);
|
AstNodePtr res = new_node(op);
|
||||||
while (auto node = term())
|
while (auto node = term())
|
||||||
res->children.push_back(std::move(node));
|
res->children.push_back(std::move(node));
|
||||||
if (res->children.empty())
|
if (res->children.empty())
|
||||||
|
@ -175,8 +179,44 @@ private:
|
||||||
case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
|
case '.': ++m_pos; return new_node(ParsedRegex::AnyChar);
|
||||||
case '(':
|
case '(':
|
||||||
{
|
{
|
||||||
++m_pos;
|
auto advance = [&]() {
|
||||||
auto content = disjunction(m_parsed_regex.capture_count++);
|
if (++m_pos == m_regex.end())
|
||||||
|
parse_error("unclosed parenthesis");
|
||||||
|
return *m_pos;
|
||||||
|
};
|
||||||
|
|
||||||
|
AstNodePtr content;
|
||||||
|
if (advance() == '?')
|
||||||
|
{
|
||||||
|
auto c = advance();
|
||||||
|
if (c == ':')
|
||||||
|
content = disjunction(-1);
|
||||||
|
else if (contains("=!<", c))
|
||||||
|
{
|
||||||
|
bool behind = false;
|
||||||
|
if (c == '<')
|
||||||
|
{
|
||||||
|
advance();
|
||||||
|
behind = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto type = *m_pos++;
|
||||||
|
if (type == '=')
|
||||||
|
content = alternative(behind ? ParsedRegex::LookBehind
|
||||||
|
: ParsedRegex::LookAhead);
|
||||||
|
else if (type == '!')
|
||||||
|
content = alternative(behind ? ParsedRegex::NegativeLookBehind
|
||||||
|
: ParsedRegex::NegativeLookAhead);
|
||||||
|
else
|
||||||
|
parse_error("invalid disjunction");
|
||||||
|
|
||||||
|
validate_lookaround(content);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
parse_error("invalid disjunction");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
content = disjunction(m_parsed_regex.capture_count++);
|
||||||
|
|
||||||
if (at_end() or *m_pos != ')')
|
if (at_end() or *m_pos != ')')
|
||||||
parse_error("unclosed parenthesis");
|
parse_error("unclosed parenthesis");
|
||||||
|
@ -245,7 +285,7 @@ private:
|
||||||
|
|
||||||
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
|
||||||
return new_node(ParsedRegex::Literal, cp);
|
return new_node(ParsedRegex::Literal, cp);
|
||||||
parse_error("unknown atom escape");
|
parse_error(format("unknown atom escape '{}'", cp));
|
||||||
}
|
}
|
||||||
|
|
||||||
AstNodePtr character_class()
|
AstNodePtr character_class()
|
||||||
|
@ -395,6 +435,13 @@ private:
|
||||||
StringView{m_pos.base(), m_regex.end()}));
|
StringView{m_pos.base(), m_regex.end()}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void validate_lookaround(const AstNodePtr& node)
|
||||||
|
{
|
||||||
|
for (auto& child : node->children)
|
||||||
|
if (child->op != ParsedRegex::Literal)
|
||||||
|
parse_error("Lookaround can only contain literals");
|
||||||
|
}
|
||||||
|
|
||||||
ParsedRegex m_parsed_regex;
|
ParsedRegex m_parsed_regex;
|
||||||
StringView m_regex;
|
StringView m_regex;
|
||||||
Iterator m_pos;
|
Iterator m_pos;
|
||||||
|
@ -406,6 +453,11 @@ private:
|
||||||
bool neg;
|
bool neg;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
StringView peek(ByteCount count)
|
||||||
|
{
|
||||||
|
return StringView{m_pos.base(), m_regex.end()}.substr(0, count);
|
||||||
|
}
|
||||||
|
|
||||||
static const CharacterClassEscape character_class_escapes[8];
|
static const CharacterClassEscape character_class_escapes[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -439,6 +491,10 @@ struct CompiledRegex
|
||||||
NotWordBoundary,
|
NotWordBoundary,
|
||||||
SubjectBegin,
|
SubjectBegin,
|
||||||
SubjectEnd,
|
SubjectEnd,
|
||||||
|
LookAhead,
|
||||||
|
LookBehind,
|
||||||
|
NegativeLookAhead,
|
||||||
|
NegativeLookBehind,
|
||||||
};
|
};
|
||||||
|
|
||||||
using Offset = unsigned;
|
using Offset = unsigned;
|
||||||
|
@ -516,6 +572,22 @@ private:
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case ParsedRegex::LookAhead:
|
||||||
|
push_op(CompiledRegex::LookAhead);
|
||||||
|
push_string(node->children);
|
||||||
|
break;
|
||||||
|
case ParsedRegex::LookBehind:
|
||||||
|
push_op(CompiledRegex::LookBehind);
|
||||||
|
push_string(node->children, true);
|
||||||
|
break;
|
||||||
|
case ParsedRegex::NegativeLookAhead:
|
||||||
|
push_op(CompiledRegex::NegativeLookAhead);
|
||||||
|
push_string(node->children);
|
||||||
|
break;
|
||||||
|
case ParsedRegex::NegativeLookBehind:
|
||||||
|
push_op(CompiledRegex::NegativeLookBehind);
|
||||||
|
push_string(node->children, true);
|
||||||
|
break;
|
||||||
case ParsedRegex::LineStart:
|
case ParsedRegex::LineStart:
|
||||||
push_op(CompiledRegex::LineStart);
|
push_op(CompiledRegex::LineStart);
|
||||||
break;
|
break;
|
||||||
|
@ -631,6 +703,20 @@ private:
|
||||||
utf8::dump(std::back_inserter(m_program.bytecode), cp);
|
utf8::dump(std::back_inserter(m_program.bytecode), cp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void push_string(const Vector<ParsedRegex::AstNodePtr>& codepoints, bool reversed = false)
|
||||||
|
{
|
||||||
|
if (codepoints.size() > 127)
|
||||||
|
throw runtime_error{"Too long literal string"};
|
||||||
|
|
||||||
|
push_byte(codepoints.size());
|
||||||
|
if (reversed)
|
||||||
|
for (auto& cp : codepoints | reverse())
|
||||||
|
push_codepoint(cp->value);
|
||||||
|
else
|
||||||
|
for (auto& cp : codepoints)
|
||||||
|
push_codepoint(cp->value);
|
||||||
|
}
|
||||||
|
|
||||||
CompiledRegex m_program;
|
CompiledRegex m_program;
|
||||||
const ParsedRegex& m_parsed_regex;
|
const ParsedRegex& m_parsed_regex;
|
||||||
};
|
};
|
||||||
|
@ -687,6 +773,27 @@ void dump_regex(const CompiledRegex& program)
|
||||||
case CompiledRegex::SubjectEnd:
|
case CompiledRegex::SubjectEnd:
|
||||||
printf("subject end\n");
|
printf("subject end\n");
|
||||||
break;
|
break;
|
||||||
|
case CompiledRegex::LookAhead:
|
||||||
|
case CompiledRegex::NegativeLookAhead:
|
||||||
|
case CompiledRegex::LookBehind:
|
||||||
|
case CompiledRegex::NegativeLookBehind:
|
||||||
|
{
|
||||||
|
int count = *pos++;
|
||||||
|
StringView str{pos, pos + count};
|
||||||
|
const char* name = nullptr;
|
||||||
|
if (op == CompiledRegex::LookAhead)
|
||||||
|
name = "look ahead";
|
||||||
|
if (op == CompiledRegex::NegativeLookAhead)
|
||||||
|
name = "negative look ahead";
|
||||||
|
if (op == CompiledRegex::LookBehind)
|
||||||
|
name = "look behind";
|
||||||
|
if (op == CompiledRegex::NegativeLookBehind)
|
||||||
|
name = "negative look behind";
|
||||||
|
|
||||||
|
printf("%s (%s)\n", name, (const char*)str.zstr());
|
||||||
|
pos += count;
|
||||||
|
break;
|
||||||
|
}
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
printf("match\n");
|
printf("match\n");
|
||||||
}
|
}
|
||||||
|
@ -783,6 +890,32 @@ struct ThreadedRegexVM
|
||||||
if (m_pos != m_end)
|
if (m_pos != m_end)
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
|
case CompiledRegex::LookAhead:
|
||||||
|
case CompiledRegex::NegativeLookAhead:
|
||||||
|
{
|
||||||
|
int count = *thread.inst++;
|
||||||
|
for (auto it = m_pos; count and it != m_end; ++it, --count)
|
||||||
|
if (*it != utf8::read(thread.inst))
|
||||||
|
break;
|
||||||
|
if ((op == CompiledRegex::LookAhead and count != 0) or
|
||||||
|
(op == CompiledRegex::NegativeLookAhead and count == 0))
|
||||||
|
return StepResult::Failed;
|
||||||
|
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case CompiledRegex::LookBehind:
|
||||||
|
case CompiledRegex::NegativeLookBehind:
|
||||||
|
{
|
||||||
|
int count = *thread.inst++;
|
||||||
|
for (auto it = m_pos-1; count and it >= m_begin; --it, --count)
|
||||||
|
if (*it != utf8::read(thread.inst))
|
||||||
|
break;
|
||||||
|
if ((op == CompiledRegex::LookBehind and count != 0) or
|
||||||
|
(op == CompiledRegex::NegativeLookBehind and count == 0))
|
||||||
|
return StepResult::Failed;
|
||||||
|
thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
|
||||||
|
break;
|
||||||
|
}
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
thread.inst = nullptr;
|
thread.inst = nullptr;
|
||||||
return StepResult::Matched;
|
return StepResult::Matched;
|
||||||
|
@ -823,7 +956,7 @@ struct ThreadedRegexVM
|
||||||
m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
|
m_threads.erase(std::remove_if(m_threads.begin(), m_threads.end(),
|
||||||
[](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
|
[](const Thread& t) { return t.inst == nullptr; }), m_threads.end());
|
||||||
if (m_threads.empty())
|
if (m_threads.empty())
|
||||||
return false;
|
return found_match;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step remaining threads to see if they match without consuming anything else
|
// Step remaining threads to see if they match without consuming anything else
|
||||||
|
@ -1034,6 +1167,30 @@ auto test_regex = UnitTest{[]{
|
||||||
kak_assert(vm.exec("foooo", true, true));
|
kak_assert(vm.exec("foooo", true, true));
|
||||||
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "fo");
|
kak_assert(StringView{vm.m_captures[2], vm.m_captures[3]} == "fo");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM vm{R"((?=foo).)"};
|
||||||
|
kak_assert(vm.exec("barfoo", false, true));
|
||||||
|
kak_assert(StringView{vm.m_captures[0], vm.m_captures[1]} == "f");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM vm{R"((?!foo)...)"};
|
||||||
|
kak_assert(not vm.exec("foo"));
|
||||||
|
kak_assert(vm.exec("qux"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM vm{R"(...(?<=foo))"};
|
||||||
|
kak_assert(vm.exec("foo"));
|
||||||
|
kak_assert(not vm.exec("qux"));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM vm{R"(...(?<!foo))"};
|
||||||
|
kak_assert(not vm.exec("foo"));
|
||||||
|
kak_assert(vm.exec("qux"));
|
||||||
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user