Regex: Support any char and character classes in lookarounds

Lookarounds must still be fixed-size (no quantifiers), but they now accept
any-char and character classes in addition to plain literals.
This commit is contained in:
Maxime Coste 2017-10-09 11:12:42 +08:00
parent b8cb65160a
commit cca730193c
2 changed files with 46 additions and 23 deletions

View File

@ -474,8 +474,13 @@ private:
void validate_lookaround(const AstNodePtr& node) void validate_lookaround(const AstNodePtr& node)
{ {
for (auto& child : node->children) for (auto& child : node->children)
if (child->op != ParsedRegex::Literal) {
parse_error("Lookaround can only contain literals"); if (child->op != ParsedRegex::Literal and child->op != ParsedRegex::Matcher and
child->op != ParsedRegex::AnyChar)
parse_error("Lookaround can only contain literals, any chars or character classes");
if (child->quantifier.type != ParsedRegex::Quantifier::One)
parse_error("Quantifiers cannot be used in lookarounds");
}
} }
ParsedRegex m_parsed_regex; ParsedRegex m_parsed_regex;
@ -679,15 +684,27 @@ private:
return res; return res;
} }
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& literals, bool reversed = false) uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters, bool reversed = false)
{ {
uint32_t res = m_program.lookarounds.size(); uint32_t res = m_program.lookarounds.size();
auto write_lookaround = [this](auto&& characters) {
for (auto& character : characters)
{
if (character->op == ParsedRegex::Literal)
m_program.lookarounds.push_back(character->value);
else if (character->op == ParsedRegex::AnyChar)
m_program.lookarounds.push_back(0xF000);
else if (character->op == ParsedRegex::Matcher)
m_program.lookarounds.push_back(0xF0001 + character->value);
else
kak_assert(false);
}
};
if (reversed) if (reversed)
for (auto& literal : literals | reverse()) write_lookaround(characters | reverse());
m_program.lookarounds.push_back(literal->value);
else else
for (auto& literal : literals) write_lookaround(characters);
m_program.lookarounds.push_back(literal->value);
m_program.lookarounds.push_back((Codepoint)-1); m_program.lookarounds.push_back((Codepoint)-1);
return res; return res;

View File

@ -294,28 +294,14 @@ private:
break; break;
case CompiledRegex::LookAhead: case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead: case CompiledRegex::NegativeLookAhead:
{ if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead))
auto ref = m_program.lookarounds.begin() + inst.param;
for (auto it = pos; *ref != -1 and it != m_end; ++it, ++ref)
if (*it != *ref)
break;
if ((inst.op == CompiledRegex::LookAhead and *ref != -1) or
(inst.op == CompiledRegex::NegativeLookAhead and *ref == -1))
return StepResult::Failed; return StepResult::Failed;
break; break;
}
case CompiledRegex::LookBehind: case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind: case CompiledRegex::NegativeLookBehind:
{ if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind))
auto ref = m_program.lookarounds.begin() + inst.param;
for (auto it = pos; *ref != -1 and it > m_begin; --it, ++ref)
if (*(it-1) != *ref)
break;
if ((inst.op == CompiledRegex::LookBehind and *ref != -1) or
(inst.op == CompiledRegex::NegativeLookBehind and *ref == -1))
return StepResult::Failed; return StepResult::Failed;
break; break;
}
case CompiledRegex::Match: case CompiledRegex::Match:
return StepResult::Matched; return StepResult::Matched;
} }
@ -392,6 +378,26 @@ private:
++start; ++start;
} }
// Checks whether the fixed-size lookaround sequence stored at `index` in
// m_program.lookarounds matches the text around `pos`.
//
// The sequence is walked until its -1 terminator. With MatchDirection::Forward
// each entry is tested against the codepoint at `pos` and `pos` is advanced;
// with MatchDirection::Backward it is tested against the codepoint just before
// `pos` and `pos` is moved back. An entry is one of:
//   - 0xF000: matches any codepoint,
//   - a value in (0xF0000, 0xFFFFD]: 0xF0001 + index into m_program.matchers,
//   - anything else: a literal codepoint, compared for equality.
//
// Returns false if the subject boundary (m_end going forward, m_begin going
// backward) is reached before the sequence ends, or if any entry fails to
// match; returns true once the whole sequence has matched.
template<MatchDirection look_direction>
bool lookaround(uint32_t index, Utf8It pos) const
{
    for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
    {
        if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
            return false;
        auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
        if (ref == 0xF000)
        {} // any character matches
        else if (ref > 0xF0000 and ref <= 0xFFFFD)
        {
            // Matcher entries must be fully handled in this branch: letting a
            // successful class match fall through to the literal comparison
            // below would compare cp against the encoded marker value and
            // wrongly reject it.
            if (not m_program.matchers[ref - 0xF0001](cp))
                return false;
        }
        else if (ref != cp)
            return false;
        (look_direction == MatchDirection::Forward) ? ++pos : --pos;
    }
    return true;
}
bool is_line_start(const Utf8It& pos) const bool is_line_start(const Utf8It& pos) const
{ {
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin) if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)