Regex: Support any char and character classes in lookarounds
Lookarounds still need to be fixed size, but they now accept the any-char wildcard and character classes as well as plain literals.
This commit is contained in:
parent
b8cb65160a
commit
cca730193c
|
@ -474,8 +474,13 @@ private:
|
|||
void validate_lookaround(const AstNodePtr& node)
|
||||
{
|
||||
for (auto& child : node->children)
|
||||
if (child->op != ParsedRegex::Literal)
|
||||
parse_error("Lookaround can only contain literals");
|
||||
{
|
||||
if (child->op != ParsedRegex::Literal and child->op != ParsedRegex::Matcher and
|
||||
child->op != ParsedRegex::AnyChar)
|
||||
parse_error("Lookaround can only contain literals, any chars or character classes");
|
||||
if (child->quantifier.type != ParsedRegex::Quantifier::One)
|
||||
parse_error("Quantifiers cannot be used in lookarounds");
|
||||
}
|
||||
}
|
||||
|
||||
ParsedRegex m_parsed_regex;
|
||||
|
@ -679,15 +684,27 @@ private:
|
|||
return res;
|
||||
}
|
||||
|
||||
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& literals, bool reversed = false)
|
||||
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters, bool reversed = false)
|
||||
{
|
||||
uint32_t res = m_program.lookarounds.size();
|
||||
auto write_lookaround = [this](auto&& characters) {
|
||||
for (auto& character : characters)
|
||||
{
|
||||
if (character->op == ParsedRegex::Literal)
|
||||
m_program.lookarounds.push_back(character->value);
|
||||
else if (character->op == ParsedRegex::AnyChar)
|
||||
m_program.lookarounds.push_back(0xF000);
|
||||
else if (character->op == ParsedRegex::Matcher)
|
||||
m_program.lookarounds.push_back(0xF0001 + character->value);
|
||||
else
|
||||
kak_assert(false);
|
||||
}
|
||||
};
|
||||
|
||||
if (reversed)
|
||||
for (auto& literal : literals | reverse())
|
||||
m_program.lookarounds.push_back(literal->value);
|
||||
write_lookaround(characters | reverse());
|
||||
else
|
||||
for (auto& literal : literals)
|
||||
m_program.lookarounds.push_back(literal->value);
|
||||
write_lookaround(characters);
|
||||
|
||||
m_program.lookarounds.push_back((Codepoint)-1);
|
||||
return res;
|
||||
|
|
|
@ -294,28 +294,14 @@ private:
|
|||
break;
|
||||
case CompiledRegex::LookAhead:
|
||||
case CompiledRegex::NegativeLookAhead:
|
||||
{
|
||||
auto ref = m_program.lookarounds.begin() + inst.param;
|
||||
for (auto it = pos; *ref != -1 and it != m_end; ++it, ++ref)
|
||||
if (*it != *ref)
|
||||
break;
|
||||
if ((inst.op == CompiledRegex::LookAhead and *ref != -1) or
|
||||
(inst.op == CompiledRegex::NegativeLookAhead and *ref == -1))
|
||||
if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead))
|
||||
return StepResult::Failed;
|
||||
break;
|
||||
}
|
||||
case CompiledRegex::LookBehind:
|
||||
case CompiledRegex::NegativeLookBehind:
|
||||
{
|
||||
auto ref = m_program.lookarounds.begin() + inst.param;
|
||||
for (auto it = pos; *ref != -1 and it > m_begin; --it, ++ref)
|
||||
if (*(it-1) != *ref)
|
||||
break;
|
||||
if ((inst.op == CompiledRegex::LookBehind and *ref != -1) or
|
||||
(inst.op == CompiledRegex::NegativeLookBehind and *ref == -1))
|
||||
if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind))
|
||||
return StepResult::Failed;
|
||||
break;
|
||||
}
|
||||
case CompiledRegex::Match:
|
||||
return StepResult::Matched;
|
||||
}
|
||||
|
@ -392,6 +378,26 @@ private:
|
|||
++start;
|
||||
}
|
||||
|
||||
template<MatchDirection look_direction>
|
||||
bool lookaround(uint32_t index, Utf8It pos) const
|
||||
{
|
||||
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
|
||||
{
|
||||
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
|
||||
return false;
|
||||
auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
|
||||
if (ref == 0xF000)
|
||||
{} // any character matches
|
||||
else if (ref > 0xF0000 and ref <= 0xFFFFD and not m_program.matchers[ref - 0xF0001](cp))
|
||||
return false;
|
||||
else if (ref != cp)
|
||||
return false;
|
||||
|
||||
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_line_start(const Utf8It& pos) const
|
||||
{
|
||||
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)
|
||||
|
|
Loading…
Reference in New Issue
Block a user