Regex: Support any char and character classes in lookarounds
Lookarounds must still be fixed-size, but they now accept the any-char wildcard and character classes in addition to plain literals.
This commit is contained in:
parent
b8cb65160a
commit
cca730193c
|
@ -474,8 +474,13 @@ private:
|
||||||
void validate_lookaround(const AstNodePtr& node)
|
void validate_lookaround(const AstNodePtr& node)
|
||||||
{
|
{
|
||||||
for (auto& child : node->children)
|
for (auto& child : node->children)
|
||||||
if (child->op != ParsedRegex::Literal)
|
{
|
||||||
parse_error("Lookaround can only contain literals");
|
if (child->op != ParsedRegex::Literal and child->op != ParsedRegex::Matcher and
|
||||||
|
child->op != ParsedRegex::AnyChar)
|
||||||
|
parse_error("Lookaround can only contain literals, any chars or character classes");
|
||||||
|
if (child->quantifier.type != ParsedRegex::Quantifier::One)
|
||||||
|
parse_error("Quantifiers cannot be used in lookarounds");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ParsedRegex m_parsed_regex;
|
ParsedRegex m_parsed_regex;
|
||||||
|
@ -679,15 +684,27 @@ private:
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& literals, bool reversed = false)
|
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters, bool reversed = false)
|
||||||
{
|
{
|
||||||
uint32_t res = m_program.lookarounds.size();
|
uint32_t res = m_program.lookarounds.size();
|
||||||
if (reversed)
|
auto write_lookaround = [this](auto&& characters) {
|
||||||
for (auto& literal : literals | reverse())
|
for (auto& character : characters)
|
||||||
m_program.lookarounds.push_back(literal->value);
|
{
|
||||||
|
if (character->op == ParsedRegex::Literal)
|
||||||
|
m_program.lookarounds.push_back(character->value);
|
||||||
|
else if (character->op == ParsedRegex::AnyChar)
|
||||||
|
m_program.lookarounds.push_back(0xF000);
|
||||||
|
else if (character->op == ParsedRegex::Matcher)
|
||||||
|
m_program.lookarounds.push_back(0xF0001 + character->value);
|
||||||
else
|
else
|
||||||
for (auto& literal : literals)
|
kak_assert(false);
|
||||||
m_program.lookarounds.push_back(literal->value);
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (reversed)
|
||||||
|
write_lookaround(characters | reverse());
|
||||||
|
else
|
||||||
|
write_lookaround(characters);
|
||||||
|
|
||||||
m_program.lookarounds.push_back((Codepoint)-1);
|
m_program.lookarounds.push_back((Codepoint)-1);
|
||||||
return res;
|
return res;
|
||||||
|
|
|
@ -294,28 +294,14 @@ private:
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LookAhead:
|
case CompiledRegex::LookAhead:
|
||||||
case CompiledRegex::NegativeLookAhead:
|
case CompiledRegex::NegativeLookAhead:
|
||||||
{
|
if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead))
|
||||||
auto ref = m_program.lookarounds.begin() + inst.param;
|
|
||||||
for (auto it = pos; *ref != -1 and it != m_end; ++it, ++ref)
|
|
||||||
if (*it != *ref)
|
|
||||||
break;
|
|
||||||
if ((inst.op == CompiledRegex::LookAhead and *ref != -1) or
|
|
||||||
(inst.op == CompiledRegex::NegativeLookAhead and *ref == -1))
|
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
case CompiledRegex::LookBehind:
|
case CompiledRegex::LookBehind:
|
||||||
case CompiledRegex::NegativeLookBehind:
|
case CompiledRegex::NegativeLookBehind:
|
||||||
{
|
if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind))
|
||||||
auto ref = m_program.lookarounds.begin() + inst.param;
|
|
||||||
for (auto it = pos; *ref != -1 and it > m_begin; --it, ++ref)
|
|
||||||
if (*(it-1) != *ref)
|
|
||||||
break;
|
|
||||||
if ((inst.op == CompiledRegex::LookBehind and *ref != -1) or
|
|
||||||
(inst.op == CompiledRegex::NegativeLookBehind and *ref == -1))
|
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
return StepResult::Matched;
|
return StepResult::Matched;
|
||||||
}
|
}
|
||||||
|
@ -392,6 +378,26 @@ private:
|
||||||
++start;
|
++start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<MatchDirection look_direction>
|
||||||
|
bool lookaround(uint32_t index, Utf8It pos) const
|
||||||
|
{
|
||||||
|
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
|
||||||
|
{
|
||||||
|
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
|
||||||
|
return false;
|
||||||
|
auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
|
||||||
|
if (ref == 0xF000)
|
||||||
|
{} // any character matches
|
||||||
|
else if (ref > 0xF0000 and ref <= 0xFFFFD and not m_program.matchers[ref - 0xF0001](cp))
|
||||||
|
return false;
|
||||||
|
else if (ref != cp)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool is_line_start(const Utf8It& pos) const
|
bool is_line_start(const Utf8It& pos) const
|
||||||
{
|
{
|
||||||
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)
|
if (not (m_flags & RegexExecFlags::PrevAvailable) and pos == m_begin)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user