Clean up regex lookaround implementation and reject incompatible regexes
Fixes #2487
This commit is contained in:
parent
9024d41d64
commit
d652ec9ce1
|
@ -8,6 +8,7 @@
|
||||||
#include "utf8_iterator.hh"
|
#include "utf8_iterator.hh"
|
||||||
#include "string_utils.hh"
|
#include "string_utils.hh"
|
||||||
#include "vector.hh"
|
#include "vector.hh"
|
||||||
|
#include "utils.hh"
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
|
@ -602,12 +603,17 @@ private:
|
||||||
|
|
||||||
void validate_lookaround(NodeIndex index)
|
void validate_lookaround(NodeIndex index)
|
||||||
{
|
{
|
||||||
|
using Lookaround = CompiledRegex::Lookaround;
|
||||||
ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) {
|
ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) {
|
||||||
auto& child = get_node(child_index);
|
auto& child = get_node(child_index);
|
||||||
if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and
|
if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and
|
||||||
child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and
|
child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and
|
||||||
child.op != ParsedRegex::AnyCharExceptNewLine)
|
child.op != ParsedRegex::AnyCharExceptNewLine)
|
||||||
parse_error("Lookaround can only contain literals, any chars or character classes");
|
parse_error("Lookaround can only contain literals, any chars or character classes");
|
||||||
|
if (child.op == ParsedRegex::Literal and
|
||||||
|
to_underlying(Lookaround::OpBegin) <= child.value and
|
||||||
|
child.value < to_underlying(Lookaround::OpEnd))
|
||||||
|
parse_error("Lookaround does not support literals codepoint between 0xF0000 and 0xFFFFD");
|
||||||
if (child.quantifier.type != ParsedRegex::Quantifier::One)
|
if (child.quantifier.type != ParsedRegex::Quantifier::One)
|
||||||
parse_error("Quantifiers cannot be used in lookarounds");
|
parse_error("Quantifiers cannot be used in lookarounds");
|
||||||
return true;
|
return true;
|
||||||
|
@ -877,20 +883,22 @@ private:
|
||||||
template<MatchDirection direction>
|
template<MatchDirection direction>
|
||||||
uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case)
|
uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case)
|
||||||
{
|
{
|
||||||
|
using Lookaround = CompiledRegex::Lookaround;
|
||||||
|
|
||||||
const uint32_t res = m_program.lookarounds.size();
|
const uint32_t res = m_program.lookarounds.size();
|
||||||
auto write_matcher = [this, ignore_case](ParsedRegex::NodeIndex child) {
|
auto write_matcher = [this, ignore_case](ParsedRegex::NodeIndex child) {
|
||||||
auto& character = get_node(child);
|
auto& character = get_node(child);
|
||||||
if (character.op == ParsedRegex::Literal)
|
if (character.op == ParsedRegex::Literal)
|
||||||
m_program.lookarounds.push_back(ignore_case ? to_lower(character.value)
|
m_program.lookarounds.push_back(
|
||||||
: character.value);
|
static_cast<Lookaround>(ignore_case ? to_lower(character.value) : character.value));
|
||||||
else if (character.op == ParsedRegex::AnyChar)
|
else if (character.op == ParsedRegex::AnyChar)
|
||||||
m_program.lookarounds.push_back(0xF000);
|
m_program.lookarounds.push_back(Lookaround::AnyChar);
|
||||||
else if (character.op == ParsedRegex::AnyCharExceptNewLine)
|
else if (character.op == ParsedRegex::AnyCharExceptNewLine)
|
||||||
m_program.lookarounds.push_back(0xF001);
|
m_program.lookarounds.push_back(Lookaround::AnyCharExceptNewLine);
|
||||||
else if (character.op == ParsedRegex::Class)
|
else if (character.op == ParsedRegex::Class)
|
||||||
m_program.lookarounds.push_back(0xF0001 + character.value);
|
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterClass) + character.value));
|
||||||
else if (character.op == ParsedRegex::CharacterType)
|
else if (character.op == ParsedRegex::CharacterType)
|
||||||
m_program.lookarounds.push_back(0xF8000 | character.value);
|
m_program.lookarounds.push_back(static_cast<Lookaround>(to_underlying(Lookaround::CharacterType) | character.value));
|
||||||
else
|
else
|
||||||
kak_assert(false);
|
kak_assert(false);
|
||||||
return true;
|
return true;
|
||||||
|
@ -898,7 +906,7 @@ private:
|
||||||
|
|
||||||
ForEachChild<direction>::apply(m_parsed_regex, index, write_matcher);
|
ForEachChild<direction>::apply(m_parsed_regex, index, write_matcher);
|
||||||
|
|
||||||
m_program.lookarounds.push_back((Codepoint)-1);
|
m_program.lookarounds.push_back(Lookaround::EndOfLookaround);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1121,8 +1129,9 @@ String dump_regex(const CompiledRegex& program)
|
||||||
name = "negative look behind (ignore case)";
|
name = "negative look behind (ignore case)";
|
||||||
|
|
||||||
String str;
|
String str;
|
||||||
for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
|
for (auto it = program.lookarounds.begin() + inst.param;
|
||||||
utf8::dump(std::back_inserter(str), *it);
|
*it != CompiledRegex::Lookaround::EndOfLookaround; ++it)
|
||||||
|
utf8::dump(std::back_inserter(str), to_underlying(*it));
|
||||||
res += format("{} ({})\n", name, str);
|
res += format("{} ({})\n", name, str);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
#include "utf8.hh"
|
#include "utf8.hh"
|
||||||
#include "utf8_iterator.hh"
|
#include "utf8_iterator.hh"
|
||||||
#include "vector.hh"
|
#include "vector.hh"
|
||||||
|
#include "utils.hh"
|
||||||
|
|
||||||
namespace Kakoune
|
namespace Kakoune
|
||||||
{
|
{
|
||||||
|
@ -82,6 +83,17 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
NegativeLookBehind_IgnoreCase,
|
NegativeLookBehind_IgnoreCase,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum class Lookaround : Codepoint // Element type of `lookarounds`: a literal codepoint cast to this enum, or one of the opcodes below.
|
||||||
|
{
|
||||||
|
OpBegin = 0xF0000, // Inclusive lower bound of the opcode range; validate_lookaround rejects literals in [OpBegin, OpEnd).
|
||||||
|
AnyChar = 0xF0000, // Matches any codepoint (same value as OpBegin).
|
||||||
|
AnyCharExceptNewLine = 0xF0001, // Matches any codepoint except '\n'.
|
||||||
|
CharacterClass = 0xF0002, // Base value: the index into character_classes is added to it.
|
||||||
|
CharacterType = 0xF8000, // Base value: the CharacterType flags are or-ed in and read back with `& 0xFF`.
|
||||||
|
OpEnd = 0xFFFFF, // Exclusive upper bound of the opcode range.
|
||||||
|
EndOfLookaround = static_cast<Codepoint>(-1) // Sentinel that terminates each lookaround sequence.
|
||||||
|
};
|
||||||
|
|
||||||
struct Instruction
|
struct Instruction
|
||||||
{
|
{
|
||||||
Op op;
|
Op op;
|
||||||
|
@ -98,7 +110,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||||
|
|
||||||
Vector<Instruction, MemoryDomain::Regex> instructions;
|
Vector<Instruction, MemoryDomain::Regex> instructions;
|
||||||
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
|
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
|
||||||
Vector<Codepoint, MemoryDomain::Regex> lookarounds;
|
Vector<Lookaround, MemoryDomain::Regex> lookarounds;
|
||||||
uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
|
uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
|
||||||
uint32_t save_count;
|
uint32_t save_count;
|
||||||
|
|
||||||
|
@ -522,8 +534,10 @@ private:
|
||||||
template<MatchDirection look_direction, bool ignore_case>
|
template<MatchDirection look_direction, bool ignore_case>
|
||||||
bool lookaround(uint32_t index, EffectiveIt pos, const ExecConfig& config) const
|
bool lookaround(uint32_t index, EffectiveIt pos, const ExecConfig& config) const
|
||||||
{
|
{
|
||||||
|
using Lookaround = CompiledRegex::Lookaround;
|
||||||
|
|
||||||
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
|
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
|
||||||
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
|
for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it)
|
||||||
{
|
{
|
||||||
if (pos == end)
|
if (pos == end)
|
||||||
return false;
|
return false;
|
||||||
|
@ -531,25 +545,27 @@ private:
|
||||||
if (ignore_case)
|
if (ignore_case)
|
||||||
cp = to_lower(cp);
|
cp = to_lower(cp);
|
||||||
|
|
||||||
const Codepoint ref = *it;
|
const Lookaround op = *it;
|
||||||
if (ref == 0xF000)
|
if (op == Lookaround::AnyChar)
|
||||||
{} // any character matches
|
{} // any character matches
|
||||||
else if (ref == 0xF001)
|
else if (op == Lookaround::AnyCharExceptNewLine)
|
||||||
{
|
{
|
||||||
if (cp == '\n')
|
if (cp == '\n')
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (ref > 0xF0000 and ref < 0xF8000)
|
else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType)
|
||||||
{
|
{
|
||||||
if (not is_character_class(m_program.character_classes[ref - 0xF0001], cp))
|
auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass);
|
||||||
|
if (not is_character_class(m_program.character_classes[index], cp))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (ref >= 0xF8000 and ref <= 0xFFFFD)
|
else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)
|
||||||
{
|
{
|
||||||
if (not is_ctype((CharacterType)(ref & 0xFF), cp))
|
auto ctype = static_cast<CharacterType>(to_underlying(op) & 0xFF);
|
||||||
|
if (not is_ctype(ctype, cp))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (ref != cp)
|
else if (static_cast<Codepoint>(op) != cp)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
||||||
|
|
|
@ -148,6 +148,12 @@ bool skip_while_reverse(Iterator& it, const BeginIterator& begin, T condition)
|
||||||
return condition(*it);
|
return condition(*it);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename E> // E is expected to be an enumeration type (required by std::underlying_type_t).
|
||||||
|
auto to_underlying(E value) // Converts an enum value to its underlying integer type.
|
||||||
|
{
|
||||||
|
return static_cast<std::underlying_type_t<E>>(value);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // utils_hh_INCLUDED
|
#endif // utils_hh_INCLUDED
|
||||||
|
|
Loading…
Reference in New Issue
Block a user