Regex: Fix support for ignore case in lookarounds
This commit is contained in:
parent
80f6caee81
commit
5bf4be645a
|
@ -541,6 +541,7 @@ private:
|
||||||
uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node)
|
uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node)
|
||||||
{
|
{
|
||||||
const auto start_pos = m_program.instructions.size();
|
const auto start_pos = m_program.instructions.size();
|
||||||
|
const bool ignore_case = node->ignore_case;
|
||||||
|
|
||||||
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
|
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
|
||||||
if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs)))
|
if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs)))
|
||||||
|
@ -550,8 +551,8 @@ private:
|
||||||
switch (node->op)
|
switch (node->op)
|
||||||
{
|
{
|
||||||
case ParsedRegex::Literal:
|
case ParsedRegex::Literal:
|
||||||
if (node->ignore_case)
|
if (ignore_case)
|
||||||
push_inst(CompiledRegex::LiteralIgnoreCase, to_lower(node->value));
|
push_inst(CompiledRegex::Literal_IgnoreCase, to_lower(node->value));
|
||||||
else
|
else
|
||||||
push_inst(CompiledRegex::Literal, node->value);
|
push_inst(CompiledRegex::Literal, node->value);
|
||||||
break;
|
break;
|
||||||
|
@ -594,24 +595,32 @@ private:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ParsedRegex::LookAhead:
|
case ParsedRegex::LookAhead:
|
||||||
push_inst(m_forward ? CompiledRegex::LookAhead
|
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
||||||
: CompiledRegex::LookBehind,
|
: CompiledRegex::LookAhead)
|
||||||
push_lookaround(node->children, false));
|
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
||||||
|
: CompiledRegex::LookBehind),
|
||||||
|
push_lookaround(node->children, false, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookAhead:
|
case ParsedRegex::NegativeLookAhead:
|
||||||
push_inst(m_forward ? CompiledRegex::NegativeLookAhead
|
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
||||||
: CompiledRegex::NegativeLookBehind,
|
: CompiledRegex::NegativeLookAhead)
|
||||||
push_lookaround(node->children, false));
|
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
||||||
|
: CompiledRegex::NegativeLookBehind),
|
||||||
|
push_lookaround(node->children, false, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LookBehind:
|
case ParsedRegex::LookBehind:
|
||||||
push_inst(m_forward ? CompiledRegex::LookBehind
|
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
||||||
: CompiledRegex::LookAhead,
|
: CompiledRegex::LookBehind)
|
||||||
push_lookaround(node->children, true));
|
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
||||||
|
: CompiledRegex::LookAhead),
|
||||||
|
push_lookaround(node->children, true, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::NegativeLookBehind:
|
case ParsedRegex::NegativeLookBehind:
|
||||||
push_inst(m_forward ? CompiledRegex::NegativeLookBehind
|
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
||||||
: CompiledRegex::NegativeLookAhead,
|
: CompiledRegex::NegativeLookBehind)
|
||||||
push_lookaround(node->children, true));
|
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
||||||
|
: CompiledRegex::NegativeLookAhead),
|
||||||
|
push_lookaround(node->children, true, ignore_case));
|
||||||
break;
|
break;
|
||||||
case ParsedRegex::LineStart:
|
case ParsedRegex::LineStart:
|
||||||
push_inst(m_forward ? CompiledRegex::LineStart
|
push_inst(m_forward ? CompiledRegex::LineStart
|
||||||
|
@ -698,14 +707,16 @@ private:
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters, bool reversed = false)
|
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters,
|
||||||
|
bool reversed, bool ignore_case)
|
||||||
{
|
{
|
||||||
uint32_t res = m_program.lookarounds.size();
|
uint32_t res = m_program.lookarounds.size();
|
||||||
auto write_lookaround = [this](auto&& characters) {
|
auto write_lookaround = [this, ignore_case](auto&& characters) {
|
||||||
for (auto& character : characters)
|
for (auto& character : characters)
|
||||||
{
|
{
|
||||||
if (character->op == ParsedRegex::Literal)
|
if (character->op == ParsedRegex::Literal)
|
||||||
m_program.lookarounds.push_back(character->value);
|
m_program.lookarounds.push_back(ignore_case ? to_lower(character->value)
|
||||||
|
: character->value);
|
||||||
else if (character->op == ParsedRegex::AnyChar)
|
else if (character->op == ParsedRegex::AnyChar)
|
||||||
m_program.lookarounds.push_back(0xF000);
|
m_program.lookarounds.push_back(0xF000);
|
||||||
else if (character->op == ParsedRegex::Matcher)
|
else if (character->op == ParsedRegex::Matcher)
|
||||||
|
@ -841,7 +852,7 @@ void dump_regex(const CompiledRegex& program)
|
||||||
case CompiledRegex::Literal:
|
case CompiledRegex::Literal:
|
||||||
printf("literal %lc\n", inst.param);
|
printf("literal %lc\n", inst.param);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LiteralIgnoreCase:
|
case CompiledRegex::Literal_IgnoreCase:
|
||||||
printf("literal (ignore case) %lc\n", inst.param);
|
printf("literal (ignore case) %lc\n", inst.param);
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::AnyChar:
|
case CompiledRegex::AnyChar:
|
||||||
|
@ -886,6 +897,10 @@ void dump_regex(const CompiledRegex& program)
|
||||||
case CompiledRegex::NegativeLookAhead:
|
case CompiledRegex::NegativeLookAhead:
|
||||||
case CompiledRegex::LookBehind:
|
case CompiledRegex::LookBehind:
|
||||||
case CompiledRegex::NegativeLookBehind:
|
case CompiledRegex::NegativeLookBehind:
|
||||||
|
case CompiledRegex::LookAhead_IgnoreCase:
|
||||||
|
case CompiledRegex::NegativeLookAhead_IgnoreCase:
|
||||||
|
case CompiledRegex::LookBehind_IgnoreCase:
|
||||||
|
case CompiledRegex::NegativeLookBehind_IgnoreCase:
|
||||||
{
|
{
|
||||||
const char* name = nullptr;
|
const char* name = nullptr;
|
||||||
if (inst.op == CompiledRegex::LookAhead)
|
if (inst.op == CompiledRegex::LookAhead)
|
||||||
|
@ -897,6 +912,15 @@ void dump_regex(const CompiledRegex& program)
|
||||||
if (inst.op == CompiledRegex::NegativeLookBehind)
|
if (inst.op == CompiledRegex::NegativeLookBehind)
|
||||||
name = "negative look behind";
|
name = "negative look behind";
|
||||||
|
|
||||||
|
if (inst.op == CompiledRegex::LookAhead_IgnoreCase)
|
||||||
|
name = "look ahead (ignore case)";
|
||||||
|
if (inst.op == CompiledRegex::NegativeLookAhead_IgnoreCase)
|
||||||
|
name = "negative look ahead (ignore case)";
|
||||||
|
if (inst.op == CompiledRegex::LookBehind_IgnoreCase)
|
||||||
|
name = "look behind (ignore case)";
|
||||||
|
if (inst.op == CompiledRegex::NegativeLookBehind_IgnoreCase)
|
||||||
|
name = "negative look behind (ignore case)";
|
||||||
|
|
||||||
String str;
|
String str;
|
||||||
for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
|
for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
|
||||||
utf8::dump(std::back_inserter(str), *it);
|
utf8::dump(std::back_inserter(str), *it);
|
||||||
|
@ -1183,6 +1207,12 @@ auto test_regex = UnitTest{[]{
|
||||||
TestVM<> vm{R"((?=))"};
|
TestVM<> vm{R"((?=))"};
|
||||||
kak_assert(vm.exec(""));
|
kak_assert(vm.exec(""));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<> vm{R"((?i)(?=Foo))"};
|
||||||
|
kak_assert(vm.exec("fOO", RegexExecFlags::Search));
|
||||||
|
kak_assert(*vm.captures()[0] == 'f');
|
||||||
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,7 @@ struct CompiledRegex : RefCountable
|
||||||
{
|
{
|
||||||
Match,
|
Match,
|
||||||
Literal,
|
Literal,
|
||||||
LiteralIgnoreCase,
|
Literal_IgnoreCase,
|
||||||
AnyChar,
|
AnyChar,
|
||||||
Matcher,
|
Matcher,
|
||||||
Jump,
|
Jump,
|
||||||
|
@ -46,6 +46,10 @@ struct CompiledRegex : RefCountable
|
||||||
NegativeLookAhead,
|
NegativeLookAhead,
|
||||||
LookBehind,
|
LookBehind,
|
||||||
NegativeLookBehind,
|
NegativeLookBehind,
|
||||||
|
LookAhead_IgnoreCase,
|
||||||
|
NegativeLookAhead_IgnoreCase,
|
||||||
|
LookBehind_IgnoreCase,
|
||||||
|
NegativeLookBehind_IgnoreCase,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Instruction
|
struct Instruction
|
||||||
|
@ -240,7 +244,7 @@ private:
|
||||||
if (pos != m_end and inst.param == *pos)
|
if (pos != m_end and inst.param == *pos)
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
case CompiledRegex::LiteralIgnoreCase:
|
case CompiledRegex::Literal_IgnoreCase:
|
||||||
if (pos != m_end and inst.param == to_lower(*pos))
|
if (pos != m_end and inst.param == to_lower(*pos))
|
||||||
return StepResult::Consumed;
|
return StepResult::Consumed;
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
|
@ -307,12 +311,26 @@ private:
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LookAhead:
|
case CompiledRegex::LookAhead:
|
||||||
case CompiledRegex::NegativeLookAhead:
|
case CompiledRegex::NegativeLookAhead:
|
||||||
if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead))
|
if (lookaround<MatchDirection::Forward, false>(inst.param, pos) !=
|
||||||
|
(inst.op == CompiledRegex::LookAhead))
|
||||||
|
return StepResult::Failed;
|
||||||
|
break;
|
||||||
|
case CompiledRegex::LookAhead_IgnoreCase:
|
||||||
|
case CompiledRegex::NegativeLookAhead_IgnoreCase:
|
||||||
|
if (lookaround<MatchDirection::Forward, true>(inst.param, pos) !=
|
||||||
|
(inst.op == CompiledRegex::LookAhead_IgnoreCase))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::LookBehind:
|
case CompiledRegex::LookBehind:
|
||||||
case CompiledRegex::NegativeLookBehind:
|
case CompiledRegex::NegativeLookBehind:
|
||||||
if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind))
|
if (lookaround<MatchDirection::Backward, false>(inst.param, pos) !=
|
||||||
|
(inst.op == CompiledRegex::LookBehind))
|
||||||
|
return StepResult::Failed;
|
||||||
|
break;
|
||||||
|
case CompiledRegex::LookBehind_IgnoreCase:
|
||||||
|
case CompiledRegex::NegativeLookBehind_IgnoreCase:
|
||||||
|
if (lookaround<MatchDirection::Backward, true>(inst.param, pos) !=
|
||||||
|
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
|
||||||
return StepResult::Failed;
|
return StepResult::Failed;
|
||||||
break;
|
break;
|
||||||
case CompiledRegex::Match:
|
case CompiledRegex::Match:
|
||||||
|
@ -391,7 +409,7 @@ private:
|
||||||
++start;
|
++start;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<MatchDirection look_direction>
|
template<MatchDirection look_direction, bool ignore_case>
|
||||||
bool lookaround(uint32_t index, Utf8It pos) const
|
bool lookaround(uint32_t index, Utf8It pos) const
|
||||||
{
|
{
|
||||||
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
|
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
|
||||||
|
@ -399,6 +417,9 @@ private:
|
||||||
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
|
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
|
||||||
return false;
|
return false;
|
||||||
auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
|
auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
|
||||||
|
if (ignore_case)
|
||||||
|
cp = to_lower(cp);
|
||||||
|
|
||||||
if (ref == 0xF000)
|
if (ref == 0xF000)
|
||||||
{} // any character matches
|
{} // any character matches
|
||||||
else if (ref > 0xF0000 and ref <= 0xFFFFD)
|
else if (ref > 0xF0000 and ref <= 0xFFFFD)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user