Regex: Fix support for ignore case in lookarounds

This commit is contained in:
Maxime Coste 2017-10-10 11:21:21 +08:00
parent 80f6caee81
commit 5bf4be645a
2 changed files with 74 additions and 23 deletions

View File

@ -541,6 +541,7 @@ private:
uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node)
{
const auto start_pos = m_program.instructions.size();
const bool ignore_case = node->ignore_case;
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs)))
@ -550,8 +551,8 @@ private:
switch (node->op)
{
case ParsedRegex::Literal:
if (node->ignore_case)
push_inst(CompiledRegex::LiteralIgnoreCase, to_lower(node->value));
if (ignore_case)
push_inst(CompiledRegex::Literal_IgnoreCase, to_lower(node->value));
else
push_inst(CompiledRegex::Literal, node->value);
break;
@ -594,24 +595,32 @@ private:
break;
}
case ParsedRegex::LookAhead:
push_inst(m_forward ? CompiledRegex::LookAhead
: CompiledRegex::LookBehind,
push_lookaround(node->children, false));
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead)
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind),
push_lookaround(node->children, false, ignore_case));
break;
case ParsedRegex::NegativeLookAhead:
push_inst(m_forward ? CompiledRegex::NegativeLookAhead
: CompiledRegex::NegativeLookBehind,
push_lookaround(node->children, false));
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead)
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind),
push_lookaround(node->children, false, ignore_case));
break;
case ParsedRegex::LookBehind:
push_inst(m_forward ? CompiledRegex::LookBehind
: CompiledRegex::LookAhead,
push_lookaround(node->children, true));
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind)
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead),
push_lookaround(node->children, true, ignore_case));
break;
case ParsedRegex::NegativeLookBehind:
push_inst(m_forward ? CompiledRegex::NegativeLookBehind
: CompiledRegex::NegativeLookAhead,
push_lookaround(node->children, true));
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind)
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead),
push_lookaround(node->children, true, ignore_case));
break;
case ParsedRegex::LineStart:
push_inst(m_forward ? CompiledRegex::LineStart
@ -698,14 +707,16 @@ private:
return res;
}
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters, bool reversed = false)
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters,
bool reversed, bool ignore_case)
{
uint32_t res = m_program.lookarounds.size();
auto write_lookaround = [this](auto&& characters) {
auto write_lookaround = [this, ignore_case](auto&& characters) {
for (auto& character : characters)
{
if (character->op == ParsedRegex::Literal)
m_program.lookarounds.push_back(character->value);
m_program.lookarounds.push_back(ignore_case ? to_lower(character->value)
: character->value);
else if (character->op == ParsedRegex::AnyChar)
m_program.lookarounds.push_back(0xF000);
else if (character->op == ParsedRegex::Matcher)
@ -841,7 +852,7 @@ void dump_regex(const CompiledRegex& program)
case CompiledRegex::Literal:
printf("literal %lc\n", inst.param);
break;
case CompiledRegex::LiteralIgnoreCase:
case CompiledRegex::Literal_IgnoreCase:
printf("literal (ignore case) %lc\n", inst.param);
break;
case CompiledRegex::AnyChar:
@ -886,6 +897,10 @@ void dump_regex(const CompiledRegex& program)
case CompiledRegex::NegativeLookAhead:
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
case CompiledRegex::LookAhead_IgnoreCase:
case CompiledRegex::NegativeLookAhead_IgnoreCase:
case CompiledRegex::LookBehind_IgnoreCase:
case CompiledRegex::NegativeLookBehind_IgnoreCase:
{
const char* name = nullptr;
if (inst.op == CompiledRegex::LookAhead)
@ -897,6 +912,15 @@ void dump_regex(const CompiledRegex& program)
if (inst.op == CompiledRegex::NegativeLookBehind)
name = "negative look behind";
if (inst.op == CompiledRegex::LookAhead_IgnoreCase)
name = "look ahead (ignore case)";
if (inst.op == CompiledRegex::NegativeLookAhead_IgnoreCase)
name = "negative look ahead (ignore case)";
if (inst.op == CompiledRegex::LookBehind_IgnoreCase)
name = "look behind (ignore case)";
if (inst.op == CompiledRegex::NegativeLookBehind_IgnoreCase)
name = "negative look behind (ignore case)";
String str;
for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
utf8::dump(std::back_inserter(str), *it);
@ -1183,6 +1207,12 @@ auto test_regex = UnitTest{[]{
TestVM<> vm{R"((?=))"};
kak_assert(vm.exec(""));
}
{
TestVM<> vm{R"((?i)(?=Foo))"};
kak_assert(vm.exec("fOO", RegexExecFlags::Search));
kak_assert(*vm.captures()[0] == 'f');
}
}};
}

View File

@ -29,7 +29,7 @@ struct CompiledRegex : RefCountable
{
Match,
Literal,
LiteralIgnoreCase,
Literal_IgnoreCase,
AnyChar,
Matcher,
Jump,
@ -46,6 +46,10 @@ struct CompiledRegex : RefCountable
NegativeLookAhead,
LookBehind,
NegativeLookBehind,
LookAhead_IgnoreCase,
NegativeLookAhead_IgnoreCase,
LookBehind_IgnoreCase,
NegativeLookBehind_IgnoreCase,
};
struct Instruction
@ -240,7 +244,7 @@ private:
if (pos != m_end and inst.param == *pos)
return StepResult::Consumed;
return StepResult::Failed;
case CompiledRegex::LiteralIgnoreCase:
case CompiledRegex::Literal_IgnoreCase:
if (pos != m_end and inst.param == to_lower(*pos))
return StepResult::Consumed;
return StepResult::Failed;
@ -307,12 +311,26 @@ private:
break;
case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead:
if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead))
if (lookaround<MatchDirection::Forward, false>(inst.param, pos) !=
(inst.op == CompiledRegex::LookAhead))
return StepResult::Failed;
break;
case CompiledRegex::LookAhead_IgnoreCase:
case CompiledRegex::NegativeLookAhead_IgnoreCase:
if (lookaround<MatchDirection::Forward, true>(inst.param, pos) !=
(inst.op == CompiledRegex::LookAhead_IgnoreCase))
return StepResult::Failed;
break;
case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind:
if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind))
if (lookaround<MatchDirection::Backward, false>(inst.param, pos) !=
(inst.op == CompiledRegex::LookBehind))
return StepResult::Failed;
break;
case CompiledRegex::LookBehind_IgnoreCase:
case CompiledRegex::NegativeLookBehind_IgnoreCase:
if (lookaround<MatchDirection::Backward, true>(inst.param, pos) !=
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
return StepResult::Failed;
break;
case CompiledRegex::Match:
@ -391,7 +409,7 @@ private:
++start;
}
template<MatchDirection look_direction>
template<MatchDirection look_direction, bool ignore_case>
bool lookaround(uint32_t index, Utf8It pos) const
{
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
@ -399,6 +417,9 @@ private:
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
return false;
auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
if (ignore_case)
cp = to_lower(cp);
if (ref == 0xF000)
{} // any character matches
else if (ref > 0xF0000 and ref <= 0xFFFFD)