Regex: Fix support for ignore case in lookarounds

This commit is contained in:
Maxime Coste 2017-10-10 11:21:21 +08:00
parent 80f6caee81
commit 5bf4be645a
2 changed files with 74 additions and 23 deletions

View File

@ -541,6 +541,7 @@ private:
uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node) uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node)
{ {
const auto start_pos = m_program.instructions.size(); const auto start_pos = m_program.instructions.size();
const bool ignore_case = node->ignore_case;
const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1; const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs))) if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs)))
@ -550,8 +551,8 @@ private:
switch (node->op) switch (node->op)
{ {
case ParsedRegex::Literal: case ParsedRegex::Literal:
if (node->ignore_case) if (ignore_case)
push_inst(CompiledRegex::LiteralIgnoreCase, to_lower(node->value)); push_inst(CompiledRegex::Literal_IgnoreCase, to_lower(node->value));
else else
push_inst(CompiledRegex::Literal, node->value); push_inst(CompiledRegex::Literal, node->value);
break; break;
@ -594,24 +595,32 @@ private:
break; break;
} }
case ParsedRegex::LookAhead: case ParsedRegex::LookAhead:
push_inst(m_forward ? CompiledRegex::LookAhead push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookBehind, : CompiledRegex::LookAhead)
push_lookaround(node->children, false)); : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookBehind),
push_lookaround(node->children, false, ignore_case));
break; break;
case ParsedRegex::NegativeLookAhead: case ParsedRegex::NegativeLookAhead:
push_inst(m_forward ? CompiledRegex::NegativeLookAhead push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookBehind, : CompiledRegex::NegativeLookAhead)
push_lookaround(node->children, false)); : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookBehind),
push_lookaround(node->children, false, ignore_case));
break; break;
case ParsedRegex::LookBehind: case ParsedRegex::LookBehind:
push_inst(m_forward ? CompiledRegex::LookBehind push_inst(m_forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
: CompiledRegex::LookAhead, : CompiledRegex::LookBehind)
push_lookaround(node->children, true)); : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
: CompiledRegex::LookAhead),
push_lookaround(node->children, true, ignore_case));
break; break;
case ParsedRegex::NegativeLookBehind: case ParsedRegex::NegativeLookBehind:
push_inst(m_forward ? CompiledRegex::NegativeLookBehind push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
: CompiledRegex::NegativeLookAhead, : CompiledRegex::NegativeLookBehind)
push_lookaround(node->children, true)); : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
: CompiledRegex::NegativeLookAhead),
push_lookaround(node->children, true, ignore_case));
break; break;
case ParsedRegex::LineStart: case ParsedRegex::LineStart:
push_inst(m_forward ? CompiledRegex::LineStart push_inst(m_forward ? CompiledRegex::LineStart
@ -698,14 +707,16 @@ private:
return res; return res;
} }
uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters, bool reversed = false) uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& characters,
bool reversed, bool ignore_case)
{ {
uint32_t res = m_program.lookarounds.size(); uint32_t res = m_program.lookarounds.size();
auto write_lookaround = [this](auto&& characters) { auto write_lookaround = [this, ignore_case](auto&& characters) {
for (auto& character : characters) for (auto& character : characters)
{ {
if (character->op == ParsedRegex::Literal) if (character->op == ParsedRegex::Literal)
m_program.lookarounds.push_back(character->value); m_program.lookarounds.push_back(ignore_case ? to_lower(character->value)
: character->value);
else if (character->op == ParsedRegex::AnyChar) else if (character->op == ParsedRegex::AnyChar)
m_program.lookarounds.push_back(0xF000); m_program.lookarounds.push_back(0xF000);
else if (character->op == ParsedRegex::Matcher) else if (character->op == ParsedRegex::Matcher)
@ -841,7 +852,7 @@ void dump_regex(const CompiledRegex& program)
case CompiledRegex::Literal: case CompiledRegex::Literal:
printf("literal %lc\n", inst.param); printf("literal %lc\n", inst.param);
break; break;
case CompiledRegex::LiteralIgnoreCase: case CompiledRegex::Literal_IgnoreCase:
printf("literal (ignore case) %lc\n", inst.param); printf("literal (ignore case) %lc\n", inst.param);
break; break;
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
@ -886,6 +897,10 @@ void dump_regex(const CompiledRegex& program)
case CompiledRegex::NegativeLookAhead: case CompiledRegex::NegativeLookAhead:
case CompiledRegex::LookBehind: case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind: case CompiledRegex::NegativeLookBehind:
case CompiledRegex::LookAhead_IgnoreCase:
case CompiledRegex::NegativeLookAhead_IgnoreCase:
case CompiledRegex::LookBehind_IgnoreCase:
case CompiledRegex::NegativeLookBehind_IgnoreCase:
{ {
const char* name = nullptr; const char* name = nullptr;
if (inst.op == CompiledRegex::LookAhead) if (inst.op == CompiledRegex::LookAhead)
@ -897,6 +912,15 @@ void dump_regex(const CompiledRegex& program)
if (inst.op == CompiledRegex::NegativeLookBehind) if (inst.op == CompiledRegex::NegativeLookBehind)
name = "negative look behind"; name = "negative look behind";
if (inst.op == CompiledRegex::LookAhead_IgnoreCase)
name = "look ahead (ignore case)";
if (inst.op == CompiledRegex::NegativeLookAhead_IgnoreCase)
name = "negative look ahead (ignore case)";
if (inst.op == CompiledRegex::LookBehind_IgnoreCase)
name = "look behind (ignore case)";
if (inst.op == CompiledRegex::NegativeLookBehind_IgnoreCase)
name = "negative look behind (ignore case)";
String str; String str;
for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it) for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
utf8::dump(std::back_inserter(str), *it); utf8::dump(std::back_inserter(str), *it);
@ -1183,6 +1207,12 @@ auto test_regex = UnitTest{[]{
TestVM<> vm{R"((?=))"}; TestVM<> vm{R"((?=))"};
kak_assert(vm.exec("")); kak_assert(vm.exec(""));
} }
{
// Regression check: the (?i) flag must also apply inside a lookahead, so the
// mixed-case pattern "Foo" has to be found when searching the subject "fOO".
TestVM<> vm{R"((?i)(?=Foo))"};
kak_assert(vm.exec("fOO", RegexExecFlags::Search));
// Capture 0 must point at the leading 'f' of the subject.
kak_assert(*vm.captures()[0] == 'f');
}
}}; }};
} }

View File

@ -29,7 +29,7 @@ struct CompiledRegex : RefCountable
{ {
Match, Match,
Literal, Literal,
LiteralIgnoreCase, Literal_IgnoreCase,
AnyChar, AnyChar,
Matcher, Matcher,
Jump, Jump,
@ -46,6 +46,10 @@ struct CompiledRegex : RefCountable
NegativeLookAhead, NegativeLookAhead,
LookBehind, LookBehind,
NegativeLookBehind, NegativeLookBehind,
LookAhead_IgnoreCase,
NegativeLookAhead_IgnoreCase,
LookBehind_IgnoreCase,
NegativeLookBehind_IgnoreCase,
}; };
struct Instruction struct Instruction
@ -240,7 +244,7 @@ private:
if (pos != m_end and inst.param == *pos) if (pos != m_end and inst.param == *pos)
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
case CompiledRegex::LiteralIgnoreCase: case CompiledRegex::Literal_IgnoreCase:
if (pos != m_end and inst.param == to_lower(*pos)) if (pos != m_end and inst.param == to_lower(*pos))
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
@ -307,12 +311,26 @@ private:
break; break;
case CompiledRegex::LookAhead: case CompiledRegex::LookAhead:
case CompiledRegex::NegativeLookAhead: case CompiledRegex::NegativeLookAhead:
if (lookaround<MatchDirection::Forward>(inst.param, pos) != (inst.op == CompiledRegex::LookAhead)) if (lookaround<MatchDirection::Forward, false>(inst.param, pos) !=
(inst.op == CompiledRegex::LookAhead))
return StepResult::Failed;
break;
case CompiledRegex::LookAhead_IgnoreCase:
case CompiledRegex::NegativeLookAhead_IgnoreCase:
if (lookaround<MatchDirection::Forward, true>(inst.param, pos) !=
(inst.op == CompiledRegex::LookAhead_IgnoreCase))
return StepResult::Failed; return StepResult::Failed;
break; break;
case CompiledRegex::LookBehind: case CompiledRegex::LookBehind:
case CompiledRegex::NegativeLookBehind: case CompiledRegex::NegativeLookBehind:
if (lookaround<MatchDirection::Backward>(inst.param, pos) != (inst.op == CompiledRegex::LookBehind)) if (lookaround<MatchDirection::Backward, false>(inst.param, pos) !=
(inst.op == CompiledRegex::LookBehind))
return StepResult::Failed;
break;
case CompiledRegex::LookBehind_IgnoreCase:
case CompiledRegex::NegativeLookBehind_IgnoreCase:
if (lookaround<MatchDirection::Backward, true>(inst.param, pos) !=
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
return StepResult::Failed; return StepResult::Failed;
break; break;
case CompiledRegex::Match: case CompiledRegex::Match:
@ -391,7 +409,7 @@ private:
++start; ++start;
} }
template<MatchDirection look_direction> template<MatchDirection look_direction, bool ignore_case>
bool lookaround(uint32_t index, Utf8It pos) const bool lookaround(uint32_t index, Utf8It pos) const
{ {
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it) for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
@ -399,6 +417,9 @@ private:
if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin)) if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin))
return false; return false;
auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it; auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it;
if (ignore_case)
cp = to_lower(cp);
if (ref == 0xF000) if (ref == 0xF000)
{} // any character matches {} // any character matches
else if (ref > 0xF0000 and ref <= 0xFFFFD) else if (ref > 0xF0000 and ref <= 0xFFFFD)