From 5bf4be645a10f9cbabf2ed7ed5d96fdfdf1ab839 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Tue, 10 Oct 2017 11:21:21 +0800 Subject: [PATCH] Regex: Fix support for ignore case in lookarounds --- src/regex_impl.cc | 66 ++++++++++++++++++++++++++++++++++------------- src/regex_impl.hh | 31 ++++++++++++++++++---- 2 files changed, 74 insertions(+), 23 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 847044b7..bb1f3537 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -541,6 +541,7 @@ private: uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node) { const auto start_pos = m_program.instructions.size(); + const bool ignore_case = node->ignore_case; const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1; if (capture != -1 and (capture == 0 or not (m_flags & RegexCompileFlags::NoSubs))) @@ -550,8 +551,8 @@ private: switch (node->op) { case ParsedRegex::Literal: - if (node->ignore_case) - push_inst(CompiledRegex::LiteralIgnoreCase, to_lower(node->value)); + if (ignore_case) + push_inst(CompiledRegex::Literal_IgnoreCase, to_lower(node->value)); else push_inst(CompiledRegex::Literal, node->value); break; @@ -594,24 +595,32 @@ private: break; } case ParsedRegex::LookAhead: - push_inst(m_forward ? CompiledRegex::LookAhead - : CompiledRegex::LookBehind, - push_lookaround(node->children, false)); + push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase + : CompiledRegex::LookAhead) + : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase + : CompiledRegex::LookBehind), + push_lookaround(node->children, false, ignore_case)); break; case ParsedRegex::NegativeLookAhead: - push_inst(m_forward ? CompiledRegex::NegativeLookAhead - : CompiledRegex::NegativeLookBehind, - push_lookaround(node->children, false)); + push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase + : CompiledRegex::NegativeLookAhead) + : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase + : CompiledRegex::NegativeLookBehind), + push_lookaround(node->children, false, ignore_case)); break; case ParsedRegex::LookBehind: - push_inst(m_forward ? CompiledRegex::LookBehind - : CompiledRegex::LookAhead, - push_lookaround(node->children, true)); + push_inst(m_forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase + : CompiledRegex::LookBehind) + : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase + : CompiledRegex::LookAhead), + push_lookaround(node->children, true, ignore_case)); break; case ParsedRegex::NegativeLookBehind: - push_inst(m_forward ? CompiledRegex::NegativeLookBehind - : CompiledRegex::NegativeLookAhead, - push_lookaround(node->children, true)); + push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase + : CompiledRegex::NegativeLookBehind) + : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase + : CompiledRegex::NegativeLookAhead), + push_lookaround(node->children, true, ignore_case)); break; case ParsedRegex::LineStart: push_inst(m_forward ? CompiledRegex::LineStart @@ -698,14 +707,16 @@ private: return res; } - uint32_t push_lookaround(const Vector& characters, bool reversed = false) + uint32_t push_lookaround(const Vector& characters, + bool reversed, bool ignore_case) { uint32_t res = m_program.lookarounds.size(); - auto write_lookaround = [this](auto&& characters) { + auto write_lookaround = [this, ignore_case](auto&& characters) { for (auto& character : characters) { if (character->op == ParsedRegex::Literal) - m_program.lookarounds.push_back(character->value); + m_program.lookarounds.push_back(ignore_case ? to_lower(character->value) + : character->value); else if (character->op == ParsedRegex::AnyChar) m_program.lookarounds.push_back(0xF000); else if (character->op == ParsedRegex::Matcher) @@ -841,7 +852,7 @@ void dump_regex(const CompiledRegex& program) case CompiledRegex::Literal: printf("literal %lc\n", inst.param); break; - case CompiledRegex::LiteralIgnoreCase: + case CompiledRegex::Literal_IgnoreCase: printf("literal (ignore case) %lc\n", inst.param); break; case CompiledRegex::AnyChar: @@ -886,6 +897,10 @@ void dump_regex(const CompiledRegex& program) case CompiledRegex::NegativeLookAhead: case CompiledRegex::LookBehind: case CompiledRegex::NegativeLookBehind: + case CompiledRegex::LookAhead_IgnoreCase: + case CompiledRegex::NegativeLookAhead_IgnoreCase: + case CompiledRegex::LookBehind_IgnoreCase: + case CompiledRegex::NegativeLookBehind_IgnoreCase: { const char* name = nullptr; if (inst.op == CompiledRegex::LookAhead) @@ -897,6 +912,15 @@ void dump_regex(const CompiledRegex& program) if (inst.op == CompiledRegex::NegativeLookBehind) name = "negative look behind"; + if (inst.op == CompiledRegex::LookAhead_IgnoreCase) + name = "look ahead (ignore case)"; + if (inst.op == CompiledRegex::NegativeLookAhead_IgnoreCase) + name = "negative look ahead (ignore case)"; + if (inst.op == CompiledRegex::LookBehind_IgnoreCase) + name = "look behind (ignore case)"; + if (inst.op == CompiledRegex::NegativeLookBehind_IgnoreCase) + name = "negative look behind (ignore case)"; + String str; for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it) utf8::dump(std::back_inserter(str), *it); @@ -1183,6 +1207,12 @@ auto test_regex = UnitTest{[]{ TestVM<> vm{R"((?=))"}; kak_assert(vm.exec("")); } + + { + TestVM<> vm{R"((?i)(?=Foo))"}; + kak_assert(vm.exec("fOO", RegexExecFlags::Search)); + kak_assert(*vm.captures()[0] == 'f'); + } }}; } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 2d8da322..9aa736d6 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -29,7 +29,7 @@ struct CompiledRegex : RefCountable { Match, Literal, - LiteralIgnoreCase, + Literal_IgnoreCase, AnyChar, Matcher, Jump, @@ -46,6 +46,10 @@ struct CompiledRegex : RefCountable NegativeLookAhead, LookBehind, NegativeLookBehind, + LookAhead_IgnoreCase, + NegativeLookAhead_IgnoreCase, + LookBehind_IgnoreCase, + NegativeLookBehind_IgnoreCase, }; struct Instruction @@ -240,7 +244,7 @@ private: if (pos != m_end and inst.param == *pos) return StepResult::Consumed; return StepResult::Failed; - case CompiledRegex::LiteralIgnoreCase: + case CompiledRegex::Literal_IgnoreCase: if (pos != m_end and inst.param == to_lower(*pos)) return StepResult::Consumed; return StepResult::Failed; @@ -307,12 +311,26 @@ private: break; case CompiledRegex::LookAhead: case CompiledRegex::NegativeLookAhead: - if (lookaround(inst.param, pos) != (inst.op == CompiledRegex::LookAhead)) + if (lookaround(inst.param, pos) != + (inst.op == CompiledRegex::LookAhead)) + return StepResult::Failed; + break; + case CompiledRegex::LookAhead_IgnoreCase: + case CompiledRegex::NegativeLookAhead_IgnoreCase: + if (lookaround(inst.param, pos) != + (inst.op == CompiledRegex::LookAhead_IgnoreCase)) return StepResult::Failed; break; case CompiledRegex::LookBehind: case CompiledRegex::NegativeLookBehind: - if (lookaround(inst.param, pos) != (inst.op == CompiledRegex::LookBehind)) + if (lookaround(inst.param, pos) != + (inst.op == CompiledRegex::LookBehind)) + return StepResult::Failed; + break; + case CompiledRegex::LookBehind_IgnoreCase: + case CompiledRegex::NegativeLookBehind_IgnoreCase: + if (lookaround(inst.param, pos) != + (inst.op == CompiledRegex::LookBehind_IgnoreCase)) return StepResult::Failed; break; case CompiledRegex::Match: @@ -391,7 +409,7 @@ private: ++start; } - template + template bool lookaround(uint32_t index, Utf8It pos) const { for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it) @@ -399,6 +417,9 @@ private: if (pos == (look_direction == MatchDirection::Forward ? m_end : m_begin)) return false; auto cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1)), ref = *it; + if (ignore_case) + cp = to_lower(cp); + if (ref == 0xF000) {} // any character matches else if (ref > 0xF0000 and ref <= 0xFFFFD)