diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 057dbc11..a6e4a3dc 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -69,6 +69,7 @@ struct ParsedRegex Op op; Codepoint value; Quantifier quantifier; + bool ignore_case; Vector> children; }; @@ -210,6 +211,14 @@ private: validate_lookaround(content); } + else if (c == 'i' or c == 'I') + { + m_ignore_case = c == 'i'; + if (advance() != ')') + parse_error("unclosed parenthesis"); + ++m_pos; + return atom(); // get next atom + } else parse_error("invalid disjunction"); } @@ -351,9 +360,24 @@ private: parse_error("unclosed character class"); ++m_pos; + if (m_ignore_case) + { + for (auto& range : ranges) + { + range.min = to_lower(range.max); + range.max = to_lower(range.max); + } + for (auto& cp : excluded) + cp = to_lower(cp); + } + auto matcher = [ranges = std::move(ranges), ctypes = std::move(ctypes), - excluded = std::move(excluded), negative] (Codepoint cp) { + excluded = std::move(excluded), + negative, ignore_case = m_ignore_case] (Codepoint cp) { + if (ignore_case) + cp = to_lower(cp); + auto found = contains_that(ranges, [cp](auto& r) { return r.min <= cp and cp <= r.max; }) or contains_that(ctypes, [cp](auto& c) { @@ -416,13 +440,12 @@ private: } } - static AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1, - ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One}) + AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1, + ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One}) { - return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, {}}}; + return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, m_ignore_case, {}}}; } - bool at_end() const { return m_pos == m_regex.end(); } [[gnu::noreturn]] @@ -443,6 +466,7 @@ private: ParsedRegex m_parsed_regex; StringView m_regex; Iterator m_pos; + bool m_ignore_case = false; struct CharacterClassEscape { Codepoint cp; @@ -451,11 +475,6 @@ private: bool neg; }; - StringView peek(ByteCount count) - { - return StringView{m_pos.base(), m_regex.end()}.substr(0, count); - } - static const CharacterClassEscape character_class_escapes[8]; }; @@ -477,6 +496,7 @@ struct CompiledRegex { Match, Literal, + LiteralIgnoreCase, AnyChar, Matcher, Jump, @@ -540,8 +560,10 @@ private: switch (node->op) { case ParsedRegex::Literal: - push_op(CompiledRegex::Literal); - push_codepoint(node->value); + push_op(node->ignore_case ? CompiledRegex::LiteralIgnoreCase + : CompiledRegex::Literal); + push_codepoint(node->ignore_case ? to_lower(node->value) + : node->value); break; case ParsedRegex::AnyChar: push_op(CompiledRegex::AnyChar); @@ -731,6 +753,9 @@ void dump_regex(const CompiledRegex& program) case CompiledRegex::Literal: printf("literal %lc\n", utf8::read_codepoint(pos, (const char*)nullptr)); break; + case CompiledRegex::LiteralIgnoreCase: + printf("literal (ignore case) %lc\n", utf8::read_codepoint(pos, (const char*)nullptr)); + break; case CompiledRegex::AnyChar: printf("any char\n"); break; @@ -826,6 +851,10 @@ struct ThreadedRegexVM if (utf8::read_codepoint(thread.inst, prog_end) == cp) return StepResult::Consumed; return StepResult::Failed; + case CompiledRegex::LiteralIgnoreCase: + if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp)) + return StepResult::Consumed; + return StepResult::Failed; case CompiledRegex::AnyChar: return StepResult::Consumed; case CompiledRegex::Jump: @@ -1189,6 +1218,11 @@ auto test_regex = UnitTest{[]{ kak_assert(not vm.exec("foo")); kak_assert(vm.exec("qux")); } + + { + TestVM vm{R"(Foo(?i)f[oB]+)"}; + kak_assert(vm.exec("FooFOoBb")); + } }}; }