Regex: add support for case insensitive matching, controlled by (?i)

This commit is contained in:
Maxime Coste 2017-09-29 11:22:09 +08:00
parent 7673781751
commit 3d2262bebf

View File

@ -69,6 +69,7 @@ struct ParsedRegex
Op op; Op op;
Codepoint value; Codepoint value;
Quantifier quantifier; Quantifier quantifier;
bool ignore_case;
Vector<std::unique_ptr<AstNode>> children; Vector<std::unique_ptr<AstNode>> children;
}; };
@ -210,6 +211,14 @@ private:
validate_lookaround(content); validate_lookaround(content);
} }
// Inline case flag: 'i' sets m_ignore_case to true, 'I' sets it to false.
// The flag stays in effect until another flag group changes it.
else if (c == 'i' or c == 'I')
{
m_ignore_case = c == 'i';
// The flag letter must be directly followed by ')'.
// NOTE(review): presumably advance() moves to the next codepoint and returns it — confirm.
if (advance() != ')')
parse_error("unclosed parenthesis");
++m_pos;
return atom(); // get next atom
}
else else
parse_error("invalid disjunction"); parse_error("invalid disjunction");
} }
@ -351,9 +360,24 @@ private:
parse_error("unclosed character class"); parse_error("unclosed character class");
++m_pos; ++m_pos;
// In ignore-case mode, fold the class contents to lower case once here, so
// they can be compared against lower-cased input codepoints.
if (m_ignore_case)
{
    for (auto& range : ranges)
    {
        // Fold each bound from its own value: deriving min from max would
        // collapse every range (e.g. [a-z]) to its single upper bound.
        range.min = to_lower(range.min);
        range.max = to_lower(range.max);
    }
    for (auto& cp : excluded)
        cp = to_lower(cp);
}
auto matcher = [ranges = std::move(ranges), auto matcher = [ranges = std::move(ranges),
ctypes = std::move(ctypes), ctypes = std::move(ctypes),
excluded = std::move(excluded), negative] (Codepoint cp) { excluded = std::move(excluded),
negative, ignore_case = m_ignore_case] (Codepoint cp) {
if (ignore_case)
cp = to_lower(cp);
auto found = contains_that(ranges, [cp](auto& r) { auto found = contains_that(ranges, [cp](auto& r) {
return r.min <= cp and cp <= r.max; return r.min <= cp and cp <= r.max;
}) or contains_that(ctypes, [cp](auto& c) { }) or contains_that(ctypes, [cp](auto& c) {
@ -416,13 +440,12 @@ private:
} }
} }
static AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1, AstNodePtr new_node(ParsedRegex::Op op, Codepoint value = -1,
ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One}) ParsedRegex::Quantifier quantifier = {ParsedRegex::Quantifier::One})
{ {
return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, {}}}; return AstNodePtr{new ParsedRegex::AstNode{op, value, quantifier, m_ignore_case, {}}};
} }
bool at_end() const { return m_pos == m_regex.end(); } bool at_end() const { return m_pos == m_regex.end(); }
[[gnu::noreturn]] [[gnu::noreturn]]
@ -443,6 +466,7 @@ private:
ParsedRegex m_parsed_regex; ParsedRegex m_parsed_regex;
StringView m_regex; StringView m_regex;
Iterator m_pos; Iterator m_pos;
bool m_ignore_case = false;
struct CharacterClassEscape { struct CharacterClassEscape {
Codepoint cp; Codepoint cp;
@ -451,11 +475,6 @@ private:
bool neg; bool neg;
}; };
StringView peek(ByteCount count)
{
return StringView{m_pos.base(), m_regex.end()}.substr(0, count);
}
static const CharacterClassEscape character_class_escapes[8]; static const CharacterClassEscape character_class_escapes[8];
}; };
@ -477,6 +496,7 @@ struct CompiledRegex
{ {
Match, Match,
Literal, Literal,
LiteralIgnoreCase,
AnyChar, AnyChar,
Matcher, Matcher,
Jump, Jump,
@ -540,8 +560,10 @@ private:
switch (node->op) switch (node->op)
{ {
case ParsedRegex::Literal: case ParsedRegex::Literal:
push_op(CompiledRegex::Literal); push_op(node->ignore_case ? CompiledRegex::LiteralIgnoreCase
push_codepoint(node->value); : CompiledRegex::Literal);
push_codepoint(node->ignore_case ? to_lower(node->value)
: node->value);
break; break;
case ParsedRegex::AnyChar: case ParsedRegex::AnyChar:
push_op(CompiledRegex::AnyChar); push_op(CompiledRegex::AnyChar);
@ -731,6 +753,9 @@ void dump_regex(const CompiledRegex& program)
case CompiledRegex::Literal: case CompiledRegex::Literal:
printf("literal %lc\n", utf8::read_codepoint(pos, (const char*)nullptr)); printf("literal %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
break; break;
case CompiledRegex::LiteralIgnoreCase:
printf("literal (ignore case) %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
break;
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
printf("any char\n"); printf("any char\n");
break; break;
@ -826,6 +851,10 @@ struct ThreadedRegexVM
if (utf8::read_codepoint(thread.inst, prog_end) == cp) if (utf8::read_codepoint(thread.inst, prog_end) == cp)
return StepResult::Consumed; return StepResult::Consumed;
return StepResult::Failed; return StepResult::Failed;
// Case-insensitive literal: the codepoint read from the program at
// thread.inst is compared against to_lower(cp) instead of cp itself.
// NOTE(review): this relies on the compiler having stored the literal
// already lower-cased — confirm against the compile step.
case CompiledRegex::LiteralIgnoreCase:
if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
return StepResult::Consumed;
return StepResult::Failed;
case CompiledRegex::AnyChar: case CompiledRegex::AnyChar:
return StepResult::Consumed; return StepResult::Consumed;
case CompiledRegex::Jump: case CompiledRegex::Jump:
@ -1189,6 +1218,11 @@ auto test_regex = UnitTest{[]{
kak_assert(not vm.exec("foo")); kak_assert(not vm.exec("foo"));
kak_assert(vm.exec("qux")); kak_assert(vm.exec("qux"));
} }
{
TestVM vm{R"(Foo(?i)f[oB]+)"};
kak_assert(vm.exec("FooFOoBb"));
}
}}; }};
} }