Regex: use binary search for character class range checks

This commit is contained in:
Maxime Coste 2017-10-14 22:10:56 +08:00
parent 6e65589a34
commit 9ec175f2f8

View File

@ -315,13 +315,38 @@ private:
parse_error(format("unknown atom escape '{}'", cp)); parse_error(format("unknown atom escape '{}'", cp));
} }
// Range of codepoints matched by a character class; both bounds are inclusive.
struct CharRange
{
    Codepoint min;
    Codepoint max;
};
// Sorts `ranges` by lower bound and coalesces overlapping or directly
// adjacent entries in place, so that the resulting list is ordered and
// can be looked up with a binary search.
void normalize_ranges(Vector<CharRange>& ranges)
{
    if (ranges.empty())
        return;

    // Order by lower bound: mergeable ranges end up next to each other
    std::sort(ranges.begin(), ranges.end(),
              [](const auto& a, const auto& b) { return a.min < b.min; });

    // Compact in place, `kept` always designates the last range we retained
    auto kept = ranges.begin();
    for (auto cur = kept + 1; cur != ranges.end(); ++cur)
    {
        if (kept->max + 1 < cur->min)
        {
            // There is a gap before `cur`, retain it as a separate range
            *++kept = *cur;
            continue;
        }
        // `cur` overlaps or touches `kept`: grow `kept` if `cur` reaches further
        if (kept->max < cur->max)
            kept->max = cur->max;
    }
    // Drop the leftover entries past the last retained range
    ranges.erase(kept + 1, ranges.end());
}
AstNodePtr character_class() AstNodePtr character_class()
{ {
const bool negative = m_pos != m_regex.end() and *m_pos == '^'; const bool negative = m_pos != m_regex.end() and *m_pos == '^';
if (negative) if (negative)
++m_pos; ++m_pos;
struct CharRange { Codepoint min, max; };
Vector<CharRange> ranges; Vector<CharRange> ranges;
Vector<Codepoint> excluded; Vector<Codepoint> excluded;
Vector<std::pair<wctype_t, bool>> ctypes; Vector<std::pair<wctype_t, bool>> ctypes;
@ -404,6 +429,8 @@ private:
cp = to_lower(cp); cp = to_lower(cp);
} }
normalize_ranges(ranges);
// Optimize the relatively common case of using a character class to // Optimize the relatively common case of using a character class to
// escape a character, such as [*] // escape a character, such as [*]
if (ctypes.empty() and excluded.empty() and not negative and if (ctypes.empty() and excluded.empty() and not negative and
@ -417,9 +444,12 @@ private:
if (ignore_case) if (ignore_case)
cp = to_lower(cp); cp = to_lower(cp);
auto found = contains_that(ranges, [cp](auto& r) { auto it = std::lower_bound(ranges.begin(), ranges.end(), cp,
return r.min <= cp and cp <= r.max; [](auto& range, Codepoint cp)
}) or contains_that(ctypes, [cp](auto& c) { { return range.max < cp; });
auto found = (it != ranges.end() and it->min <= cp) or
contains_that(ctypes, [cp](auto& c) {
return (bool)iswctype(cp, c.first) == c.second; return (bool)iswctype(cp, c.first) == c.second;
}) or (not excluded.empty() and not contains(excluded, cp)); }) or (not excluded.empty() and not contains(excluded, cp));
return negative ? not found : found; return negative ? not found : found;
@ -1239,6 +1269,11 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("fOO", RegexExecFlags::Search)); kak_assert(vm.exec("fOO", RegexExecFlags::Search));
kak_assert(*vm.captures()[0] == 'f'); kak_assert(*vm.captures()[0] == 'f');
} }
{
TestVM<> vm{R"([d-ea-dcf-k]+)"};
kak_assert(vm.exec("abcde"));
}
}}; }};
} }