Regex: use binary search for character class range checks
This commit is contained in:
parent
6e65589a34
commit
9ec175f2f8
|
@ -315,13 +315,38 @@ private:
|
||||||
parse_error(format("unknown atom escape '{}'", cp));
|
parse_error(format("unknown atom escape '{}'", cp));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Range of codepoints used by character classes; both min and max are
// inclusive bounds.
struct CharRange { Codepoint min, max; };
|
||||||
|
|
||||||
|
// Canonicalizes `ranges` in place: on return the ranges are ordered by
// ascending min, and ranges that overlapped or were directly adjacent
// have been fused into a single entry. This is the form needed to look a
// codepoint up with a binary search.
void normalize_ranges(Vector<CharRange>& ranges)
{
    if (ranges.empty())
        return;

    // Order by lower bound so that mergeable ranges become neighbours
    std::sort(ranges.begin(), ranges.end(),
              [](auto& a, auto& b) { return a.min < b.min; });

    // `last` is the most recently kept range; each following range is
    // either appended right after it (there is a gap between them) or
    // absorbed into it (they overlap or touch).
    auto last = ranges.begin();
    for (auto cur = last + 1; cur != ranges.end(); ++cur)
    {
        if (last->max + 1 < cur->min)
        {
            ++last;
            *last = *cur;
        }
        else if (last->max < cur->max)
            last->max = cur->max;
    }

    // Everything past `last` is a leftover of the compaction
    ranges.erase(last + 1, ranges.end());
}
|
||||||
|
|
||||||
AstNodePtr character_class()
|
AstNodePtr character_class()
|
||||||
{
|
{
|
||||||
const bool negative = m_pos != m_regex.end() and *m_pos == '^';
|
const bool negative = m_pos != m_regex.end() and *m_pos == '^';
|
||||||
if (negative)
|
if (negative)
|
||||||
++m_pos;
|
++m_pos;
|
||||||
|
|
||||||
struct CharRange { Codepoint min, max; };
|
|
||||||
Vector<CharRange> ranges;
|
Vector<CharRange> ranges;
|
||||||
Vector<Codepoint> excluded;
|
Vector<Codepoint> excluded;
|
||||||
Vector<std::pair<wctype_t, bool>> ctypes;
|
Vector<std::pair<wctype_t, bool>> ctypes;
|
||||||
|
@ -404,6 +429,8 @@ private:
|
||||||
cp = to_lower(cp);
|
cp = to_lower(cp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
normalize_ranges(ranges);
|
||||||
|
|
||||||
// Optimize the relatively common case of using a character class to
|
// Optimize the relatively common case of using a character class to
|
||||||
// escape a character, such as [*]
|
// escape a character, such as [*]
|
||||||
if (ctypes.empty() and excluded.empty() and not negative and
|
if (ctypes.empty() and excluded.empty() and not negative and
|
||||||
|
@ -417,9 +444,12 @@ private:
|
||||||
if (ignore_case)
|
if (ignore_case)
|
||||||
cp = to_lower(cp);
|
cp = to_lower(cp);
|
||||||
|
|
||||||
auto found = contains_that(ranges, [cp](auto& r) {
|
auto it = std::lower_bound(ranges.begin(), ranges.end(), cp,
|
||||||
return r.min <= cp and cp <= r.max;
|
[](auto& range, Codepoint cp)
|
||||||
}) or contains_that(ctypes, [cp](auto& c) {
|
{ return range.max < cp; });
|
||||||
|
|
||||||
|
auto found = (it != ranges.end() and it->min <= cp) or
|
||||||
|
contains_that(ctypes, [cp](auto& c) {
|
||||||
return (bool)iswctype(cp, c.first) == c.second;
|
return (bool)iswctype(cp, c.first) == c.second;
|
||||||
}) or (not excluded.empty() and not contains(excluded, cp));
|
}) or (not excluded.empty() and not contains(excluded, cp));
|
||||||
return negative ? not found : found;
|
return negative ? not found : found;
|
||||||
|
@ -1239,6 +1269,11 @@ auto test_regex = UnitTest{[]{
|
||||||
kak_assert(vm.exec("fOO", RegexExecFlags::Search));
|
kak_assert(vm.exec("fOO", RegexExecFlags::Search));
|
||||||
kak_assert(*vm.captures()[0] == 'f');
|
kak_assert(*vm.captures()[0] == 'f');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM<> vm{R"([d-ea-dcf-k]+)"};
|
||||||
|
kak_assert(vm.exec("abcde"));
|
||||||
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user