Regex: Find potential start position using a map of valid start chars
With this optimization we get close to performance parity with boost regex on the common use cases in Kakoune.
This commit is contained in:
parent
741772aef9
commit
3b69dda04e
|
@ -11,7 +11,8 @@ using Utf8It = RegexUtf8It<const char*>;
|
||||||
Regex::Regex(StringView re, flag_type flags) try
|
Regex::Regex(StringView re, flag_type flags) try
|
||||||
: RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()}
|
: RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()}
|
||||||
{
|
{
|
||||||
m_impl = compile_regex(re);
|
if (auto compiled_regex = compile_regex(re))
|
||||||
|
m_impl = new CompiledRegex{std::move(compiled_regex)};
|
||||||
} catch (std::runtime_error& err) { throw regex_error(err.what()); }
|
} catch (std::runtime_error& err) { throw regex_error(err.what()); }
|
||||||
|
|
||||||
String option_to_string(const Regex& re)
|
String option_to_string(const Regex& re)
|
||||||
|
|
12
src/regex.hh
12
src/regex.hh
|
@ -36,11 +36,11 @@ public:
|
||||||
|
|
||||||
static constexpr const char* option_type_name = "regex";
|
static constexpr const char* option_type_name = "regex";
|
||||||
|
|
||||||
const CompiledRegex& impl() const { return m_impl; }
|
const CompiledRegex* impl() const { return m_impl.get(); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
String m_str;
|
String m_str;
|
||||||
CompiledRegex m_impl;
|
RefPtr<CompiledRegex> m_impl;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename It>
|
template<typename It>
|
||||||
|
@ -143,7 +143,7 @@ bool regex_match(It begin, It end, const Regex& re)
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
|
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
|
||||||
if (re.impl() and matched != regex_match(begin, end, re.impl()))
|
if (re.impl() and matched != regex_match(begin, end, *re.impl()))
|
||||||
regex_mismatch(re);
|
regex_mismatch(re);
|
||||||
return matched;
|
return matched;
|
||||||
}
|
}
|
||||||
|
@ -160,7 +160,7 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
|
||||||
{
|
{
|
||||||
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
|
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
|
||||||
Vector<It> captures;
|
Vector<It> captures;
|
||||||
if (re.impl() and matched != regex_match(begin, end, captures, re.impl()))
|
if (re.impl() and matched != regex_match(begin, end, captures, *re.impl()))
|
||||||
regex_mismatch(re);
|
regex_mismatch(re);
|
||||||
if (re.impl() and matched)
|
if (re.impl() and matched)
|
||||||
check_captures(re, res, captures);
|
check_captures(re, res, captures);
|
||||||
|
@ -179,7 +179,7 @@ bool regex_search(It begin, It end, const Regex& re,
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
|
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
|
||||||
if (re.impl() and matched != regex_search(begin, end, re.impl(), convert_flags(flags)))
|
if (re.impl() and matched != regex_search(begin, end, *re.impl(), convert_flags(flags)))
|
||||||
regex_mismatch(re);
|
regex_mismatch(re);
|
||||||
return matched;
|
return matched;
|
||||||
}
|
}
|
||||||
|
@ -197,7 +197,7 @@ bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
|
||||||
{
|
{
|
||||||
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
|
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
|
||||||
Vector<It> captures;
|
Vector<It> captures;
|
||||||
if (re.impl() and matched != regex_search(begin, end, captures, re.impl(), convert_flags(flags)))
|
if (re.impl() and matched != regex_search(begin, end, captures, *re.impl(), convert_flags(flags)))
|
||||||
regex_mismatch(re);
|
regex_mismatch(re);
|
||||||
if (re.impl() and matched)
|
if (re.impl() and matched)
|
||||||
check_captures(re, res, captures);
|
check_captures(re, res, captures);
|
||||||
|
|
|
@ -515,6 +515,7 @@ struct RegexCompiler
|
||||||
push_op(CompiledRegex::Match);
|
push_op(CompiledRegex::Match);
|
||||||
m_program.matchers = m_parsed_regex.matchers;
|
m_program.matchers = m_parsed_regex.matchers;
|
||||||
m_program.save_count = m_parsed_regex.capture_count * 2;
|
m_program.save_count = m_parsed_regex.capture_count * 2;
|
||||||
|
m_program.start_chars = compute_start_chars();
|
||||||
}
|
}
|
||||||
|
|
||||||
CompiledRegex get_compiled_regex() { return std::move(m_program); }
|
CompiledRegex get_compiled_regex() { return std::move(m_program); }
|
||||||
|
@ -708,6 +709,87 @@ private:
|
||||||
push_codepoint(cp->value);
|
push_codepoint(cp->value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fills accepted and rejected according to which chars can start the given node,
|
||||||
|
// returns true if the node did not consume the char, hence a following node in
|
||||||
|
// sequence would be still relevant for the parent node start chars computation.
|
||||||
|
bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
|
||||||
|
bool (&accepted)[256], bool (&rejected)[256]) const
|
||||||
|
{
|
||||||
|
switch (node->op)
|
||||||
|
{
|
||||||
|
case ParsedRegex::Literal:
|
||||||
|
if (node->value < 256)
|
||||||
|
accepted[node->value] = true;
|
||||||
|
return node->quantifier.allows_none();
|
||||||
|
case ParsedRegex::AnyChar:
|
||||||
|
for (auto& b : accepted)
|
||||||
|
b = true;
|
||||||
|
return node->quantifier.allows_none();
|
||||||
|
case ParsedRegex::Matcher:
|
||||||
|
for (auto& b : accepted) // treat matcher as everything can match for now
|
||||||
|
b = true;
|
||||||
|
return node->quantifier.allows_none();
|
||||||
|
case ParsedRegex::Sequence:
|
||||||
|
{
|
||||||
|
bool consumed = false;
|
||||||
|
for (auto& child : node->children)
|
||||||
|
{
|
||||||
|
if (not compute_start_chars(child, accepted, rejected))
|
||||||
|
{
|
||||||
|
consumed = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return not consumed or node->quantifier.allows_none();
|
||||||
|
}
|
||||||
|
case ParsedRegex::Alternation:
|
||||||
|
{
|
||||||
|
bool all_consumed = not node->quantifier.allows_none();
|
||||||
|
for (auto& child : node->children)
|
||||||
|
{
|
||||||
|
if (compute_start_chars(child, accepted, rejected))
|
||||||
|
all_consumed = false;
|
||||||
|
}
|
||||||
|
return not all_consumed;
|
||||||
|
}
|
||||||
|
case ParsedRegex::LineStart:
|
||||||
|
case ParsedRegex::LineEnd:
|
||||||
|
case ParsedRegex::WordBoundary:
|
||||||
|
case ParsedRegex::NotWordBoundary:
|
||||||
|
case ParsedRegex::SubjectBegin:
|
||||||
|
case ParsedRegex::SubjectEnd:
|
||||||
|
case ParsedRegex::ResetStart:
|
||||||
|
return true;
|
||||||
|
case ParsedRegex::LookAhead:
|
||||||
|
if (node->children.empty())
|
||||||
|
compute_start_chars(node->children.front(), accepted, rejected);
|
||||||
|
return true;
|
||||||
|
case ParsedRegex::NegativeLookAhead:
|
||||||
|
if (node->children.empty())
|
||||||
|
compute_start_chars(node->children.front(), rejected, accepted);
|
||||||
|
return true;
|
||||||
|
case ParsedRegex::LookBehind:
|
||||||
|
return true;
|
||||||
|
case ParsedRegex::NegativeLookBehind:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
|
||||||
|
{
|
||||||
|
bool accepted[256] = {};
|
||||||
|
bool rejected[256] = {};
|
||||||
|
if (compute_start_chars(m_parsed_regex.ast, accepted, rejected))
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
auto start_chars = std::make_unique<CompiledRegex::StartChars>();
|
||||||
|
for (int i = 0; i < 256; ++i)
|
||||||
|
start_chars->map[i] = accepted[i] and not rejected[i];
|
||||||
|
|
||||||
|
return start_chars;
|
||||||
|
}
|
||||||
|
|
||||||
CompiledRegex m_program;
|
CompiledRegex m_program;
|
||||||
const ParsedRegex& m_parsed_regex;
|
const ParsedRegex& m_parsed_regex;
|
||||||
};
|
};
|
||||||
|
@ -1020,6 +1102,17 @@ auto test_regex = UnitTest{[]{
|
||||||
kak_assert(vm.exec("foofoofoo"));
|
kak_assert(vm.exec("foofoofoo"));
|
||||||
kak_assert(not vm.exec("barbarbar"));
|
kak_assert(not vm.exec("barbarbar"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM vm{R"((?<!\\)(?:\\\\)*")"};
|
||||||
|
kak_assert(vm.exec("foo\"", false));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
TestVM vm{R"($)"};
|
||||||
|
kak_assert(vm.exec("foo\n", false, true));
|
||||||
|
kak_assert(*vm.m_captures->pos[0] == '\n');
|
||||||
|
}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,11 +6,12 @@
|
||||||
#include "utf8_iterator.hh"
|
#include "utf8_iterator.hh"
|
||||||
#include "vector.hh"
|
#include "vector.hh"
|
||||||
#include "flags.hh"
|
#include "flags.hh"
|
||||||
|
#include "ref_ptr.hh"
|
||||||
|
|
||||||
namespace Kakoune
|
namespace Kakoune
|
||||||
{
|
{
|
||||||
|
|
||||||
struct CompiledRegex
|
struct CompiledRegex : RefCountable
|
||||||
{
|
{
|
||||||
enum Op : char
|
enum Op : char
|
||||||
{
|
{
|
||||||
|
@ -41,6 +42,9 @@ struct CompiledRegex
|
||||||
Vector<char> bytecode;
|
Vector<char> bytecode;
|
||||||
Vector<std::function<bool (Codepoint)>> matchers;
|
Vector<std::function<bool (Codepoint)>> matchers;
|
||||||
size_t save_count;
|
size_t save_count;
|
||||||
|
|
||||||
|
struct StartChars { bool map[256]; };
|
||||||
|
std::unique_ptr<StartChars> start_chars;
|
||||||
};
|
};
|
||||||
|
|
||||||
CompiledRegex compile_regex(StringView re);
|
CompiledRegex compile_regex(StringView re);
|
||||||
|
@ -311,6 +315,16 @@ struct ThreadedRegexVM
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars)
|
||||||
|
{
|
||||||
|
if (not start_chars)
|
||||||
|
return;
|
||||||
|
|
||||||
|
while (start != end and *start >= 0 and *start < 256 and
|
||||||
|
not start_chars[*start])
|
||||||
|
++start;
|
||||||
|
}
|
||||||
|
|
||||||
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
|
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
|
||||||
{
|
{
|
||||||
m_begin = begin;
|
m_begin = begin;
|
||||||
|
@ -324,6 +338,12 @@ struct ThreadedRegexVM
|
||||||
|
|
||||||
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
|
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
|
||||||
Utf8It start{m_begin, m_begin, m_end};
|
Utf8It start{m_begin, m_begin, m_end};
|
||||||
|
|
||||||
|
const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
|
||||||
|
|
||||||
|
if (flags & RegexExecFlags::Search)
|
||||||
|
to_next_start(start, end, start_chars);
|
||||||
|
|
||||||
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
||||||
current_threads, next_threads))
|
current_threads, next_threads))
|
||||||
return true;
|
return true;
|
||||||
|
@ -331,12 +351,15 @@ struct ThreadedRegexVM
|
||||||
if (not (flags & RegexExecFlags::Search))
|
if (not (flags & RegexExecFlags::Search))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
while (start != end)
|
do
|
||||||
{
|
{
|
||||||
if (exec_from(++start, no_saves ? nullptr : new_saves<false>(nullptr),
|
to_next_start(++start, end, start_chars);
|
||||||
|
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
|
||||||
current_threads, next_threads))
|
current_threads, next_threads))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
while (start != end);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user