#include "ranked_match.hh"

#include "flags.hh"
#include "unit_tests.hh"
#include "utf8_iterator.hh"
#include "optional.hh"

#include <algorithm>

namespace Kakoune
{

// Summarise which characters occur in str as a bitmask:
//   bits  0..25  'a'..'z'
//   bits 26..51  'A'..'Z'
//   bit  53      '_'
//   bit  54      '-'
//   bit  63      any other byte
// The mask is computed byte by byte, so every byte of a multi-byte
// UTF-8 sequence lands in the "any other" bit.
UsedLetters used_letters(StringView str)
{
    UsedLetters res = 0;
    for (auto c : str)
    {
        if (c >= 'a' and c <= 'z')
            res |= 1uLL << (c - 'a');
        else if (c >= 'A' and c <= 'Z')
            res |= 1uLL << (c - 'A' + 26);
        else if (c == '_')
            res |= 1uLL << 53;
        else if (c == '-')
            res |= 1uLL << 54;
        else
            res |= 1uLL << 63;
    }
    return res;
}

// Return true when every letter bit set in query is also set in letters.
bool matches(UsedLetters query, UsedLetters letters)
{
    return (query & letters) == query;
}

using Utf8It = utf8::iterator<const char*>;

// Count how many codepoints of query can be matched, in order, against
// codepoints of candidate that sit on a word boundary.
//
// A candidate codepoint is on a word boundary when it is the first one,
// when it is alphanumeric and follows a non alphanumeric codepoint, or
// when it is uppercase and follows a lowercase codepoint.
//
// For each boundary codepoint the remaining part of the query is scanned
// for the first codepoint that matches it (lowercase query codepoints
// match case insensitively, others must match exactly); the query
// codepoints skipped over by that scan are not considered again.
static int count_word_boundaries_match(StringView candidate, StringView query)
{
    int count = 0;
    Utf8It query_it{query.begin(), query};
    Codepoint prev = 0;
    for (Utf8It it{candidate.begin(), candidate}; it != candidate.end(); ++it)
    {
        const Codepoint c = *it;
        const bool is_word_boundary = prev == 0 or
                                      (!iswalnum((wchar_t)prev) and iswalnum((wchar_t)c)) or
                                      (iswlower((wchar_t)prev) and iswupper((wchar_t)c));
        prev = c;

        if (not is_word_boundary)
            continue;

        const Codepoint lc = to_lower(c);
        for (auto qit = query_it; qit != query.end(); ++qit)
        {
            const Codepoint qc = *qit;
            if (qc == (iswlower((wchar_t)qc) ? lc : c))
            {
                ++count;
                query_it = qit+1;
                break;
            }
        }
        // the whole query has been consumed, nothing left to match
        if (query_it == query.end())
            break;
    }
    return count;
}

// Smart case comparison: a lowercase query codepoint matches the candidate
// regardless of its case, any other query codepoint must match exactly.
static bool smartcase_eq(Codepoint candidate, Codepoint query)
{
    return query == (iswlower((wchar_t)query) ? to_lower(candidate) : candidate);
}

// Result of a successful subsequence match.
struct SubseqRes
{
    // codepoint index, in the searched string, of the last matched codepoint
    int max_index;
    // false if a non word codepoint had to be skipped after the first match
    bool single_word;
};

// Try to match subseq as a (smart case) subsequence of str, greedily taking
// the first matching codepoint each time.
// Returns an empty Optional when str is exhausted before subseq is.
static Optional<SubseqRes> subsequence_match_smart_case(StringView str, StringView subseq)
{
    bool single_word = true;
    int max_index = -1;
    auto it = str.begin();
    int index = 0;
    for (auto subseq_it = subseq.begin(); subseq_it != subseq.end();)
    {
        if (it == str.end())
            return {};
        const Codepoint c = utf8::read_codepoint(subseq_it, subseq.end());
        while (true)
        {
            auto str_c = utf8::read_codepoint(it, str.end());
            if (smartcase_eq(str_c, c))
                break;

            // only codepoints skipped after the first match can split the
            // match across several words
            if (max_index != -1 and single_word and not is_word(str_c))
                single_word = false;

            ++index;
            if (it == str.end())
                return {};
        }
        max_index = index++;
    }
    return SubseqRes{max_index, single_word};
}

// Match query against candidate and compute the flags used for ranking.
//
// func is a cheap pre-filter, called with no argument once the trivial
// cases are handled; the (more expensive) subsequence match only runs if
// it returns true.
//
// The match is left invalid (no candidate stored) when candidate is empty,
// when query is longer than candidate, when func rejects it or when query
// is not a subsequence of candidate. An empty query matches any non empty
// candidate, with no flag set.
template<typename TestFunc>
RankedMatch::RankedMatch(StringView candidate, StringView query, TestFunc func)
{
    if (candidate.empty() or query.length() > candidate.length())
        return;

    if (query.empty())
    {
        m_candidate = candidate;
        return;
    }

    if (not func())
        return;

    auto res = subsequence_match_smart_case(candidate, query);
    if (not res)
        return;

    m_candidate = candidate;
    m_max_index = res->max_index;

    if (res->single_word)
        m_flags |= Flags::SingleWord;

    // Compare the first codepoints, not the first bytes: distinct multi-byte
    // codepoints can share the same UTF-8 lead byte. Both strings are known
    // to be non empty here.
    auto candidate_first = candidate.begin();
    auto query_first = query.begin();
    if (smartcase_eq(utf8::read_codepoint(candidate_first, candidate.end()),
                     utf8::read_codepoint(query_first, query.end())))
        m_flags |= Flags::FirstCharMatch;

    // NOTE(review): this search runs on the raw string iterators, so smart
    // case is applied element by element rather than per codepoint.
    auto it = std::search(candidate.begin(), candidate.end(),
                          query.begin(), query.end(), smartcase_eq);
    if (it != candidate.end())
    {
        m_flags |= Flags::Contiguous;
        if (it == candidate.begin())
        {
            m_flags |= Flags::Prefix;
            if (query.length() == candidate.length())
            {
                m_flags |= Flags::SmartFullMatch;
                if (candidate == query)
                    m_flags |= Flags::FullMatch;
            }
        }
    }

    m_word_boundary_match_count = count_word_boundaries_match(candidate, query);
    // The count is expressed in codepoints, so it has to be compared with the
    // codepoint length of the query, not its byte length.
    if (m_word_boundary_match_count == (int)query.char_length())
        m_flags |= Flags::OnlyWordBoundary;
}

// Same as above, using the precomputed letter masks as pre-filter: the query
// letters (case folded) must all appear in the candidate, and the uppercase
// letters of the query must appear as uppercase in the candidate.
RankedMatch::RankedMatch(StringView candidate, UsedLetters candidate_letters,
                         StringView query, UsedLetters query_letters)
    : RankedMatch{candidate, query, [&] {
        return matches(to_lower(query_letters), to_lower(candidate_letters)) and
               matches(query_letters & upper_mask, candidate_letters & upper_mask);
    }} {}

// Match without any pre-filter.
RankedMatch::RankedMatch(StringView candidate, StringView query)
    : RankedMatch{candidate, query, [] { return true; }}
{
}

// Word boundary test used when ordering candidates: there is a boundary
// between prev and c when they differ in being alphanumeric or in being
// lowercase.
// The isw* functions only guarantee a non zero result for "true", so the
// results are normalised to bool before being compared.
static bool is_word_boundary(Codepoint prev, Codepoint c)
{
    const bool prev_alnum = iswalnum((wchar_t)prev);
    const bool cur_alnum = iswalnum((wchar_t)c);
    const bool prev_lower = iswlower((wchar_t)prev);
    const bool cur_lower = iswlower((wchar_t)c);
    return prev_alnum != cur_alnum or prev_lower != cur_lower;
}

// Ordering of two valid matches, a match that compares less ranks first.
// Criteria, in order:
//  - flags: among the flags that differ, the match holding the highest
//    valued ones comes first
//  - number of query codepoints matched on word boundaries (higher first),
//    unless this match is a prefix or single word match
//  - index of the last matched codepoint (lower first)
//  - the candidate text itself, compared codepoint by codepoint
bool RankedMatch::operator<(const RankedMatch& other) const
{
    kak_assert((bool)*this and (bool)other);

    const auto diff = m_flags ^ other.m_flags;
    // flags are different, use their ordering to return the first match
    if (diff != Flags::None)
        return (int)(m_flags & diff) > (int)(other.m_flags & diff);

    // If we are SingleWord, FirstCharMatch will do the job, and we don't want
    // to take other word boundaries into account.
    if (not (m_flags & (Flags::Prefix | Flags::SingleWord)) and
        m_word_boundary_match_count != other.m_word_boundary_match_count)
        return m_word_boundary_match_count > other.m_word_boundary_match_count;

    if (m_max_index != other.m_max_index)
        return m_max_index < other.m_max_index;

    // Reorder codepoints to improve matching behaviour
    auto order = [](Codepoint cp) { return cp == '/' ? 0 : cp; };

    auto it1 = m_candidate.begin(), it2 = other.m_candidate.begin();
    const auto begin1 = it1, begin2 = it2;
    const auto end1 = m_candidate.end(), end2 = other.m_candidate.end();
    auto last1 = it1, last2 = it2;
    while (true)
    {
        // find next mismatch
        while (it1 != end1 and it2 != end2 and *it1 == *it2)
            ++it1, ++it2;

        // one candidate is a prefix of the other (or they are equal):
        // the shorter one comes first, equal candidates are not less
        if (it1 == end1 or it2 == end2)
            return it1 == end1 and it2 != end2;

        // compare codepoints, going back to the start of the character the
        // mismatch was found in
        it1 = utf8::character_start(it1, last1);
        it2 = utf8::character_start(it2, last2);
        const auto itsave1 = it1, itsave2 = it2;
        const auto cp1 = utf8::read_codepoint(it1, end1);
        const auto cp2 = utf8::read_codepoint(it2, end2);
        if (cp1 != cp2)
        {
            const auto cplast1 = utf8::prev_codepoint(itsave1, begin1).value_or(Codepoint{0});
            const auto cplast2 = utf8::prev_codepoint(itsave2, begin2).value_or(Codepoint{0});
            // a codepoint on a word boundary comes first
            const bool is_wb1 = is_word_boundary(cplast1, cp1);
            const bool is_wb2 = is_word_boundary(cplast2, cp2);
            if (is_wb1 != is_wb2)
                return is_wb1;

            // then lowercase comes before anything else
            const bool low1 = iswlower((wchar_t)cp1);
            const bool low2 = iswlower((wchar_t)cp2);
            if (low1 != low2)
                return low1;

            return order(cp1) < order(cp2);
        }
        // both sides decoded to the same codepoint, resume after it
        last1 = it1; last2 = it2;
    }
}

UnitTest test_ranked_match{[] {
    kak_assert(count_word_boundaries_match("run_all_tests", "rat") == 3);
    kak_assert(count_word_boundaries_match("run_all_tests", "at") == 2);
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "wm") == 2);
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cobm") == 3);
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cWBM") == 4);
    kak_assert(RankedMatch{"source", "so"} < RankedMatch{"source_data", "so"});
    kak_assert(not (RankedMatch{"source_data", "so"} < RankedMatch{"source", "so"}));
    kak_assert(not (RankedMatch{"source", "so"} < RankedMatch{"source", "so"}));
    kak_assert(RankedMatch{"single/word", "wo"} < RankedMatch{"multiw/ord", "wo"});
    kak_assert(RankedMatch{"foo/bar/foobar", "foobar"} < RankedMatch{"foo/bar/baz", "foobar"});
    kak_assert(RankedMatch{"delete-buffer", "db"} < RankedMatch{"debug", "db"});
    kak_assert(RankedMatch{"create_task", "ct"} < RankedMatch{"constructor", "ct"});
    kak_assert(RankedMatch{"class", "cla"} < RankedMatch{"class::attr", "cla"});
    kak_assert(RankedMatch{"meta/", "meta"} < RankedMatch{"meta-a/", "meta"});
    kak_assert(RankedMatch{"find(1p)", "find"} < RankedMatch{"findfs(8)", "find"});
    kak_assert(RankedMatch{"find(1p)", "fin"} < RankedMatch{"findfs(8)", "fin"});
    kak_assert(RankedMatch{"sys_find(1p)", "sys_find"} < RankedMatch{"sys_findfs(8)", "sys_find"});
    kak_assert(RankedMatch{"init", ""} < RankedMatch{"__init__", ""});
    kak_assert(RankedMatch{"init", "ini"} < RankedMatch{"__init__", "ini"});
    kak_assert(RankedMatch{"a", ""} < RankedMatch{"b", ""});
    kak_assert(RankedMatch{"expresions", "expresins"} < RankedMatch{"expressionism's", "expresins"});
}};

UnitTest test_used_letters{[]()
{
    kak_assert(used_letters("abcd") == to_lower(used_letters("abcdABCD")));
}};

}