home/src/ranked_match.cc

#include "ranked_match.hh"

#include "utf8_iterator.hh"
#include "unit_tests.hh"

#include <wctype.h>

namespace Kakoune
{

// Build a bitmask summarizing which characters appear in str.
//
// Bit layout, as set below:
//   0..25  : 'a'..'z'
//   26..51 : 'A'..'Z'
//   53     : '_'
//   54     : '-'
//   63     : anything else
//
// The shifted constant is 1uLL rather than 1uL: unsigned long is only
// guaranteed to be 32 bits wide (and is 32 bits on LLP64 targets), which
// would make every shift by 32 or more undefined behaviour.
UsedLetters used_letters(StringView str)
{
    UsedLetters res = 0;
    for (auto c : str)
    {
        if (c >= 'a' and c <= 'z')
            res |= 1uLL << (c - 'a');
        else if (c >= 'A' and c <= 'Z')
            res |= 1uLL << (c - 'A' + 26);
        else if (c == '_')
            res |= 1uLL << 53;
        else if (c == '-')
            res |= 1uLL << 54;
        else
            res |= 1uLL << 63;
    }
    return res;
}

// Tell whether every bit set in query is set in letters as well,
// i.e. whether query is a subset of letters.
bool matches(UsedLetters query, UsedLetters letters)
{
    const UsedLetters common = query & letters;
    return common == query;
}

// Iterator over a const char range used in this file to walk strings one
// Codepoint at a time (dereferencing it yields a Codepoint).
using Utf8It = utf8::iterator<const char*>;

// Count how many code points of query can be matched, in order, against
// the code points of candidate that start a word.
//
// A candidate code point starts a word when it is the first one (prev is
// still 0), when it is alphanumeric and follows a non alphanumeric one
// (snake_case), or when it is upper case and follows a lower case one
// (camelCase).
//
// Matching is smart-case: a lower case query code point is compared with
// the lower-cased candidate code point, any other query code point has to
// be equal to the candidate one. For each word start the query is scanned
// forward from just after the previous match, so query code points that
// match no word start are skipped over.
static int count_word_boundaries_match(StringView candidate, StringView query)
{
    int count = 0;
    Utf8It query_it{query.begin(), query};
    Codepoint prev = 0;
    for (Utf8It it{candidate.begin(), candidate}; it != candidate.end(); ++it)
    {
        const Codepoint c = *it;
        const bool is_word_boundary = prev == 0 or
                                      (!iswalnum((wchar_t)prev) and iswalnum((wchar_t)c)) or
                                      (iswlower((wchar_t)prev) and iswupper((wchar_t)c));
        prev = c;

        if (not is_word_boundary)
            continue;

        const Codepoint lc = to_lower(c);
        for (auto qit = query_it; qit != query.end(); ++qit)
        {
            const Codepoint qc = *qit;
            // iswlower rather than islower: qc is a full code point, and
            // islower is only defined for unsigned char values and EOF.
            if (qc == (iswlower((wchar_t)qc) ? lc : c))
            {
                ++count;
                query_it = qit+1;
                break;
            }
        }
        // the whole query has been consumed, nothing left to match
        if (query_it == query.end())
            break;
    }
    return count;
}

// Smart-case equality between a query code point and a candidate one:
// a lower case query code point is compared with the lower-cased
// candidate, any other query code point has to be equal to the candidate.
//
// iswlower is used rather than islower because query is a full code
// point, and islower is only defined for unsigned char values and EOF.
static bool smartcase_eq(Codepoint query, Codepoint candidate)
{
    return query == (iswlower((wchar_t)query) ? to_lower(candidate) : candidate);
}

// Tell whether the code points of subseq appear, in order and under
// smart-case comparison, in str.
//
// On success, out_index_sum receives the sum of the code point indices in
// str at which each code point of subseq got matched; it is left untouched
// when the function returns false.
//
// utf8::read_codepoint is relied on to advance the iterator it is given,
// which is what moves both subseq_it and it forward.
static bool subsequence_match_smart_case(StringView str, StringView subseq, int& out_index_sum)
{
    int index_sum = 0;
    auto it = str.begin();
    int index = 0;
    for (auto subseq_it = subseq.begin(); subseq_it != subseq.end();)
    {
        if (it == str.end())
            return false;
        const Codepoint c = utf8::read_codepoint(subseq_it, subseq.end());
        // it walks str, so decoding has to be bounded by str.end(),
        // not by the end of the unrelated subseq range.
        while (not smartcase_eq(c, utf8::read_codepoint(it, str.end())))
        {
            ++index;
            if (it == str.end())
                return false;
        }
        // account for the matched position, then step past it
        index_sum += index++;
    }
    out_index_sum = index_sum;
    return true;
}

// Shared constructor logic: try to match query against candidate and, on
// success, record the candidate along with the data used for ranking.
//
// func is a nullary predicate evaluated before the subsequence search;
// when it returns false the search is skipped and no match is recorded.
// An empty candidate, or a query longer than the candidate, never matches.
// An empty query matches any non empty candidate, without setting flags.
//
// NOTE(review): query[0] / candidate[0] and query.length() look like byte
// level operations, while the word boundary count is in code points;
// confirm the intended behaviour for multi-byte queries.
template<typename TestFunc>
RankedMatch::RankedMatch(StringView candidate, StringView query, TestFunc func)
{
    if (candidate.empty() or query.length() > candidate.length())
        return;

    if (query.empty())
    {
        m_candidate = candidate;
        return;
    }

    // func() is evaluated first so that it can short-circuit the search
    if (not func() or
        not subsequence_match_smart_case(candidate, query, m_match_index_sum))
        return;

    m_candidate = candidate;

    if (smartcase_eq(query[0], candidate[0]))
        m_flags |= Flags::FirstCharMatch;

    // safe without a second end iterator: query is not longer than candidate
    const bool is_prefix = std::equal(query.begin(), query.end(), candidate.begin());
    if (is_prefix)
    {
        m_flags |= Flags::Prefix;
        if (query.length() == candidate.length())
            m_flags |= Flags::FullMatch;
    }

    m_word_boundary_match_count = count_word_boundaries_match(candidate, query);
    if (m_word_boundary_match_count == query.length())
        m_flags |= Flags::OnlyWordBoundary;
}

// Build a match using precomputed letter sets as a pre-filter: the
// subsequence search only runs when the letter sets are compatible.
//
// Two checks are made: the query letters must be a subset of the candidate
// letters once both went through to_lower, and the query bits selected by
// upper_mask must be a subset of the candidate bits selected by it
// (presumably the upper case letters, which smart-case matches exactly;
// confirm against the definition of upper_mask).
RankedMatch::RankedMatch(StringView candidate, UsedLetters candidate_letters,
                         StringView query, UsedLetters query_letters)
    : RankedMatch{candidate, query, [&] {
        if (not matches(to_lower(query_letters), to_lower(candidate_letters)))
            return false;
        return matches(query_letters & upper_mask, candidate_letters & upper_mask);
    }}
{
}


// Build a match from the candidate and query alone: the pre-filter
// predicate always accepts, so the subsequence search always runs.
RankedMatch::RankedMatch(StringView candidate, StringView query)
    : RankedMatch{candidate, query, [] { return true; }} {}

// Ordering used to sort matches: returns true when this match ranks
// before other. Both operands have to be valid matches (asserted).
//
// Criteria, in decreasing priority:
//   1. flags: among the flags that differ, the side whose flags have the
//      greater integer value comes first;
//   2. a greater word boundary match count comes first;
//   3. a smaller match index sum comes first;
//   4. code point wise comparison of the candidates, where at the first
//      difference a lower case code point comes before a non lower case
//      one, and code points of the same kind are ordered by value;
//   5. when one candidate is a prefix of the other, the shorter one comes
//      first; identical candidates are not ordered (a < a is false).
bool RankedMatch::operator<(const RankedMatch& other) const
{
    kak_assert((bool)*this and (bool)other);

    const auto diff = m_flags ^ other.m_flags;
    // flags are different, use their ordering to return the first match
    if (diff != Flags::None)
        return (int)(m_flags & diff) > (int)(other.m_flags & diff);

    if (m_word_boundary_match_count != other.m_word_boundary_match_count)
        return m_word_boundary_match_count > other.m_word_boundary_match_count;

    if (m_match_index_sum != other.m_match_index_sum)
        return m_match_index_sum < other.m_match_index_sum;

    Utf8It it1{m_candidate.begin(), m_candidate}, it2{other.m_candidate.begin(), other.m_candidate};
    for (; it1 != m_candidate.end() and it2 != other.m_candidate.end(); ++it1, ++it2)
    {
        const auto cp1 = *it1, cp2 = *it2;
        if (cp1 != cp2)
        {
            // iswlower rather than islower: these are full code points, and
            // islower is only defined for unsigned char values and EOF.
            const bool low1 = iswlower((wchar_t)cp1), low2 = iswlower((wchar_t)cp2);
            return low1 == low2 ? cp1 < cp2 : low1;
        }
    }

    // one candidate is a prefix of the other (or they are equal)
    return it1 == m_candidate.end() and it2 != other.m_candidate.end();
}

// Checks for the word boundary counting and for the ordering of matches.
UnitTest test_ranked_match{[] {
    // snake_case: r, a and t each start a word
    kak_assert(count_word_boundaries_match("run_all_tests", "rat") == 3);
    // the query does not have to match the first word
    kak_assert(count_word_boundaries_match("run_all_tests", "at") == 2);
    // camelCase: lower case query letters match upper case word starts
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "wm") == 2);
    // 'o' starts no word, so only c, b and m are counted
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cobm") == 3);
    // upper case query letters match the upper case word starts
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cWBM") == 4);
    // when one candidate is a prefix of the other, the shorter one sorts first
    kak_assert(RankedMatch{"source", "so"} < RankedMatch{"source_data", "so"});
    kak_assert(not (RankedMatch{"source_data", "so"} < RankedMatch{"source", "so"}));
    // a match never sorts before itself
    kak_assert(not (RankedMatch{"source", "so"} < RankedMatch{"source", "so"}));
}};

// Checks that to_lower applied to a letter set maps the upper case letters
// onto their lower case counterparts: "abcdABCD" lower-cased has to give
// the same set as "abcd".
UnitTest test_used_letters{[]()
{
    kak_assert(used_letters("abcd") == to_lower(used_letters("abcdABCD")));
}};

}
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`#include "ranked_match.hh"`

Make word insert completion work better with unicode char 2015-10-30 14:57:46 +01:00			`#include "utf8_iterator.hh"`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`#include "unit_tests.hh"`

Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`namespace Kakoune`
			`{`

Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`UsedLetters used_letters(StringView str)`
			`{`
			`UsedLetters res = 0;`
			`for (auto c : str)`
			`{`
			`if (c >= 'a' and c <= 'z')`
			`res \|= 1uL << (c - 'a');`
			`else if (c >= 'A' and c <= 'Z')`
			`res \|= 1uL << (c - 'A' + 26);`
			`else if (c == '_')`
			`res \|= 1uL << 53;`
			`else if (c == '-')`
			`res \|= 1uL << 54;`
			`else`
			`res \|= 1uL << 63;`
			`}`
			`return res;`
			`}`

			`bool matches(UsedLetters query, UsedLetters letters)`
			`{`
			`return (query & letters) == query;`
			`}`

Make word insert completion work better with unicode char 2015-10-30 14:57:46 +01:00			`using Utf8It = utf8::iterator<const char*>;`

Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`static int count_word_boundaries_match(StringView candidate, StringView query)`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`{`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`int count = 0;`
Fix count_word_boundaries_match 2016-02-18 00:05:08 +01:00			`Utf8It query_it{query.begin(), query};`
Make word insert completion work better with unicode char 2015-10-30 14:57:46 +01:00			`Codepoint prev = 0;`
			`for (Utf8It it{candidate.begin(), candidate}; it != candidate.end(); ++it)`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`{`
Make word insert completion work better with unicode char 2015-10-30 14:57:46 +01:00			`const Codepoint c = *it;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`const bool is_word_boundary = prev == 0 or`
Go back to libc locale and use c_regex_traits Unfortunately, cygwin does not support c++ locales. 2016-05-19 22:45:23 +02:00			`(!iswalnum((wchar_t)prev) and iswalnum((wchar_t)c)) or`
			`(iswlower((wchar_t)prev) and iswupper((wchar_t)c));`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`prev = c;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`if (not is_word_boundary)`
			`continue;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00			`const Codepoint lc = to_lower(c);`
Fix count_word_boundaries_match 2016-02-18 00:05:08 +01:00			`for (auto qit = query_it; qit != query.end(); ++qit)`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`{`
Make word insert completion work better with unicode char 2015-10-30 14:57:46 +01:00			`const Codepoint qc = *qit;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`if (qc == (islower(qc) ? lc : c))`
			`{`
			`++count;`
Fix count_word_boundaries_match 2016-02-18 00:05:08 +01:00			`query_it = qit+1;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`break;`
			`}`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`}`
Fix count_word_boundaries_match 2016-02-18 00:05:08 +01:00			`if (query_it == query.end())`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`break;`
			`}`
			`return count;`
			`}`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00
Make word insert completion work better with unicode char 2015-10-30 14:57:46 +01:00			`static bool smartcase_eq(Codepoint query, Codepoint candidate)`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`{`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00			`return query == (islower(query) ? to_lower(candidate) : candidate);`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`}`

Use flags and bit operations instead of bools in RankedMatch full match is now the most important flag for comparison. 2016-08-30 00:56:22 +02:00			`static bool subsequence_match_smart_case(StringView str, StringView subseq, int& out_index_sum)`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`{`
Use flags and bit operations instead of bools in RankedMatch full match is now the most important flag for comparison. 2016-08-30 00:56:22 +02:00			`int index_sum = 0;`
Tweak implementation of subsequence_match_smart_case Remove use of utf8 iterators and use directly utf8 functions 2016-03-25 00:45:56 +01:00			`auto it = str.begin();`
Take subsequence matches index when sorting RankedMatch 2016-02-29 00:05:51 +01:00			`int index = 0;`
Tweak subsequence_match_smart_case 2016-03-28 16:18:15 +02:00			`for (auto subseq_it = subseq.begin(); subseq_it != subseq.end();)`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`{`
			`if (it == str.end())`
			`return false;`
Tweak subsequence_match_smart_case 2016-03-28 16:18:15 +02:00			`const Codepoint c = utf8::read_codepoint(subseq_it, subseq.end());`
Tweak implementation of subsequence_match_smart_case Remove use of utf8 iterators and use directly utf8 functions 2016-03-25 00:45:56 +01:00			`while (not smartcase_eq(c, utf8::read_codepoint(it, subseq.end())))`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`{`
Take subsequence matches index when sorting RankedMatch 2016-02-29 00:05:51 +01:00			`++index;`
Tweak implementation of subsequence_match_smart_case Remove use of utf8 iterators and use directly utf8 functions 2016-03-25 00:45:56 +01:00			`if (it == str.end())`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`return false;`
			`}`
Take subsequence matches index when sorting RankedMatch 2016-02-29 00:05:51 +01:00			`index_sum += index++;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`}`
Use flags and bit operations instead of bools in RankedMatch full match is now the most important flag for comparison. 2016-08-30 00:56:22 +02:00			`out_index_sum = index_sum;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`return true;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`}`

Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`template<typename TestFunc>`
			`RankedMatch::RankedMatch(StringView candidate, StringView query, TestFunc func)`
Move more logic into RankedMatch 2015-10-27 22:25:18 +01:00			`{`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`if (candidate.empty() or query.length() > candidate.length())`
			`return;`

			`if (query.empty())`
Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`m_candidate = candidate;`
			`else if (func() and subsequence_match_smart_case(candidate, query, m_match_index_sum))`
Move more logic into RankedMatch 2015-10-27 22:25:18 +01:00			`{`
			`m_candidate = candidate;`
Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00
Use flags and bit operations instead of bools in RankedMatch full match is now the most important flag for comparison. 2016-08-30 00:56:22 +02:00			`if (smartcase_eq(query[0], candidate[0]))`
			`m_flags \|= Flags::FirstCharMatch;`
			`if (std::equal(query.begin(), query.end(), candidate.begin()))`
			`{`
			`m_flags \|= Flags::Prefix;`
			`if (query.length() == candidate.length())`
			`m_flags \|= Flags::FullMatch;`
			`}`
Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`m_word_boundary_match_count = count_word_boundaries_match(candidate, query);`
Use flags and bit operations instead of bools in RankedMatch full match is now the most important flag for comparison. 2016-08-30 00:56:22 +02:00			`if (m_word_boundary_match_count == query.length())`
			`m_flags \|= Flags::OnlyWordBoundary;`
Move more logic into RankedMatch 2015-10-27 22:25:18 +01:00			`}`
Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`}`
Move more logic into RankedMatch 2015-10-27 22:25:18 +01:00
Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`RankedMatch::RankedMatch(StringView candidate, UsedLetters candidate_letters,`
			`StringView query, UsedLetters query_letters)`
			`: RankedMatch{candidate, query, [&] {`
			`return matches(to_lower(query_letters), to_lower(candidate_letters)) and`
			`matches(query_letters & upper_mask, candidate_letters & upper_mask);`
			`}} {}`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00

Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`RankedMatch::RankedMatch(StringView candidate, StringView query)`
			`: RankedMatch{candidate, query, [] { return true; }}`
			`{`
Move more logic into RankedMatch 2015-10-27 22:25:18 +01:00			`}`

			`bool RankedMatch::operator<(const RankedMatch& other) const`
			`{`
Fix uninitialized value in RankedMatch 2016-03-24 23:04:56 +01:00			`kak_assert((bool)*this and (bool)other);`

Use flags and bit operations instead of bools in RankedMatch full match is now the most important flag for comparison. 2016-08-30 00:56:22 +02:00			`const auto diff = m_flags ^ other.m_flags;`
			`// flags are different, use their ordering to return the first match`
			`if (diff != Flags::None)`
			`return (int)(m_flags & diff) > (int)(other.m_flags & diff);`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00
			`if (m_word_boundary_match_count != other.m_word_boundary_match_count)`
			`return m_word_boundary_match_count > other.m_word_boundary_match_count;`

Take subsequence matches index when sorting RankedMatch 2016-02-29 00:05:51 +01:00			`if (m_match_index_sum != other.m_match_index_sum)`
			`return m_match_index_sum < other.m_match_index_sum;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00
Fix RankedMatch::operator< with matching prefix candidates 2016-05-17 20:40:36 +02:00			`Utf8It it1{m_candidate.begin(), m_candidate}, it2{other.m_candidate.begin(), other.m_candidate};`
			`for (; it1 != m_candidate.end() and it2 != other.m_candidate.end(); ++it1, ++it2)`
Use manual lexicographic comparison in RankedMatch::operator< 2016-03-28 15:44:49 +02:00			`{`
			`const auto cp1 = it1, cp2 = it2;`
			`if (cp1 != cp2)`
			`{`
			`const bool low1 = islower(cp1), low2 = islower(cp2);`
Give priority to lower case in RankedMatch 2016-04-03 19:49:33 +02:00			`return low1 == low2 ? cp1 < cp2 : low1;`
Use manual lexicographic comparison in RankedMatch::operator< 2016-03-28 15:44:49 +02:00			`}`
			`}`

Fix RankedMatch ordering where 'a < a' was true Fixes #679 2016-05-20 19:42:01 +02:00			`return it1 == m_candidate.end() and it2 != other.m_candidate.end();`
Move more logic into RankedMatch 2015-10-27 22:25:18 +01:00			`}`

Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`UnitTest test_ranked_match{[] {`
			`kak_assert(count_word_boundaries_match("run_all_tests", "rat") == 3);`
Fix count_word_boundaries_match 2016-02-18 00:05:08 +01:00			`kak_assert(count_word_boundaries_match("run_all_tests", "at") == 2);`
			`kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "wm") == 2);`
			`kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cobm") == 3);`
			`kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cWBM") == 4);`
Fix RankedMatch::operator< with matching prefix candidates 2016-05-17 20:40:36 +02:00			`kak_assert(RankedMatch{"source", "so"} < RankedMatch{"source_data", "so"});`
			`kak_assert(not (RankedMatch{"source_data", "so"} < RankedMatch{"source", "so"}));`
Fix RankedMatch ordering where 'a < a' was true Fixes #679 2016-05-20 19:42:01 +02:00			`kak_assert(not (RankedMatch{"source", "so"} < RankedMatch{"source", "so"}));`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 14:36:30 +01:00			`}};`

Move UsedLetters with RankedMatch 2016-03-25 21:35:57 +01:00			`UnitTest test_used_letters{[]()`
			`{`
			`kak_assert(used_letters("abcd") == to_lower(used_letters("abcdABCD")));`
			`}};`

Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 20:49:08 +02:00			`}`