kakoune/src/unicode.hh

#ifndef unicode_hh_INCLUDED
#define unicode_hh_INCLUDED

#include <cwctype>
#include <cwchar>
#include <locale>

#include "array_view.hh"
#include "ranges.hh"
#include "units.hh"

namespace Kakoune
{

// Type used by the functions of this header to carry a single code point.
using Codepoint = char32_t;

// Tells whether c is the line feed character ('\n').
inline bool is_eol(Codepoint c) noexcept
{
    constexpr Codepoint line_feed = '\n';
    return c == line_feed;
}

// Tells whether c is a space or a horizontal tab. The line feed is
// deliberately not part of this set.
inline bool is_horizontal_blank(Codepoint c) noexcept
{
    switch (c)
    {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}

// Tells whether c is a space, a horizontal tab or a line feed.
inline bool is_blank(Codepoint c) noexcept
{
    if (c == '\n')
        return true;
    return c == ' ' or c == '\t';
}

// Word classification mode, used as the template argument of is_word and
// categorize:
//  - Word: '_', alphanumerics (as reported by iswalnum) and any extra word
//    characters supplied by the caller count as word characters.
//  - WORD: every code point that is not a space, tab or line feed counts as
//    a word character.
enum WordType { Word, WORD };

// Tells whether c is a word character: an underscore, a code point that
// iswalnum reports as alphanumeric, or one of the caller supplied
// extra_word_chars (none by default).
template<WordType word_type = Word>
inline bool is_word(Codepoint c, ConstArrayView<Codepoint> extra_word_chars = {}) noexcept
{
    if (c == '_' or iswalnum(static_cast<wchar_t>(c)))
        return true;
    return contains(extra_word_chars, c);
}

// WORD flavour of is_word: any code point that is not blank (space, tab or
// line feed) is a word character. The extra word characters are ignored.
template<>
inline bool is_word<WORD>(Codepoint c, ConstArrayView<Codepoint>) noexcept
{
    const bool blank = is_blank(c);
    return not blank;
}

// Tells whether c is punctuation, that is neither a word character (default
// Word rules, without extra word characters) nor a blank.
inline bool is_punctuation(Codepoint c) noexcept
{
    return not is_word(c) and not is_blank(c);
}

// Tells whether c is one of the unaccented letters 'a'..'z' or 'A'..'Z'.
inline bool is_basic_alpha(Codepoint c) noexcept
{
    const bool lowercase = 'a' <= c and c <= 'z';
    const bool uppercase = 'A' <= c and c <= 'Z';
    return lowercase or uppercase;
}

// Returns the number of columns c is counted as. A line feed counts as one
// column; any code point for which wcwidth does not report a positive
// width counts as zero columns, so the result is never negative.
inline ColumnCount codepoint_width(Codepoint c) noexcept
{
    if (c == '\n')
        return 1;

    const int columns = wcwidth(static_cast<wchar_t>(c));
    if (columns <= 0)
        return 0;
    return columns;
}

// Category assigned to a code point by categorize().
enum class CharCategories
{
    Blank,       // space or horizontal tab
    EndOfLine,   // line feed
    Word,        // word character under the selected WordType
    Punctuation, // anything that fits none of the categories above
};

// Classifies c. The checks are ordered: a line feed is EndOfLine, a space
// or tab is Blank, then in WORD mode everything left is Word, while in Word
// mode is_word (with extra_word_chars) decides between Word and Punctuation.
template<WordType word_type = Word>
inline CharCategories categorize(Codepoint c, ConstArrayView<Codepoint> extra_word_chars) noexcept
{
    CharCategories category = CharCategories::Punctuation;
    if (is_eol(c))
        category = CharCategories::EndOfLine;
    else if (is_horizontal_blank(c))
        category = CharCategories::Blank;
    else if (word_type == WORD or is_word(c, extra_word_chars))
        category = CharCategories::Word;
    return category;
}

// Lower-case mapping of cp as computed by towlower.
inline Codepoint to_lower(Codepoint cp) noexcept
{
    return towlower(static_cast<wchar_t>(cp));
}
// Upper-case mapping of cp as computed by towupper.
inline Codepoint to_upper(Codepoint cp) noexcept
{
    return towupper(static_cast<wchar_t>(cp));
}

// Tells whether iswlower reports cp as a lower-case letter.
inline bool is_lower(Codepoint cp) noexcept
{
    return iswlower(static_cast<wchar_t>(cp)) != 0;
}
// Tells whether iswupper reports cp as an upper-case letter.
inline bool is_upper(Codepoint cp) noexcept
{
    return iswupper(static_cast<wchar_t>(cp)) != 0;
}

// Maps 'A'..'Z' to 'a'..'z'; every other char is returned unchanged.
inline char to_lower(char c) noexcept
{
    if (c < 'A' or c > 'Z')
        return c;
    return static_cast<char>(c - 'A' + 'a');
}
// Maps 'a'..'z' to 'A'..'Z'; every other char is returned unchanged.
inline char to_upper(char c) noexcept
{
    if (c < 'a' or c > 'z')
        return c;
    return static_cast<char>(c - 'a' + 'A');
}

// Tells whether c lies in the range 'a'..'z'.
inline bool is_lower(char c) noexcept
{
    return not (c < 'a' or c > 'z');
}
// Tells whether c lies in the range 'A'..'Z'.
inline bool is_upper(char c) noexcept
{
    return not (c < 'A' or c > 'Z');
}

}

#endif // unicode_hh_INCLUDED
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`#ifndef unicode_hh_INCLUDED`
			`#define unicode_hh_INCLUDED`

Apply clang-tidy modernize to the codebase 2017-01-08 23:30:15 +01:00			`#include <cwctype>`
			`#include <cwchar>`
Use C++ locale based functions instead of the libc ones 2016-05-11 10:49:45 +02:00			`#include <locale>`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`#include "array_view.hh"`
Rename containers.hh to ranges.hh (and Container to Range) 2017-08-29 10:23:03 +02:00			`#include "ranges.hh"`
			`#include "units.hh"`
Cleanup include dependencies a bit 2016-11-29 00:53:50 +01:00
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`namespace Kakoune`
			`{`

Use char32_t for Codepoint 2015-04-29 14:51:15 +02:00			`using Codepoint = char32_t;`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00
Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_eol(Codepoint c) noexcept`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`{`
			`return c == '\n';`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_horizontal_blank(Codepoint c) noexcept`
move is_horizontal_blank to unicode.hh 2013-11-17 23:54:26 +01:00			`{`
			`return c == ' ' or c == '\t';`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_blank(Codepoint c) noexcept`
Refactor select_arguments and slightly change behaviour for non-inner non inner argument contains the argument, preceeding whitespaces, and eventual ending comma, except for first arguments (that contains the whitespaces after the comma), and last argument (that contains the comma before it). 2015-07-02 00:47:22 +02:00			`{`
			`return c == ' ' or c == '\t' or c == '\n';`
			`}`

Move template selectors to the header 2013-12-14 15:49:10 +01:00			`enum WordType { Word, WORD };`

			`template<WordType word_type = Word>`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`inline bool is_word(Codepoint c, ConstArrayView<Codepoint> extra_word_chars = {}) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`return c == '_' or iswalnum((wchar_t)c) or contains(extra_word_chars, c);`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

			`template<>`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`inline bool is_word<WORD>(Codepoint c, ConstArrayView<Codepoint>) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Tweak some character categorization function implementations 2017-02-23 01:56:40 +01:00			`return not is_blank(c);`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_punctuation(Codepoint c) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Tweak some character categorization function implementations 2017-02-23 01:56:40 +01:00			`return not (is_word(c) or is_blank(c));`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_basic_alpha(Codepoint c) noexcept`
Move is_basic_alpha to unicode.hh 2015-11-15 14:24:39 +01:00			`{`
			`return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline ColumnCount codepoint_width(Codepoint c) noexcept`
Support codepoints of variable width Add a ColumnCount type and use it in place of CharCount whenever more appropriate, take column size of codepoints into account for vertical movements and docstring wrapping. Fixes #811 2016-09-22 21:36:26 +02:00			`{`
Treat non printable characters as zero-width instead of -1 width This fix a bug when opening a file where a line has a lot of unprintable chars (like a binary file) which was confusing Kakoune into considering that the line length in column was negative. 2017-07-07 03:57:32 +02:00			`if (c == '\n')`
			`return 1;`
			`const auto width = wcwidth((wchar_t)c);`
			`return width > 0 ? width : 0;`
Support codepoints of variable width Add a ColumnCount type and use it in place of CharCount whenever more appropriate, take column size of codepoints into account for vertical movements and docstring wrapping. Fixes #811 2016-09-22 21:36:26 +02:00			`}`

Move template selectors to the header 2013-12-14 15:49:10 +01:00			`enum class CharCategories`
			`{`
			`Blank,`
			`EndOfLine,`
			`Word,`
			`Punctuation,`
			`};`

			`template<WordType word_type = Word>`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`inline CharCategories categorize(Codepoint c, ConstArrayView<Codepoint> extra_word_chars) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
			`if (is_eol(c))`
			`return CharCategories::EndOfLine;`
Remove is_blank, which is identical to is_horizontal_blank 2015-04-15 01:34:00 +02:00			`if (is_horizontal_blank(c))`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`return CharCategories::Blank;`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`if (word_type == WORD or is_word(c, extra_word_chars))`
Tweak categorize(Codepoint) implementation 2016-04-03 19:25:48 +02:00			`return CharCategories::Word;`
			`return CharCategories::Punctuation;`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline Codepoint to_lower(Codepoint cp) noexcept { return towlower((wchar_t)cp); }`
			`inline Codepoint to_upper(Codepoint cp) noexcept { return towupper((wchar_t)cp); }`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00
Add is_upper and is_lower helper unicode functions 2017-10-06 13:51:09 +02:00			`inline bool is_lower(Codepoint cp) noexcept { return iswlower((wchar_t)cp); }`
			`inline bool is_upper(Codepoint cp) noexcept { return iswupper((wchar_t)cp); }`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline char to_lower(char c) noexcept { return c >= 'A' and c <= 'Z' ? c - 'A' + 'a' : c; }`
			`inline char to_upper(char c) noexcept { return c >= 'a' and c <= 'z' ? c - 'a' + 'A' : c; }`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00
Add is_upper and is_lower helper unicode functions 2017-10-06 13:51:09 +02:00			`inline bool is_lower(char c) noexcept { return c >= 'a' and c <= 'z'; }`
			`inline bool is_upper(char c) noexcept { return c >= 'A' and c <= 'Z'; }`

add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`}`

			`#endif // unicode_hh_INCLUDED`