kakoune/src/unicode.hh

#ifndef unicode_hh_INCLUDED
#define unicode_hh_INCLUDED

#include <cwctype>
#include <cwchar>
#include <locale>

#include "units.hh"

namespace Kakoune
{

using Codepoint = char32_t;

// Tell whether c is the end-of-line codepoint ('\n').
inline bool is_eol(Codepoint c) noexcept
{
    const bool newline = (c == U'\n');
    return newline;
}

// Tell whether c is a blank that stays on the current line,
// i.e. a space or a tabulation.
inline bool is_horizontal_blank(Codepoint c) noexcept
{
    switch (c)
    {
        case U' ':
        case U'\t':
            return true;
        default:
            return false;
    }
}

// Tell whether c is any blank: space, tabulation or end of line.
inline bool is_blank(Codepoint c) noexcept
{
    if (c == U'\n')
        return true;
    return c == U' ' or c == U'\t';
}

// Selects which definition of a "word" character is_word and categorize use.
enum WordType
{
    Word, // '_' and alphanumeric codepoints are word characters
    WORD  // every non-blank codepoint is a word character
};

// Tell whether c is a word character: an underscore, or a codepoint that
// iswalnum reports as alphanumeric (which depends on the current C locale).
template<WordType word_type = Word>
inline bool is_word(Codepoint c) noexcept
{
    if (c == U'_')
        return true;
    return iswalnum(static_cast<wchar_t>(c)) != 0;
}

// WORD flavour: every codepoint that is not a blank counts as a word character.
template<>
inline bool is_word<WORD>(Codepoint c) noexcept
{
    const bool blank = is_blank(c);
    return not blank;
}

// Tell whether c is punctuation, meaning it is neither a word character
// (in the default Word sense) nor a blank.
inline bool is_punctuation(Codepoint c) noexcept
{
    return not is_word(c) and not is_blank(c);
}

// Tell whether c is an unaccented latin letter, 'a'..'z' or 'A'..'Z'.
inline bool is_basic_alpha(Codepoint c) noexcept
{
    const bool lower_case = c >= U'a' and c <= U'z';
    const bool upper_case = c >= U'A' and c <= U'Z';
    return lower_case or upper_case;
}

// Return the number of columns used to display c.
//
// '\n' is always reported as 1 column wide. For every other codepoint the
// width comes from wcwidth, which depends on the current C locale.
inline ColumnCount codepoint_width(Codepoint c) noexcept
{
    if (c == '\n')
        return 1;

    // wcwidth returns -1 for codepoints that are not printable in the current
    // locale; propagating that would yield a negative column count, so such
    // codepoints are treated as occupying a single column instead.
    const int width = wcwidth((wchar_t)c);
    return width >= 0 ? width : 1;
}

// Classes of codepoints, as returned by categorize().
enum class CharCategories
{
    Blank,       // space or tabulation
    EndOfLine,   // '\n'
    Word,        // word character, according to the selected WordType
    Punctuation  // anything else
};

// Classify c into one of the CharCategories.
//
// End of line is tested first, then horizontal blanks. With word_type == WORD
// every remaining codepoint is a Word; otherwise is_word decides between
// Word and Punctuation.
template<WordType word_type = Word>
inline CharCategories categorize(Codepoint c) noexcept
{
    if (is_eol(c))
        return CharCategories::EndOfLine;

    if (is_horizontal_blank(c))
        return CharCategories::Blank;

    const bool word_char = (word_type == WORD) or is_word(c);
    return word_char ? CharCategories::Word : CharCategories::Punctuation;
}

// Lower-case cp using towlower, so the mapping follows the current C locale.
inline Codepoint to_lower(Codepoint cp) noexcept
{
    const auto wide = static_cast<wchar_t>(cp);
    return static_cast<Codepoint>(towlower(wide));
}
// Upper-case cp using towupper, so the mapping follows the current C locale.
inline Codepoint to_upper(Codepoint cp) noexcept
{
    const auto wide = static_cast<wchar_t>(cp);
    return static_cast<Codepoint>(towupper(wide));
}

// Locale-independent lower-casing: only 'A'..'Z' are changed,
// every other char is returned untouched.
inline char to_lower(char c) noexcept
{
    if (c < 'A' or c > 'Z')
        return c;
    return static_cast<char>(c - 'A' + 'a');
}
// Locale-independent upper-casing: only 'a'..'z' are changed,
// every other char is returned untouched.
inline char to_upper(char c) noexcept
{
    if (c < 'a' or c > 'z')
        return c;
    return static_cast<char>(c - 'a' + 'A');
}

}

#endif // unicode_hh_INCLUDED
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`#ifndef unicode_hh_INCLUDED`
			`#define unicode_hh_INCLUDED`

Apply clang-tidy modernize to the codebase 2017-01-08 23:30:15 +01:00			`#include <cwctype>`
			`#include <cwchar>`
Use C++ locale based functions instead of the libc ones 2016-05-11 10:49:45 +02:00			`#include <locale>`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00
Cleanup include dependencies a bit 2016-11-29 00:53:50 +01:00			`#include "units.hh"`

add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`namespace Kakoune`
			`{`

Use char32_t for Codepoint 2015-04-29 14:51:15 +02:00			`using Codepoint = char32_t;`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00
Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_eol(Codepoint c) noexcept`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`{`
			`return c == '\n';`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_horizontal_blank(Codepoint c) noexcept`
move is_horizontal_blank to unicode.hh 2013-11-17 23:54:26 +01:00			`{`
			`return c == ' ' or c == '\t';`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_blank(Codepoint c) noexcept`
Refactor select_arguments and slightly change behaviour for non-inner: a non-inner argument contains the argument, preceding whitespaces, and eventual ending comma, except for first arguments (that contain the whitespaces after the comma), and last argument (that contains the comma before it). 2015-07-02 00:47:22 +02:00			`{`
			`return c == ' ' or c == '\t' or c == '\n';`
			`}`

Move template selectors to the header 2013-12-14 15:49:10 +01:00			`enum WordType { Word, WORD };`

			`template<WordType word_type = Word>`
Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_word(Codepoint c) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Go back to libc locale and use c_regex_traits Unfortunately, cygwin does not support c++ locales. 2016-05-19 22:45:23 +02:00			`return c == '_' or iswalnum((wchar_t)c);`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

			`template<>`
Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_word<WORD>(Codepoint c) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Tweak some character categorization function implementations 2017-02-23 01:56:40 +01:00			`return not is_blank(c);`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_punctuation(Codepoint c) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Tweak some character categorization function implementations 2017-02-23 01:56:40 +01:00			`return not (is_word(c) or is_blank(c));`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_basic_alpha(Codepoint c) noexcept`
Move is_basic_alpha to unicode.hh 2015-11-15 14:24:39 +01:00			`{`
			`return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline ColumnCount codepoint_width(Codepoint c) noexcept`
Support codepoints of variable width Add a ColumnCount type and use it in place of CharCount whenever more appropriate, take column size of codepoints into account for vertical movements and docstring wrapping. Fixes #811 2016-09-22 21:36:26 +02:00			`{`
Treat '\n' as 1 column wide Fixes #842 2016-10-04 20:37:43 +02:00			`return c == '\n' ? 1 : wcwidth((wchar_t)c);`
Support codepoints of variable width Add a ColumnCount type and use it in place of CharCount whenever more appropriate, take column size of codepoints into account for vertical movements and docstring wrapping. Fixes #811 2016-09-22 21:36:26 +02:00			`}`

Move template selectors to the header 2013-12-14 15:49:10 +01:00			`enum class CharCategories`
			`{`
			`Blank,`
			`EndOfLine,`
			`Word,`
			`Punctuation,`
			`};`

			`template<WordType word_type = Word>`
Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline CharCategories categorize(Codepoint c) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
			`if (is_eol(c))`
			`return CharCategories::EndOfLine;`
Remove is_blank, which is identical to is_horizontal_blank 2015-04-15 01:34:00 +02:00			`if (is_horizontal_blank(c))`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`return CharCategories::Blank;`
Tweak categorize(Codepoint) implementation 2016-04-03 19:25:48 +02:00			`if (word_type == WORD or is_word(c))`
			`return CharCategories::Word;`
			`return CharCategories::Punctuation;`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline Codepoint to_lower(Codepoint cp) noexcept { return towlower((wchar_t)cp); }`
			`inline Codepoint to_upper(Codepoint cp) noexcept { return towupper((wchar_t)cp); }`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00
Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline char to_lower(char c) noexcept { return c >= 'A' and c <= 'Z' ? c - 'A' + 'a' : c; }`
			`inline char to_upper(char c) noexcept { return c >= 'a' and c <= 'z' ? c - 'a' + 'A' : c; }`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`}`

			`#endif // unicode_hh_INCLUDED`