kakoune/src/unicode.hh

#ifndef unicode_hh_INCLUDED
#define unicode_hh_INCLUDED

#include <cwctype>
#include <cwchar>

#include "array_view.hh"
#include "ranges.hh"
#include "units.hh"

namespace Kakoune
{

// Type used by the functions of this header to hold one character value;
// an alias for char32_t.
using Codepoint = char32_t;

// Tells whether c is the end of line character. Only line feed ('\n')
// qualifies.
inline bool is_eol(Codepoint c) noexcept
{
    constexpr Codepoint line_feed = U'\n';
    return c == line_feed;
}

// Tells whether c is a blank that does not end a line.
inline bool is_horizontal_blank(Codepoint c) noexcept
{
    // Follows the WhiteSpace set of the ECMA regex specification, minus
    // vertical tab:
    // <https://262.ecma-international.org/11.0/#sec-white-space>
    // U+2028 and U+2029 are accepted here as well, although that
    // specification lists them among its line terminators.

    // U+2000 (en quad) through U+200A (hair space) are contiguous.
    if (c >= U'\u2000' and c <= U'\u200A')
        return true;

    switch (c)
    {
        case U'\t':
        case U'\f':
        case U' ':
        case U'\u00A0': // no-break space
        case U'\u1680': // ogham space mark
        case U'\u2028': // line separator
        case U'\u2029': // paragraph separator
        case U'\u202F': // narrow no-break space
        case U'\u205F': // medium mathematical space
        case U'\u3000': // ideographic space
        case U'\uFEFF': // zero width no-break space
            return true;
        default:
            return false;
    }
}

// Tells whether c is a blank of any kind: either a line terminator
// or a horizontal blank.
inline bool is_blank(Codepoint c) noexcept
{
    // The explicit cases are the Line Terminators of the ECMA regex
    // specification, plus vertical tab:
    // <https://262.ecma-international.org/11.0/#sec-line-terminators>
    switch (c)
    {
        case U'\n':
        case U'\r':
        case U'\v':
        case U'\u2028':
        case U'\u2029':
            return true;
        default:
            // Everything else is blank only if it is a horizontal blank.
            return is_horizontal_blank(c);
    }
}

// Selects which word definition the is_word and categorize templates use:
//  Word: is_word accepts alphanumeric characters and the extra word chars
//  WORD: is_word<WORD> accepts every character that is not blank
enum WordType { Word, WORD };

// Tells whether c is a word character: either alphanumeric according to
// iswalnum, or one of the code points listed in extra_word_chars (only
// '_' when the argument is omitted).
template<WordType word_type = Word>
inline bool is_word(Codepoint c, ConstArrayView<Codepoint> extra_word_chars = {'_'}) noexcept
{
    const bool alphanumeric = iswalnum(static_cast<wchar_t>(c)) != 0;
    // extra_word_chars is only searched when c is not alphanumeric
    return alphanumeric or contains(extra_word_chars, c);
}

// WORD flavour of is_word: every character that is not blank is part of
// a WORD, the extra word chars argument is ignored.
template<>
inline bool is_word<WORD>(Codepoint c, ConstArrayView<Codepoint>) noexcept
{
    if (is_blank(c))
        return false;
    return true;
}

// Tells whether c is punctuation, that is neither a word character
// (given extra_word_chars) nor a blank.
inline bool is_punctuation(Codepoint c, ConstArrayView<Codepoint> extra_word_chars = {'_'}) noexcept
{
    return not is_word(c, extra_word_chars) and not is_blank(c);
}

// Tells whether c is one of the unaccented letters a-z or A-Z.
inline bool is_basic_alpha(Codepoint c) noexcept
{
    const bool lowercase = c >= U'a' and c <= U'z';
    const bool uppercase = c >= U'A' and c <= U'Z';
    return lowercase or uppercase;
}

// Tells whether c is one of the decimal digits 0-9.
inline bool is_basic_digit(Codepoint c) noexcept
{
    return not (c < U'0' or c > U'9');
}

// Tells whether c may appear in an identifier, that is whether it
// belongs to [a-zA-Z0-9_-].
inline bool is_identifier(Codepoint c) noexcept
{
    if (is_basic_alpha(c) or is_basic_digit(c))
        return true;
    return c == U'_' or c == U'-';
}

// Returns the number of columns c takes, as reported by wcwidth.
// A line feed, and any code point for which wcwidth reports a negative
// width, counts as one column.
inline ColumnCount codepoint_width(Codepoint c) noexcept
{
    if (c == U'\n')
        return 1;

    const int reported = wcwidth(static_cast<wchar_t>(c));
    if (reported < 0)
        return 1;
    return reported;
}

// Categories a code point can be sorted into by categorize().
enum class CharCategories
{
    Blank,       // horizontal blank, see is_horizontal_blank
    EndOfLine,   // end of line, see is_eol
    Word,        // word character under the selected WordType
    Punctuation, // anything that fits none of the above
};

// Sorts c into one of the CharCategories. End of line and horizontal
// blanks are recognised first; with word_type WORD every remaining code
// point is a Word, otherwise is_word (with extra_word_chars) separates
// Word from Punctuation.
template<WordType word_type = Word>
inline CharCategories categorize(Codepoint c, ConstArrayView<Codepoint> extra_word_chars) noexcept
{
    if (is_eol(c))
        return CharCategories::EndOfLine;
    if (is_horizontal_blank(c))
        return CharCategories::Blank;

    // is_word is not consulted at all when word_type is WORD
    const bool counts_as_word = word_type == WORD or is_word(c, extra_word_chars);
    return counts_as_word ? CharCategories::Word : CharCategories::Punctuation;
}

// Case conversion of a code point, delegated to towlower / towupper.
inline Codepoint to_lower(Codepoint cp) noexcept
{
    return towlower(static_cast<wchar_t>(cp));
}
inline Codepoint to_upper(Codepoint cp) noexcept
{
    return towupper(static_cast<wchar_t>(cp));
}

// Case tests on a code point, delegated to iswlower / iswupper.
inline bool is_lower(Codepoint cp) noexcept
{
    return iswlower(static_cast<wchar_t>(cp)) != 0;
}
inline bool is_upper(Codepoint cp) noexcept
{
    return iswupper(static_cast<wchar_t>(cp)) != 0;
}

// Case conversion of a plain char: only the letters A-Z / a-z are
// converted, any other value is returned unchanged.
inline char to_lower(char c) noexcept
{
    if (c < 'A' or c > 'Z')
        return c;
    return static_cast<char>(c - 'A' + 'a');
}
inline char to_upper(char c) noexcept
{
    if (c < 'a' or c > 'z')
        return c;
    return static_cast<char>(c - 'a' + 'A');
}

// Case tests on a plain char: only the letters a-z count as lower case
// and only A-Z as upper case.
inline bool is_lower(char c) noexcept
{
    return not (c < 'a' or c > 'z');
}
inline bool is_upper(char c) noexcept
{
    return not (c < 'A' or c > 'Z');
}

}

#endif // unicode_hh_INCLUDED
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`#ifndef unicode_hh_INCLUDED`
			`#define unicode_hh_INCLUDED`

Apply clang-tidy modernize to the codebase 2017-01-08 23:30:15 +01:00			`#include <cwctype>`
			`#include <cwchar>`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`#include "array_view.hh"`
Rename containers.hh to ranges.hh (and Container to Range) 2017-08-29 10:23:03 +02:00			`#include "ranges.hh"`
			`#include "units.hh"`
Cleanup include dependencies a bit 2016-11-29 00:53:50 +01:00
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`namespace Kakoune`
			`{`

Use char32_t for Codepoint 2015-04-29 14:51:15 +02:00			`using Codepoint = char32_t;`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00
Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_eol(Codepoint c) noexcept`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`{`
			`return c == '\n';`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_horizontal_blank(Codepoint c) noexcept`
move is_horizontal_blank to unicode.hh 2013-11-17 23:54:26 +01:00			`{`
Follow ECMA specification for regex whitespace Changes the behaviour of the \s and \h character classes to include all WhiteSpace and LineTerminator characters defined in the ECMA specification. - <https://262.ecma-international.org/11.0/#sec-white-space> - <https://262.ecma-international.org/11.0/#sec-line-terminators> - <https://262.ecma-international.org/11.0/#sec-characterclassescape> Fixes #4034 2021-02-03 06:04:05 +01:00			`// Characters considered whitespace by ECMA Regex Spec`
			`// minus vertical tab`
			`// <https://262.ecma-international.org/11.0/#sec-white-space>`
			`return c == '\t' or`
			`c == '\f' or`
			`c == ' ' or`
			`c == U'\u00A0' or`
			`c == U'\uFEFF' or`
			`c == U'\u1680' or`
			`c == U'\u2000' or`
			`c == U'\u2001' or`
			`c == U'\u2002' or`
			`c == U'\u2003' or`
			`c == U'\u2004' or`
			`c == U'\u2005' or`
			`c == U'\u2006' or`
			`c == U'\u2007' or`
			`c == U'\u2008' or`
			`c == U'\u2009' or`
			`c == U'\u200A' or`
			`c == U'\u2028' or`
			`c == U'\u2029' or`
			`c == U'\u202F' or`
			`c == U'\u205F' or`
			`c == U'\u3000' ;`
move is_horizontal_blank to unicode.hh 2013-11-17 23:54:26 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_blank(Codepoint c) noexcept`
Refactor select_arguments and slightly change behaviour for non-inner non inner argument contains the argument, preceeding whitespaces, and eventual ending comma, except for first arguments (that contains the whitespaces after the comma), and last argument (that contains the comma before it). 2015-07-02 00:47:22 +02:00			`{`
Follow ECMA specification for regex whitespace Changes the behaviour of the \s and \h character classes to include all WhiteSpace and LineTerminator characters defined in the ECMA specification. - <https://262.ecma-international.org/11.0/#sec-white-space> - <https://262.ecma-international.org/11.0/#sec-line-terminators> - <https://262.ecma-international.org/11.0/#sec-characterclassescape> Fixes #4034 2021-02-03 06:04:05 +01:00			`// Characters considered Line Terminators by ECMA Regex Spec`
			`// plus vertical tab`
			`// <https://262.ecma-international.org/11.0/#sec-line-terminators>`
			`return c == '\n' or`
			`c == '\r' or`
			`c == '\v' or`
			`c == U'\u2028' or`
			`c == U'\u2029' or`
			`is_horizontal_blank(c) ;`
Refactor select_arguments and slightly change behaviour for non-inner non inner argument contains the argument, preceeding whitespaces, and eventual ending comma, except for first arguments (that contains the whitespaces after the comma), and last argument (that contains the comma before it). 2015-07-02 00:47:22 +02:00			`}`

Move template selectors to the header 2013-12-14 15:49:10 +01:00			`enum WordType { Word, WORD };`

			`template<WordType word_type = Word>`
Make '_' the default extra_word_chars, and remove built-in support Fixes #2599 2018-11-27 08:13:29 +01:00			`inline bool is_word(Codepoint c, ConstArrayView<Codepoint> extra_word_chars = {'_'}) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Make '_' the default extra_word_chars, and remove built-in support Fixes #2599 2018-11-27 08:13:29 +01:00			`return iswalnum((wchar_t)c) or contains(extra_word_chars, c);`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

			`template<>`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`inline bool is_word<WORD>(Codepoint c, ConstArrayView<Codepoint>) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Tweak some character categorization function implementations 2017-02-23 01:56:40 +01:00			`return not is_blank(c);`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Make '_' the default extra_word_chars, and remove built-in support Fixes #2599 2018-11-27 08:13:29 +01:00			`inline bool is_punctuation(Codepoint c, ConstArrayView<Codepoint> extra_word_chars = {'_'}) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
Make '_' the default extra_word_chars, and remove built-in support Fixes #2599 2018-11-27 08:13:29 +01:00			`return not (is_word(c, extra_word_chars) or is_blank(c));`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline bool is_basic_alpha(Codepoint c) noexcept`
Move is_basic_alpha to unicode.hh 2015-11-15 14:24:39 +01:00			`{`
			`return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');`
			`}`

Unify code that validates identifiers in Kakoune Session/Client/User modes names are now requiered to be "identifiers" they must be in [a-zA-Z0-9_-]. Option names are the same except they do not allow '-' as they need to be made available through the env vars and '-' is not supported there. Fixes #1946 2018-03-25 07:35:33 +02:00			`inline bool is_basic_digit(Codepoint c) noexcept`
			`{`
			`return c >= '0' and c <= '9';`
			`}`

			`inline bool is_identifier(Codepoint c) noexcept`
			`{`
			`return is_basic_alpha(c) or is_basic_digit(c) or`
			`c == '_' or c == '-';`
			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline ColumnCount codepoint_width(Codepoint c) noexcept`
Support codepoints of variable width Add a ColumnCount type and use it in place of CharCount whenever more appropriate, take column size of codepoints into account for vertical movements and docstring wrapping. Fixes #811 2016-09-22 21:36:26 +02:00			`{`
Treat non printable characters as zero-width instead of -1 width This fix a bug when opening a file where a line has a lot of unprintable chars (like a binary file) which was confusing Kakoune into considering that the line length in column was negative. 2017-07-07 03:57:32 +02:00			`if (c == '\n')`
			`return 1;`
			`const auto width = wcwidth((wchar_t)c);`
Refactor column highlighter to make it more robust Support arbitrary orders for column highlighters (it was previously failing when column highlighters were not applied in column order). Fix show_matching tab handling at the same time (horizontal scrolling, tab characters and show_matching were behaving badly). Window highlighting now runs user highlighters, then built-ins for each phases, instead of running all phases for user highlighters, then all phases for built-ins. We now consider unprintable character to be 1-column width as we know we will display them as "�". Fixes #1615 Fixes #1023 2017-10-12 08:38:19 +02:00			`return width >= 0 ? width : 1;`
Support codepoints of variable width Add a ColumnCount type and use it in place of CharCount whenever more appropriate, take column size of codepoints into account for vertical movements and docstring wrapping. Fixes #811 2016-09-22 21:36:26 +02:00			`}`

Move template selectors to the header 2013-12-14 15:49:10 +01:00			`enum class CharCategories`
			`{`
			`Blank,`
			`EndOfLine,`
			`Word,`
			`Punctuation,`
			`};`

			`template<WordType word_type = Word>`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`inline CharCategories categorize(Codepoint c, ConstArrayView<Codepoint> extra_word_chars) noexcept`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`{`
			`if (is_eol(c))`
			`return CharCategories::EndOfLine;`
Remove is_blank, which is identical to is_horizontal_blank 2015-04-15 01:34:00 +02:00			`if (is_horizontal_blank(c))`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`return CharCategories::Blank;`
Use the extra_word_chars option in word based normal commands the completion_extra_word_chars is now gone, superseeded by extra_word_chars that gets used both for completion and for normal mode. Fixes #1304 2017-06-26 16:28:41 +02:00			`if (word_type == WORD or is_word(c, extra_word_chars))`
Tweak categorize(Codepoint) implementation 2016-04-03 19:25:48 +02:00			`return CharCategories::Word;`
			`return CharCategories::Punctuation;`
Move template selectors to the header 2013-12-14 15:49:10 +01:00			`}`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline Codepoint to_lower(Codepoint cp) noexcept { return towlower((wchar_t)cp); }`
			`inline Codepoint to_upper(Codepoint cp) noexcept { return towupper((wchar_t)cp); }`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00
Add is_upper and is_lower helper unicode functions 2017-10-06 13:51:09 +02:00			`inline bool is_lower(Codepoint cp) noexcept { return iswlower((wchar_t)cp); }`
			`inline bool is_upper(Codepoint cp) noexcept { return iswupper((wchar_t)cp); }`

Add noexcept specifiers to unicode and utf8 functions 2017-04-23 13:47:26 +02:00			`inline char to_lower(char c) noexcept { return c >= 'A' and c <= 'Z' ? c - 'A' + 'a' : c; }`
			`inline char to_upper(char c) noexcept { return c >= 'a' and c <= 'z' ? c - 'a' + 'A' : c; }`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 01:21:20 +01:00
Add is_upper and is_lower helper unicode functions 2017-10-06 13:51:09 +02:00			`inline bool is_lower(char c) noexcept { return c >= 'a' and c <= 'z'; }`
			`inline bool is_upper(char c) noexcept { return c >= 'A' and c <= 'Z'; }`

add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`}`

			`#endif // unicode_hh_INCLUDED`