home/src/utf8.hh

#ifndef utf8_hh_INCLUDED
#define utf8_hh_INCLUDED

#include "assert.hh"
#include "unicode.hh"
#include "units.hh"

#include <cstddef>

namespace Kakoune
{

namespace utf8
{

// returns an iterator to the first byte of the character following
// the one it points into: steps over one byte and, if that byte had
// its high bit set, over every continuation byte (10xxxxxx) after it
template<typename Iterator>
Iterator next(Iterator it)
{
    const bool multibyte = (*it & 0x80) != 0;
    ++it;
    if (not multibyte)
        return it;

    for (; (*it & 0xC0) == 0x80; ++it)
        ;
    return it;
}

// returns its parameter if it already points to something else than a
// continuation byte (10xxxxxx), or else the first following position
// that does
template<typename Iterator>
Iterator finish(Iterator it)
{
    for (;;)
    {
        const bool continuation = (*it & 0xC0) == 0x80;
        if (not continuation)
            return it;
        ++it;
    }
}

// returns an iterator to the first byte of the character preceding it:
// steps back at least one byte, then keeps stepping back for as long as
// a continuation byte (10xxxxxx) is found
template<typename Iterator>
Iterator previous(Iterator it)
{
    do
    {
        --it;
    }
    while ((*it & 0xC0) == 0x80);
    return it;
}

// moves it by d characters, forward with utf8::next when d >= 0 and
// backward with utf8::previous when d < 0, and returns the resulting
// iterator. The walk stops early as soon as it compares equal to end,
// so end is the bound in whichever direction is travelled
template<typename Iterator>
Iterator advance(Iterator it, Iterator end, CharCount d)
{
    const bool backward = d < 0;
    // d is stepped toward zero on every iteration, its previous value
    // deciding whether another character has to be crossed
    while (it != end and (backward ? d++ : d--))
        it = backward ? utf8::previous(it) : utf8::next(it);
    return it;
}

// returns the character count between begin and end, that is the
// number of bytes in [begin, end) that are not continuation bytes
// (10xxxxxx)
template<typename Iterator>
CharCount distance(Iterator begin, Iterator end)
{
    CharCount count = 0;
    for (; begin != end; ++begin)
    {
        const bool continuation = (*begin & 0xC0) == 0x80;
        if (not continuation)
            ++count;
    }
    return count;
}

// returns true if c can be the first byte of a (either single or
// multibyte) character, i.e. if it is not a 10xxxxxx continuation byte
inline bool is_character_start(char c)
{
    const int top_bits = c & 0xC0;
    return not (top_bits == 0x80);
}

// returns an iterator to the first byte of the character it points
// into: it itself when it is already on a character start, otherwise
// the closest preceding position for which is_character_start holds
template<typename Iterator>
Iterator character_start(Iterator it)
{
    for (; not is_character_start(*it); --it)
        ;
    return it;
}

// policies telling utf8::codepoint and utf8::codepoint_size what to do
// with a first byte that matches none of the valid lead byte patterns.
// A policy is a default constructible functor taking the offending byte
// and returning the Codepoint to use in its place
namespace InvalidBytePolicy
{

// treats an invalid byte as a programming error: triggers kak_assert,
// then yields the byte value should execution continue
struct Assert
{
    Codepoint operator()(char byte) const
    {
        kak_assert(false);
        // every byte reaching a policy is >= 0x80, convert through
        // unsigned char so that it is not sign extended where plain
        // char is signed
        return static_cast<unsigned char>(byte);
    }
};

// silently accepts an invalid byte and yields its value (0x80 to 0xFF
// for the bytes utf8::codepoint rejects)
struct Pass
{
    Codepoint operator()(char byte) const
    {
        // convert through unsigned char so that the byte is not sign
        // extended where plain char is signed
        return static_cast<unsigned char>(byte);
    }
};

}

// returns the codepoint of the character whose first byte is pointed
// by it. The length of the sequence is deduced from that first byte,
// the following bytes are read without being validated. A first byte
// matching none of the 1 to 4 bytes patterns is handed to InvalidPolicy
// and whatever the policy returns is used as the result
template<typename InvalidPolicy = InvalidBytePolicy::Assert,
         typename Iterator>
Codepoint codepoint(Iterator it)
{
    // According to rfc3629, UTF-8 allows only up to 4 bytes.
    // (21 bits codepoint)
    char lead = *it++;

    if ((lead & 0x80) == 0) // 0xxxxxxx
        return lead;

    if ((lead & 0xE0) == 0xC0) // 110xxxxx
        return ((lead & 0x1F) << 6) | (*it & 0x3F);

    if ((lead & 0xF0) == 0xE0) // 1110xxxx
    {
        Codepoint result = ((lead & 0x0F) << 12) | ((*it++ & 0x3F) << 6);
        result |= (*it & 0x3F);
        return result;
    }

    if ((lead & 0xF8) == 0xF0) // 11110xxx
    {
        Codepoint result = ((lead & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
        result |= (*it++ & 0x3F) << 6;
        result |= (*it & 0x3F);
        return result;
    }

    // continuation byte or 11111xxx: let the policy decide
    return InvalidPolicy{}(lead);
}

// returns the size in bytes (1 to 4) of the character whose first byte
// is pointed by it, looking at that first byte only. When the byte
// matches no valid lead byte pattern, InvalidPolicy is invoked on it
// (its result is ignored) and -1 is returned
template<typename InvalidPolicy = InvalidBytePolicy::Assert,
         typename Iterator>
ByteCount codepoint_size(Iterator it)
{
    char lead = *it;

    if ((lead & 0x80) == 0)    // 0xxxxxxx
        return 1;
    if ((lead & 0xE0) == 0xC0) // 110xxxxx
        return 2;
    if ((lead & 0xF0) == 0xE0) // 1110xxxx
        return 3;
    if ((lead & 0xF8) == 0xF0) // 11110xxx
        return 4;

    InvalidPolicy{}(lead);
    return -1;
}

// exception type thrown by utf8::dump when asked to encode a codepoint
// greater than 0x10FFFF
struct invalid_codepoint{};

// writes the UTF-8 encoding of cp (1 to 4 bytes) through the output
// iterator it, which is advanced after each byte written.
// throws invalid_codepoint, without writing anything, if cp is greater
// than 0x10FFFF
template<typename OutputIterator>
void dump(OutputIterator&& it, Codepoint cp)
{
    if (cp > 0x10FFFF)
        throw invalid_codepoint{};

    // builds a 10xxxxxx continuation byte out of the low 6 bits of bits
    const auto continuation = [](Codepoint bits) { return 0x80 | (bits & 0x3F); };

    if (cp <= 0x7F)         // 0xxxxxxx
        *it++ = cp;
    else if (cp <= 0x7FF)   // 110xxxxx 10xxxxxx
    {
        *it++ = 0xC0 | (cp >> 6);
        *it++ = continuation(cp);
    }
    else if (cp <= 0xFFFF)  // 1110xxxx 10xxxxxx 10xxxxxx
    {
        *it++ = 0xE0 | (cp >> 12);
        *it++ = continuation(cp >> 6);
        *it++ = continuation(cp);
    }
    else                    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
        *it++ = 0xF0 | (cp >> 18);
        *it++ = continuation(cp >> 12);
        *it++ = continuation(cp >> 6);
        *it++ = continuation(cp);
    }
}

}

}

#endif // utf8_hh_INCLUDED
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`#ifndef utf8_hh_INCLUDED`
			`#define utf8_hh_INCLUDED`

sort includes directives 2013-04-09 20:05:40 +02:00			`#include "assert.hh"`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`#include "unicode.hh"`
utf8: use CharCount instead of size_t 2012-10-27 13:26:40 +02:00			`#include "units.hh"`
sort includes directives 2013-04-09 20:05:40 +02:00
			`#include <cstddef>`
utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 14:29:37 +02:00
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`namespace Kakoune`
			`{`

			`namespace utf8`
			`{`

			`// returns an iterator to next character first byte`
			`template<typename Iterator>`
			`Iterator next(Iterator it)`
			`{`
			`if (*it++ & 0x80)`
			`while ((*(it) & 0xC0) == 0x80)`
			`++it;`
			`return it;`
			`}`

			`// returns it's parameter if it points to a character first byte,`
			`// or else returns next character first byte`
			`template<typename Iterator>`
			`Iterator finish(Iterator it)`
			`{`
			`while ((*(it) & 0xC0) == 0x80)`
			`++it;`
			`return it;`
			`}`

			`// returns an iterator to the previous character first byte`
			`template<typename Iterator>`
			`Iterator previous(Iterator it)`
			`{`
			`while ((*(--it) & 0xC0) == 0x80)`
			`;`
			`return it;`
			`}`

			`// returns an iterator pointing to the first byte of the`
			`// dth character after (or before if d < 0) the character`
			`// pointed by it`
utf8: use CharCount instead of size_t 2012-10-27 13:26:40 +02:00			`template<typename Iterator>`
			`Iterator advance(Iterator it, Iterator end, CharCount d)`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`{`
			`if (d < 0)`
			`{`
use ByteCount instead of CharCount when we are really counting bytes (that is most of the time when we are not concerned with displaying) 2012-10-11 00:41:48 +02:00			`while (it != end and d++)`
			`it = utf8::previous(it);`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`
			`else`
			`{`
use ByteCount instead of CharCount when we are really counting bytes (that is most of the time when we are not concerned with displaying) 2012-10-11 00:41:48 +02:00			`while (it != end and d--)`
			`it = utf8::next(it);`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`
			`return it;`
			`}`

			`// returns the character count between begin and end`
			`template<typename Iterator>`
utf8: use CharCount instead of size_t 2012-10-27 13:26:40 +02:00			`CharCount distance(Iterator begin, Iterator end)`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`{`
utf8: use CharCount instead of size_t 2012-10-27 13:26:40 +02:00			`CharCount dist = 0;`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`while (begin != end)`
			`{`
			`if ((*begin++ & 0xC0) != 0x80)`
			`++dist;`
			`}`
Return something in utf8::distance, thanks again gcc for letting this work 2012-10-11 00:39:17 +02:00			`return dist;`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`

			`// return true if it points to the first byte of a (either single or`
			`// multibyte) character`
utf8::is_character_start takes directly the char value 2014-05-14 20:21:19 +02:00			`inline bool is_character_start(char c)`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`{`
utf8::is_character_start takes directly the char value 2014-05-14 20:21:19 +02:00			`return (c & 0xC0) != 0x80;`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`

Add utf8::character_start function 2013-02-26 14:05:51 +01:00			`// returns an iterator to the first byte of the character it is into`
			`template<typename Iterator>`
			`Iterator character_start(Iterator it)`
			`{`
utf8::is_character_start takes directly the char value 2014-05-14 20:21:19 +02:00			`while (not is_character_start(*it))`
Add utf8::character_start function 2013-02-26 14:05:51 +01:00			`--it;`
			`return it;`
			`}`

utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`namespace InvalidBytePolicy`
			`{`

utf8: replace InvalidBytePolicy::Throw with InvalidBytePolicy::Assert 2012-10-17 17:01:51 +02:00			`struct Assert`
utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`{`
rename assert to kak_assert to avoid collisions 2013-04-09 20:04:11 +02:00			`Codepoint operator()(char byte) const { kak_assert(false); return byte; }`
utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`};`

			`struct Pass`
			`{`
			`Codepoint operator()(char byte) const { return byte; }`
			`};`

			`}`

add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`// returns the codepoint of the character whose first byte`
			`// is pointed by it`
utf8: replace InvalidBytePolicy::Throw with InvalidBytePolicy::Assert 2012-10-17 17:01:51 +02:00			`template<typename InvalidPolicy = InvalidBytePolicy::Assert,`
utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`typename Iterator>`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`Codepoint codepoint(Iterator it)`
			`{`
			`// According to rfc3629, UTF-8 allows only up to 4 bytes.`
			`// (21 bits codepoint)`
			`Codepoint cp;`
			`char byte = *it++;`
			`if (not (byte & 0x80)) // 0xxxxxxx`
			`cp = byte;`
			`else if ((byte & 0xE0) == 0xC0) // 110xxxxx`
			`{`
			`cp = ((byte & 0x1F) << 6) \| (*it & 0x3F);`
			`}`
			`else if ((byte & 0xF0) == 0xE0) // 1110xxxx`
			`{`
			`cp = ((byte & 0x0F) << 12) \| ((*it++ & 0x3F) << 6);`
			`cp \|= (*it & 0x3F);`
			`}`
			`else if ((byte & 0xF8) == 0xF0) // 11110xxx`
			`{`
			`cp = ((byte & 0x0F) << 18) \| ((*it++ & 0x3F) << 12);`
			`cp \|= (*it++ & 0x3F) << 6;`
			`cp \|= (*it & 0x3F);`
			`}`
			`else`
utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`cp = InvalidPolicy{}(byte);`
Actually return something in utf8::codepoint, thanks gcc for using rax 2012-10-10 19:14:18 +02:00			`return cp;`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`

Add utf8::codepoint_size function 2013-05-30 18:49:50 +02:00			`template<typename InvalidPolicy = InvalidBytePolicy::Assert,`
			`typename Iterator>`
			`ByteCount codepoint_size(Iterator it)`
			`{`
			`char byte = *it;`
			`if (not (byte & 0x80)) // 0xxxxxxx`
			`return 1;`
			`else if ((byte & 0xE0) == 0xC0) // 110xxxxx`
			`return 2;`
			`else if ((byte & 0xF0) == 0xE0) // 1110xxxx`
			`return 3;`
			`else if ((byte & 0xF8) == 0xF0) // 11110xxx`
			`return 4;`
			`else`
			`{`
			`InvalidPolicy{}(byte);`
			`return -1;`
			`}`
			`}`

utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 14:29:37 +02:00			`struct invalid_codepoint{};`

			`template<typename OutputIterator>`
utf8::dump uses a copy of the output iterator instead of a reference 2013-02-27 21:36:28 +01:00			`void dump(OutputIterator&& it, Codepoint cp)`
utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 14:29:37 +02:00			`{`
			`if (cp <= 0x7F)`
			`*it++ = cp;`
			`else if (cp <= 0x7FF)`
			`{`
			`*it++ = 0xC0 \| (cp >> 6);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else if (cp <= 0xFFFF)`
			`{`
			`*it++ = 0xE0 \| (cp >> 12);`
			`*it++ = 0x80 \| ((cp >> 6) & 0x3F);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else if (cp <= 0x10FFFF)`
			`{`
			`*it++ = 0xF0 \| (cp >> 18);`
			`*it++ = 0x80 \| ((cp >> 12) & 0x3F);`
			`*it++ = 0x80 \| ((cp >> 6) & 0x3F);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else`
			`throw invalid_codepoint{};`
			`}`

add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`

			`}`

			`#endif // utf8_hh_INCLUDED`