home/src/utf8.hh

#ifndef utf8_hh_INCLUDED
#define utf8_hh_INCLUDED

#include <cstddef>
#include "unicode.hh"

namespace Kakoune
{

namespace utf8
{

// returns an iterator to the first byte of the character following
// the one it points into: it is moved one byte forward, and if the byte
// it left had its high bit set, any continuation bytes (10xxxxxx) that
// follow are skipped as well. No end bound is checked.
template<typename Iterator>
Iterator next(Iterator it)
{
    const bool multibyte = (*it & 0x80) != 0;
    ++it;
    if (not multibyte)
        return it;

    while ((*it & 0xC0) == 0x80)
        ++it;
    return it;
}

// returns its parameter unchanged when it already points to a byte that
// is not a continuation byte (10xxxxxx), otherwise moves forward to the
// first such byte. No end bound is checked.
template<typename Iterator>
Iterator finish(Iterator it)
{
    for (; (*it & 0xC0) == 0x80; ++it)
    {
    }
    return it;
}

// returns an iterator to the first byte of the character preceding it:
// steps back at least one byte, then keeps stepping back while the byte
// reached is a continuation byte (10xxxxxx). No begin bound is checked.
template<typename Iterator>
Iterator previous(Iterator it)
{
    do
    {
        --it;
    }
    while ((*it & 0xC0) == 0x80);
    return it;
}

// moves it by d characters and returns the resulting iterator:
// forward with utf8::next when d >= 0, backward with utf8::previous
// when d < 0. The walk stops early as soon as it compares equal to
// end, so end is the limit in whichever direction is being walked.
template<typename Iterator, typename Distance>
Iterator advance(Iterator it, Iterator end, Distance d)
{
    if (d < 0)
    {
        // backward: d counts up towards zero
        for (; it != end and d++; it = utf8::previous(it))
        {
        }
        return it;
    }

    // forward: d counts down towards zero
    for (; it != end and d--; it = utf8::next(it))
    {
    }
    return it;
}

// returns the number of characters in [begin, end), counted as the
// number of bytes that are not continuation bytes (10xxxxxx)
template<typename Iterator>
size_t distance(Iterator begin, Iterator end)
{
    size_t count = 0;
    for (; begin != end; ++begin)
    {
        const bool continuation = (*begin & 0xC0) == 0x80;
        if (not continuation)
            ++count;
    }
    return count;
}

// tells whether it points to the first byte of a character (single or
// multibyte), i.e. to a byte that is not a continuation byte (10xxxxxx)
template<typename Iterator>
bool is_character_start(Iterator it)
{
    const bool continuation = (*it & 0xC0) == 0x80;
    return not continuation;
}

// exception type thrown by InvalidBytePolicy::Throw when it is handed
// a byte to handle
struct invalid_utf8_sequence{};

// policies usable as the InvalidPolicy parameter of utf8::codepoint,
// which calls them with a first byte that matches none of the 1 to 4
// byte lead patterns; the value they return becomes the decoded
// codepoint
namespace InvalidBytePolicy
{

// rejects the byte by throwing invalid_utf8_sequence
struct Throw
{
    Codepoint operator()(char byte) const { throw invalid_utf8_sequence{}; }
};

// passes the byte through, using its value in 0x00..0xFF as the codepoint
struct Pass
{
    Codepoint operator()(char byte) const
    {
        // convert through unsigned char: the bytes that reach this
        // policy have their high bit set, and where plain char is
        // signed a direct conversion would sign-extend them
        return static_cast<unsigned char>(byte);
    }
};

}

// returns the codepoint of the character whose first byte is pointed
// to by it. The continuation bytes are read without being validated
// or bounds checked. A first byte matching none of the 1 to 4 byte
// lead patterns is handed to InvalidPolicy, whose result (or thrown
// exception) becomes the outcome of the call.
template<typename InvalidPolicy = InvalidBytePolicy::Throw,
         typename Iterator>
Codepoint codepoint(Iterator it)
{
    // rfc3629 limits UTF-8 to 4 bytes per character (21 bits codepoint)
    const char lead = *it++;

    if ((lead & 0x80) == 0) // 0xxxxxxx
        return lead;

    if ((lead & 0xE0) == 0xC0) // 110xxxxx 10xxxxxx
        return ((lead & 0x1F) << 6) | (*it & 0x3F);

    if ((lead & 0xF0) == 0xE0) // 1110xxxx 10xxxxxx 10xxxxxx
    {
        Codepoint result = ((lead & 0x0F) << 12) | ((*it++ & 0x3F) << 6);
        result |= (*it & 0x3F);
        return result;
    }

    if ((lead & 0xF8) == 0xF0) // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
        // the 0x0F mask keeps only the 3 payload bits: the test above
        // guarantees that bit 3 of the lead byte is clear
        Codepoint result = ((lead & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
        result |= (*it++ & 0x3F) << 6;
        result |= (*it & 0x3F);
        return result;
    }

    // continuation byte or 11111xxx: not a valid first byte
    return InvalidPolicy{}(lead);
}

// exception type thrown by dump when given a codepoint greater than
// 0x10FFFF
struct invalid_codepoint{};

// writes the UTF-8 encoding of cp (1 to 4 bytes) through it, which is
// taken by reference and left advanced past the bytes written.
// throws invalid_codepoint, without writing anything, if cp is greater
// than 0x10FFFF.
template<typename OutputIterator>
void dump(OutputIterator& it, Codepoint cp)
{
    if (cp <= 0x7F) // 0xxxxxxx
    {
        *it++ = cp;
        return;
    }

    if (cp <= 0x7FF) // 110xxxxx 10xxxxxx
    {
        *it++ = 0xC0 | (cp >> 6);
        *it++ = 0x80 | (cp & 0x3F);
        return;
    }

    if (cp <= 0xFFFF) // 1110xxxx 10xxxxxx 10xxxxxx
    {
        *it++ = 0xE0 | (cp >> 12);
        *it++ = 0x80 | ((cp >> 6) & 0x3F);
        *it++ = 0x80 | (cp & 0x3F);
        return;
    }

    if (not (cp <= 0x10FFFF))
        throw invalid_codepoint{};

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    *it++ = 0xF0 | (cp >> 18);
    *it++ = 0x80 | ((cp >> 12) & 0x3F);
    *it++ = 0x80 | ((cp >> 6)  & 0x3F);
    *it++ = 0x80 | (cp & 0x3F);
}

}

}

#endif // utf8_hh_INCLUDED
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`#ifndef utf8_hh_INCLUDED`
			`#define utf8_hh_INCLUDED`

utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 14:29:37 +02:00			`#include <cstddef>`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 19:15:05 +02:00			`#include "unicode.hh"`
utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 14:29:37 +02:00
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`namespace Kakoune`
			`{`

			`namespace utf8`
			`{`

			`// returns an iterator to next character first byte`
			`template<typename Iterator>`
			`Iterator next(Iterator it)`
			`{`
			`if (*it++ & 0x80)`
			`while ((*(it) & 0xC0) == 0x80)`
			`++it;`
			`return it;`
			`}`

			`// returns it's parameter if it points to a character first byte,`
			`// or else returns next character first byte`
			`template<typename Iterator>`
			`Iterator finish(Iterator it)`
			`{`
			`while ((*(it) & 0xC0) == 0x80)`
			`++it;`
			`return it;`
			`}`

			`// returns an iterator to the previous character first byte`
			`template<typename Iterator>`
			`Iterator previous(Iterator it)`
			`{`
			`while ((*(--it) & 0xC0) == 0x80)`
			`;`
			`return it;`
			`}`

			`// returns an iterator pointing to the first byte of the`
			`// dth character after (or before if d < 0) the character`
			`// pointed by it`
			`template<typename Iterator, typename Distance>`
use ByteCount instead of CharCount when we are really counting bytes (that is most of the time when we are not concerned with displaying) 2012-10-11 00:41:48 +02:00			`Iterator advance(Iterator it, Iterator end, Distance d)`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`{`
			`if (d < 0)`
			`{`
use ByteCount instead of CharCount when we are really counting bytes (that is most of the time when we are not concerned with displaying) 2012-10-11 00:41:48 +02:00			`while (it != end and d++)`
			`it = utf8::previous(it);`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`
			`else`
			`{`
use ByteCount instead of CharCount when we are really counting bytes (that is most of the time when we are not concerned with displaying) 2012-10-11 00:41:48 +02:00			`while (it != end and d--)`
			`it = utf8::next(it);`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`
			`return it;`
			`}`

			`// returns the character count between begin and end`
			`template<typename Iterator>`
			`size_t distance(Iterator begin, Iterator end)`
			`{`
			`size_t dist = 0;`
			`while (begin != end)`
			`{`
			`if ((*begin++ & 0xC0) != 0x80)`
			`++dist;`
			`}`
Return something in utf8::distance, thanks again gcc for letting this work 2012-10-11 00:39:17 +02:00			`return dist;`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`

			`// return true if it points to the first byte of a (either single or`
			`// multibyte) character`
			`template<typename Iterator>`
			`bool is_character_start(Iterator it)`
			`{`
			`return (*it & 0xC0) != 0x80;`
			`}`

			`struct invalid_utf8_sequence{};`

utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`namespace InvalidBytePolicy`
			`{`

			`struct Throw`
			`{`
			`Codepoint operator()(char byte) const { throw invalid_utf8_sequence{}; }`
			`};`

			`struct Pass`
			`{`
			`Codepoint operator()(char byte) const { return byte; }`
			`};`

			`}`

add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`// returns the codepoint of the character whose first byte`
			`// is pointed by it`
utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`template<typename InvalidPolicy = InvalidBytePolicy::Throw,`
			`typename Iterator>`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`Codepoint codepoint(Iterator it)`
			`{`
			`// According to rfc3629, UTF-8 allows only up to 4 bytes.`
			`// (21 bits codepoint)`
			`Codepoint cp;`
			`char byte = *it++;`
			`if (not (byte & 0x80)) // 0xxxxxxx`
			`cp = byte;`
			`else if ((byte & 0xE0) == 0xC0) // 110xxxxx`
			`{`
			`cp = ((byte & 0x1F) << 6) \| (*it & 0x3F);`
			`}`
			`else if ((byte & 0xF0) == 0xE0) // 1110xxxx`
			`{`
			`cp = ((byte & 0x0F) << 12) \| ((*it++ & 0x3F) << 6);`
			`cp \|= (*it & 0x3F);`
			`}`
			`else if ((byte & 0xF8) == 0xF0) // 11110xxx`
			`{`
			`cp = ((byte & 0x0F) << 18) \| ((*it++ & 0x3F) << 12);`
			`cp \|= (*it++ & 0x3F) << 6;`
			`cp \|= (*it & 0x3F);`
			`}`
			`else`
utf8::codepoint: configurable invalid byte policy 2012-10-13 18:31:29 +02:00			`cp = InvalidPolicy{}(byte);`
Actually return something in utf8::codepoint, thanks gcc for using rax 2012-10-10 19:14:18 +02:00			`return cp;`
add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`

utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 14:29:37 +02:00			`struct invalid_codepoint{};`

			`template<typename OutputIterator>`
			`void dump(OutputIterator& it, Codepoint cp)`
			`{`
			`if (cp <= 0x7F)`
			`*it++ = cp;`
			`else if (cp <= 0x7FF)`
			`{`
			`*it++ = 0xC0 \| (cp >> 6);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else if (cp <= 0xFFFF)`
			`{`
			`*it++ = 0xE0 \| (cp >> 12);`
			`*it++ = 0x80 \| ((cp >> 6) & 0x3F);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else if (cp <= 0x10FFFF)`
			`{`
			`*it++ = 0xF0 \| (cp >> 18);`
			`*it++ = 0x80 \| ((cp >> 12) & 0x3F);`
			`*it++ = 0x80 \| ((cp >> 6) & 0x3F);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else`
			`throw invalid_codepoint{};`
			`}`

add utf8 helpers in utf8.hh 2012-10-08 14:25:05 +02:00			`}`

			`}`

			`#endif // utf8_hh_INCLUDED`