home/src/utf8.hh

228 lines
5.1 KiB
C++
Raw Normal View History

2012-10-08 14:25:05 +02:00
#ifndef utf8_hh_INCLUDED
#define utf8_hh_INCLUDED
2013-04-09 20:05:40 +02:00
#include "assert.hh"
#include "unicode.hh"
2012-10-27 13:26:40 +02:00
#include "units.hh"
2013-04-09 20:05:40 +02:00
#include <cstddef>
2012-10-08 14:25:05 +02:00
namespace Kakoune
{
namespace utf8
{
2015-09-25 14:19:21 +02:00
// reads the byte pointed by it, moves it one step forward and
// returns the byte that was read
template<typename Iterator>
[[gnu::always_inline]]
inline char read(Iterator& it)
{
    const char value = *it;
    ++it;
    return value;
}
2012-10-08 14:25:05 +02:00
// returns an iterator to the first byte of the character following
// the one pointed by it, never going past end
template<typename Iterator>
Iterator next(Iterator it, const Iterator& end)
{
    if (it == end)
        return it;

    // consume the current byte; only a byte with the high bit set
    // can be followed by continuation bytes (10xxxxxx) to skip
    const char first = read(it);
    if (first & 0x80)
    {
        while (it != end and (*it & 0xC0) == 0x80)
            ++it;
    }
    return it;
}
// returns its parameter if it points to a character first byte (or
// is equal to end), otherwise returns the first byte of the next
// character
template<typename Iterator>
Iterator finish(Iterator it, const Iterator& end)
{
    for (; it != end; ++it)
    {
        // stop on the first byte that is not a continuation byte (10xxxxxx)
        if ((*it & 0xC0) != 0x80)
            break;
    }
    return it;
}
// returns an iterator to the first byte of the character preceding
// the position pointed by it, never going before begin
template<typename Iterator>
Iterator previous(Iterator it, const Iterator& begin)
{
    while (it != begin)
    {
        --it;
        // anything but a continuation byte (10xxxxxx) starts a character
        if ((*it & 0xC0) != 0x80)
            break;
    }
    return it;
}
// returns an iterator pointing to the first byte of the
// dth character after (or before if d < 0) the character
// pointed by it
2012-10-27 13:26:40 +02:00
template<typename Iterator>
Iterator advance(Iterator it, const Iterator& end, CharCount d)
2012-10-08 14:25:05 +02:00
{
if (d < 0)
{
while (it != end and d++)
it = utf8::previous(it, end);
2012-10-08 14:25:05 +02:00
}
else
{
while (it != end and d--)
it = utf8::next(it, end);
2012-10-08 14:25:05 +02:00
}
return it;
}
2015-09-23 23:09:37 +02:00
// returns true if c is the first byte of a character (single or
// multibyte), i.e. anything but a continuation byte (10xxxxxx)
[[gnu::always_inline]]
inline bool is_character_start(char c)
{
    const int top_bits = c & 0xC0;
    return top_bits != 0x80;
}
2012-10-08 14:25:05 +02:00
// returns the number of characters between begin and end, computed
// as the number of bytes in that range that start a character
template<typename Iterator>
CharCount distance(Iterator begin, const Iterator& end)
{
    CharCount count = 0;
    while (begin != end)
    {
        // read() also moves begin to the next byte
        const char byte = read(begin);
        if (is_character_start(byte))
            ++count;
    }
    return count;
}
2013-02-26 14:05:51 +01:00
// returns an iterator to the first byte of the character it is
// pointing into; the search goes backward and never goes before begin
// (begin is returned as is if it is reached)
template<typename Iterator>
Iterator character_start(Iterator it, const Iterator& begin)
{
    while (it != begin)
    {
        if (is_character_start(*it))
            break;
        --it;
    }
    return it;
}
// policies selecting what happens when invalid utf8 is encountered;
// a policy is called with the offending value and its result is what
// the decoding function returns
namespace InvalidPolicy
{

// triggers kak_assert(false), then hands the value back unchanged
struct Assert
{
    Codepoint operator()(Codepoint cp) const
    {
        kak_assert(false);
        return cp;
    }
};

// silently hands the value back unchanged
struct Pass
{
    Codepoint operator()(Codepoint cp) const
    {
        return cp;
    }
};

}
2012-10-08 14:25:05 +02:00
// decodes the character whose first byte is pointed by it, and
// leaves it just past the bytes that were consumed.
// When the input is exhausted before the character is complete, or
// when the first byte is not a valid leading byte, the result is
// whatever InvalidPolicy returns for the value decoded so far
// (-1 when it == end on entry).
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
         typename Iterator>
Codepoint read_codepoint(Iterator& it, const Iterator& end)
{
    if (it == end)
        return InvalidPolicy{}(-1);

    // According to rfc3629, UTF-8 allows only up to 4 bytes.
    // (21 bits codepoint)
    const unsigned char lead = read(it);

    // 0xxxxxxx: single byte character
    if ((lead & 0x80) == 0)
        return lead;

    // every remaining form needs at least one more byte
    if (it == end)
        return InvalidPolicy{}(lead);

    // 110xxxxx: two bytes
    if ((lead & 0xE0) == 0xC0)
    {
        const Codepoint high = (lead & 0x1F) << 6;
        return high | (read(it) & 0x3F);
    }

    // 1110xxxx: three bytes
    if ((lead & 0xF0) == 0xE0)
    {
        Codepoint cp = (lead & 0x0F) << 12;
        cp |= (read(it) & 0x3F) << 6;
        if (it == end)
            return InvalidPolicy{}(cp);
        cp |= read(it) & 0x3F;
        return cp;
    }

    // 11110xxx: four bytes
    if ((lead & 0xF8) == 0xF0)
    {
        Codepoint cp = (lead & 0x0F) << 18;
        cp |= (read(it) & 0x3F) << 12;
        if (it == end)
            return InvalidPolicy{}(cp);
        cp |= (read(it) & 0x3F) << 6;
        if (it == end)
            return InvalidPolicy{}(cp);
        cp |= read(it) & 0x3F;
        return cp;
    }

    // not a valid leading byte
    return InvalidPolicy{}(lead);
}
// returns the codepoint of the character whose first byte is pointed
// by it. The iterator is taken by value, so the caller's iterator is
// left untouched (use read_codepoint to also advance it).
// InvalidPolicy is forwarded to read_codepoint so that the policy
// requested by the caller is the one applied on invalid input,
// instead of always falling back to the default one.
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
         typename Iterator>
Codepoint codepoint(Iterator it, const Iterator& end)
{
    return read_codepoint<InvalidPolicy>(it, end);
}
// returns the number of bytes of the utf8 sequence introduced by the
// leading byte given as parameter.
// If byte is not a valid leading byte, InvalidPolicy is invoked with
// it (its result is ignored) and 1 is returned.
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
ByteCount codepoint_size(char byte)
{
    if (not (byte & 0x80)) // 0xxxxxxx
        return 1;
    else if ((byte & 0xE0) == 0xC0) // 110xxxxx
        return 2;
    else if ((byte & 0xF0) == 0xE0) // 1110xxxx
        return 3;
    else if ((byte & 0xF8) == 0xF0) // 11110xxx
        return 4;
    else
    {
        // go through unsigned char so that the policy receives the
        // byte value (0x80..0xFF) as read_codepoint reports it, rather
        // than a sign extended value where char is signed
        InvalidPolicy{}(static_cast<unsigned char>(byte));
        return 1;
    }
}
// exception type thrown by codepoint_size(Codepoint) and dump when
// they are given a codepoint greater than 0x10FFFF
struct invalid_codepoint{};
2016-02-05 10:13:07 +01:00
// returns the number of bytes needed to encode cp in utf8,
// throws invalid_codepoint if cp is greater than 0x10FFFF
inline ByteCount codepoint_size(Codepoint cp)
{
    if (cp <= 0x7F)
        return 1;
    if (cp <= 0x7FF)
        return 2;
    if (cp <= 0xFFFF)
        return 3;
    if (cp <= 0x10FFFF)
        return 4;

    throw invalid_codepoint{};
}
// writes the utf8 encoding of cp through it, one byte per
// assignment, advancing it after each byte.
// throws invalid_codepoint, without writing anything, if cp is
// greater than 0x10FFFF
template<typename OutputIterator>
void dump(OutputIterator&& it, Codepoint cp)
{
    if (cp <= 0x7F) // 0xxxxxxx
    {
        *it++ = cp;
        return;
    }

    if (cp <= 0x7FF) // 110xxxxx 10xxxxxx
    {
        *it++ = 0xC0 | (cp >> 6);
        *it++ = 0x80 | (cp & 0x3F);
        return;
    }

    if (cp <= 0xFFFF) // 1110xxxx 10xxxxxx 10xxxxxx
    {
        *it++ = 0xE0 | (cp >> 12);
        *it++ = 0x80 | ((cp >> 6) & 0x3F);
        *it++ = 0x80 | (cp & 0x3F);
        return;
    }

    if (cp <= 0x10FFFF) // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
        *it++ = 0xF0 | (cp >> 18);
        *it++ = 0x80 | ((cp >> 12) & 0x3F);
        *it++ = 0x80 | ((cp >> 6) & 0x3F);
        *it++ = 0x80 | (cp & 0x3F);
        return;
    }

    throw invalid_codepoint{};
}
2012-10-08 14:25:05 +02:00
}
}
#endif // utf8_hh_INCLUDED