// Provenance: commit 2db1d023294d985ba5624ad52b64946b9bbf7a2a,
// "add utf8 helpers in utf8.hh" (Maxime Coste, 2012-10-08), creating src/utf8.hh.
#ifndef utf8_hh_INCLUDED
#define utf8_hh_INCLUDED

#include <cstddef>
#include <cstdint>

namespace Kakoune
{

// Helpers for walking and decoding UTF-8 encoded byte sequences.
//
// All functions are templated on an iterator over bytes. None of them
// perform bounds checking: callers must guarantee that every position
// dereferenced while scanning is valid (e.g. the data is terminated by a
// byte that is not a continuation byte, such as '\0').
namespace utf8
{

using Codepoint = std::uint32_t;

// Returns an iterator to the first byte of the next character.
//
// If `it` points to an ASCII byte, the result is simply `it + 1`.
// Otherwise the first byte is skipped and then every following
// continuation byte (10xxxxxx) is skipped as well.
template<typename Iterator>
Iterator next(Iterator it)
{
    if (*it++ & 0x80)
    {
        while ((*it & 0xC0) == 0x80)
            ++it;
    }
    return it;
}

// Returns its parameter if it points to a character first byte,
// or else returns an iterator to the next character first byte.
template<typename Iterator>
Iterator finish(Iterator it)
{
    while ((*it & 0xC0) == 0x80)
        ++it;
    return it;
}

// Returns an iterator to the first byte of the previous character.
//
// Steps backward at least once, then keeps stepping while the byte
// pointed to is a continuation byte (10xxxxxx).
template<typename Iterator>
Iterator previous(Iterator it)
{
    while ((*(--it) & 0xC0) == 0x80)
        ;
    return it;
}

// Returns an iterator pointing to the first byte of the
// dth character after (or before if d < 0) the character
// pointed to by it.
//
// The calls to next/previous are qualified so that argument dependent
// lookup cannot pick up (or become ambiguous with) std::next when
// Iterator is a standard library iterator type.
template<typename Iterator, typename Distance>
Iterator advance(Iterator it, Distance d)
{
    if (d < 0)
    {
        while (d++)
            it = utf8::previous(it);
    }
    else
    {
        while (d--)
            it = utf8::next(it);
    }
    return it;
}

// Returns the character count between begin and end.
//
// Every byte in [begin, end) that is not a continuation byte (10xxxxxx)
// counts as the start of one character.
template<typename Iterator>
std::size_t distance(Iterator begin, Iterator end)
{
    std::size_t dist = 0;
    while (begin != end)
    {
        if ((*begin++ & 0xC0) != 0x80)
            ++dist;
    }
    return dist; // the original fell off the end without returning
}

// Returns true if it points to the first byte of a (either single or
// multibyte) character, i.e. the byte is not a continuation byte.
template<typename Iterator>
bool is_character_start(Iterator it)
{
    return (*it & 0xC0) != 0x80;
}

// Thrown by codepoint() when the first byte is not a valid UTF-8 lead byte.
struct invalid_utf8_sequence{};

// Returns the codepoint of the character whose first byte
// is pointed to by it.
//
// Throws invalid_utf8_sequence if the first byte matches none of the
// 1 to 4 byte lead patterns (e.g. it is a continuation byte, or 0xF8 and
// above). Continuation bytes are not validated: only their low 6 bits
// are read.
template<typename Iterator>
Codepoint codepoint(Iterator it)
{
    // According to rfc3629, UTF-8 allows only up to 4 bytes.
    // (21 bits codepoint)
    //
    // The lead byte is held as unsigned char so that the bit tests do not
    // depend on whether plain char is signed on the target platform.
    const unsigned char lead = static_cast<unsigned char>(*it++);
    Codepoint cp = 0;
    if (!(lead & 0x80)) // 0xxxxxxx
    {
        cp = lead;
    }
    else if ((lead & 0xE0) == 0xC0) // 110xxxxx
    {
        cp = static_cast<Codepoint>(lead & 0x1F) << 6;
        cp |= static_cast<Codepoint>(*it & 0x3F);
    }
    else if ((lead & 0xF0) == 0xE0) // 1110xxxx
    {
        cp = static_cast<Codepoint>(lead & 0x0F) << 12;
        cp |= static_cast<Codepoint>(*it++ & 0x3F) << 6;
        cp |= static_cast<Codepoint>(*it & 0x3F);
    }
    else if ((lead & 0xF8) == 0xF0) // 11110xxx
    {
        // Bit 3 is known to be clear here, so only 3 payload bits remain.
        cp = static_cast<Codepoint>(lead & 0x07) << 18;
        cp |= static_cast<Codepoint>(*it++ & 0x3F) << 12;
        cp |= static_cast<Codepoint>(*it++ & 0x3F) << 6;
        cp |= static_cast<Codepoint>(*it & 0x3F);
    }
    else
        throw invalid_utf8_sequence{};
    return cp; // the original fell off the end without returning
}

}

}

#endif // utf8_hh_INCLUDED