Use unsigned char rather than char in utf8 decoding to avoid sign extension

This commit is contained in:
Maxime Coste 2014-07-01 23:47:09 +01:00
parent 70db72b175
commit 3f70d91f8c

View File

@@ -95,12 +95,12 @@ namespace InvalidBytePolicy
struct Assert struct Assert
{ {
Codepoint operator()(char byte) const { kak_assert(false); return byte; } Codepoint operator()(unsigned char byte) const { kak_assert(false); return byte; }
}; };
struct Pass struct Pass
{ {
Codepoint operator()(char byte) const { return byte; } Codepoint operator()(unsigned char byte) const { return byte; }
}; };
} }
@@ -114,7 +114,7 @@ Codepoint codepoint(Iterator it)
// According to rfc3629, UTF-8 allows only up to 4 bytes. // According to rfc3629, UTF-8 allows only up to 4 bytes.
// (21 bits codepoint) // (21 bits codepoint)
Codepoint cp; Codepoint cp;
char byte = *it++; unsigned char byte = *it++;
if (not (byte & 0x80)) // 0xxxxxxx if (not (byte & 0x80)) // 0xxxxxxx
cp = byte; cp = byte;
else if ((byte & 0xE0) == 0xC0) // 110xxxxx else if ((byte & 0xE0) == 0xC0) // 110xxxxx
@@ -141,7 +141,7 @@ template<typename InvalidPolicy = InvalidBytePolicy::Assert,
typename Iterator> typename Iterator>
ByteCount codepoint_size(Iterator it) ByteCount codepoint_size(Iterator it)
{ {
char byte = *it; unsigned char byte = *it;
if (not (byte & 0x80)) // 0xxxxxxx if (not (byte & 0x80)) // 0xxxxxxx
return 1; return 1;
else if ((byte & 0xE0) == 0xC0) // 110xxxxx else if ((byte & 0xE0) == 0xC0) // 110xxxxx