Use unsigned char rather than char in utf8 decoding to avoid sign extension
This commit is contained in:
parent
70db72b175
commit
3f70d91f8c
|
@@ -95,12 +95,12 @@ namespace InvalidBytePolicy
|
||||||
|
|
||||||
struct Assert
|
struct Assert
|
||||||
{
|
{
|
||||||
Codepoint operator()(char byte) const { kak_assert(false); return byte; }
|
Codepoint operator()(unsigned char byte) const { kak_assert(false); return byte; }
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Pass
|
struct Pass
|
||||||
{
|
{
|
||||||
Codepoint operator()(char byte) const { return byte; }
|
Codepoint operator()(unsigned char byte) const { return byte; }
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -114,7 +114,7 @@ Codepoint codepoint(Iterator it)
|
||||||
// According to rfc3629, UTF-8 allows only up to 4 bytes.
|
// According to rfc3629, UTF-8 allows only up to 4 bytes.
|
||||||
// (21 bits codepoint)
|
// (21 bits codepoint)
|
||||||
Codepoint cp;
|
Codepoint cp;
|
||||||
char byte = *it++;
|
unsigned char byte = *it++;
|
||||||
if (not (byte & 0x80)) // 0xxxxxxx
|
if (not (byte & 0x80)) // 0xxxxxxx
|
||||||
cp = byte;
|
cp = byte;
|
||||||
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
|
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
|
||||||
|
@@ -141,7 +141,7 @@ template<typename InvalidPolicy = InvalidBytePolicy::Assert,
|
||||||
typename Iterator>
|
typename Iterator>
|
||||||
ByteCount codepoint_size(Iterator it)
|
ByteCount codepoint_size(Iterator it)
|
||||||
{
|
{
|
||||||
char byte = *it;
|
unsigned char byte = *it;
|
||||||
if (not (byte & 0x80)) // 0xxxxxxx
|
if (not (byte & 0x80)) // 0xxxxxxx
|
||||||
return 1;
|
return 1;
|
||||||
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
|
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
|
||||||
|
|
Loading…
Reference in New Issue
Block a user