Back then, I was working on a hobby project for text processing, with particular reference to UTF-8 encoding and decoding. I wrote a lot of exploratory code (in a C++ library called "tea") that was left hanging when I moved on to other things. I've decided to revisit that library and publish what I discovered whilst working on it.
Firstly, let's define some useful types. Note that all the code below is pseudo-code; I'll publish the full library source code when it's in a suitable state:
#include <cstddef>       // size_t
#include <cstdint>       // fixed-width integer types
#include <limits>        // std::numeric_limits
#include <type_traits>   // std::is_signed

namespace tea {
  namespace types {
    // Unsigned integers
    typedef uint8_t u8;
    typedef uint16_t u16;
    typedef uint32_t u32;
    typedef uint64_t u64;
    // Signed integers
    typedef int8_t s8;
    typedef int16_t s16;
    typedef int32_t s32;
    typedef int64_t s64;
    // Signed characters
    typedef signed char c8;
    typedef int16_t c16;
    typedef int32_t c32;
    // IEEE floating-point
    typedef float f32;
    typedef double f64;
    // Traits
    template<typename T>
    struct Traits {
      inline static T minimum(void) {
        return std::numeric_limits<T>::lowest();
      }
      inline static T maximum(void) {
        return std::numeric_limits<T>::max();
      }
      inline static T extreme(void) {
        return std::is_signed<T>::value ?
          std::numeric_limits<T>::lowest() :
          std::numeric_limits<T>::max();
      }
    };
    // Bit-sizes
    template<size_t BITS>
    struct BitSize;
    template<>
    struct BitSize<8> {
      typedef s8 Signed;
      typedef u8 Unsigned;
    };
    template<>
    struct BitSize<16> {
      typedef s16 Signed;
      typedef u16 Unsigned;
    };
    template<>
    struct BitSize<32> {
      typedef s32 Signed;
      typedef u32 Unsigned;
    };
    template<>
    struct BitSize<64> {
      typedef s64 Signed;
      typedef u64 Unsigned;
    };
  }
  using namespace types;
}
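As a quick illustration (my own throwaway checks, not part of the library):

static_assert(sizeof(tea::BitSize<16>::Signed) == 2, "expected s16");
static_assert(sizeof(tea::BitSize<32>::Unsigned) == 4, "expected u32");
// Traits<T>::extreme() yields lowest() for signed types and max() for unsigned:
//   tea::Traits<tea::s8>::extreme() == -128
//   tea::Traits<tea::u8>::extreme() == 255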
Next, let's declare the UTF-8 routines. The functions in the "optimistic" namespace below assume that:
- UTF-8 sequences are not truncated.
- All codepoints are valid Unicode codepoints.
- When writing UTF-8, we never over-run the buffer.
namespace tea {
  namespace utf8 {
    const c32 codePointMinimum = 0x000000;
    const c32 codePointMaximum = 0x10FFFF;
    enum Status_UTF8 {
      Status_UTF8_OK = 0,
      Status_UTF8_Overlong = -1,
      Status_UTF8_Beyond = -2,
      Status_UTF8_Extraneous = -3,
      Status_UTF8_Continuation = -4,
      Status_UTF8_Truncated = -5,
      Status_UTF8_Invalid = -6,
      Status_UTF8_ThresholdStrict = Status_UTF8_OK,
      Status_UTF8_ThresholdNormal = Status_UTF8_Overlong,
      Status_UTF8_ThresholdLax = Status_UTF8_Invalid
    };
    // Indexed by lead byte: positive for the sequence length in bytes,
    // negative for an error (Continuation/Invalid)
    extern const s8 leadByteTable[256];
    // Indexed by the number of significant bits: positive for the encoded
    // size in bytes, zero for an error
    extern const u8 msbEncodedSize[33];
    // These routines assume the UTF-8 sequence is valid
    namespace optimistic {
      size_t measureCodePoint(c32 codepoint);
      void encodeCodePointMeasured(u8* ptr, c32 codepoint, size_t bytes);
      u8* encodeCodePoint(u8* ptr, c32 codepoint);
      c32 decodeCodePoint(const u8*& ptr, const u8* end);
      size_t countCodePoints(const u8* begin, const u8* end);
    }
  }
}
The "leadByteTable" array is the classic length-of-UTF-8-sequence-based-on-the-first-byte:
#define SIXTEEN(x) x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
const tea::s8 tea::utf8::leadByteTable[256] = {
  /* 0x00-0x0F */ SIXTEEN(1),
  /* 0x10-0x1F */ SIXTEEN(1),
  /* 0x20-0x2F */ SIXTEEN(1),
  /* 0x30-0x3F */ SIXTEEN(1),
  /* 0x40-0x4F */ SIXTEEN(1),
  /* 0x50-0x5F */ SIXTEEN(1),
  /* 0x60-0x6F */ SIXTEEN(1),
  /* 0x70-0x7F */ SIXTEEN(1),
  /* 0x80-0x8F */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
  /* 0x90-0x9F */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
  /* 0xA0-0xAF */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
  /* 0xB0-0xBF */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
  /* 0xC0-0xCF */ SIXTEEN(2),
  /* 0xD0-0xDF */ SIXTEEN(2),
  /* 0xE0-0xEF */ SIXTEEN(3),
  /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
                  tea::utf8::Status_UTF8_Invalid,
                  tea::utf8::Status_UTF8_Invalid
};
#undef SIXTEEN
Note that negative numbers indicate the type of error.
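A couple of concrete lookups (illustrative asserts, not library code):

assert(tea::utf8::leadByteTable[0x41] == 1); // 'A' is a one-byte sequence
assert(tea::utf8::leadByteTable[0xE2] == 3); // 0xE2 leads a three-byte sequence
assert(tea::utf8::leadByteTable[0x80] ==
       tea::utf8::Status_UTF8_Continuation); // 0x80 is only valid mid-sequence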
This gives us a trivial implementation of "countCodePoints()" for UTF-8 strings, assuming that unrecognised UTF-8 sequences are mapped to a single replacement codepoint:
size_t countCodePoints(const u8* begin, const u8* end) {
  assert(begin != nullptr);
  assert(end >= begin);
  size_t count = 0;
  while (begin < end) {
    count++;
    // Negative table entries (continuation or invalid bytes) count as a
    // single replacement codepoint and we advance by one byte
    s8 bytes = leadByteTable[*begin];
    begin += (bytes <= 0) ? 1 : bytes;
  }
  return count;
}
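For example, counting the codepoints of the UTF-8 encoding of "aé€" (an illustrative fragment, not library code):

// 'a' is one byte, 'é' is two and '€' is three: six bytes, three codepoints
static const tea::u8 text[] = { 0x61, 0xC3, 0xA9, 0xE2, 0x82, 0xAC };
assert(tea::utf8::optimistic::countCodePoints(text, text + sizeof(text)) == 3);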
The implementation of "decodeCodePoint()" to decode a single UTF-8 codepoint to UTF-32 is also relatively simple:
c32 decodeCodePoint(const u8*& ptr, const u8* end) {
  static const s32 bias[] = {
    0, 0, -12416, -925824, -63447168, 100130688, 2113396608
  };
  assert(ptr != nullptr);
  assert(ptr < end);
  u8 u1 = *(ptr++);
  s8 expected = leadByteTable[u1];
  // TEA_NELEMS(x) is the usual sizeof(x) / sizeof(x[0]) element-count macro
  assert((expected > 0) && (expected < TEA_NELEMS(bias)));
  s32 codepoint = u1;
  while (--expected > 0) {
    // Extract any continuation bytes
    assert(ptr < end);
    u8 u2 = *(ptr++);
    assert(leadByteTable[u2] == Status_UTF8_Continuation);
    codepoint = (codepoint << 6) + u2;
  }
  // Remove the lead/continuation marker bits in one go
  codepoint += bias[leadByteTable[u1]];
  assert(codepoint >= 0);
  return codepoint;
}
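Those bias values look like magic numbers, but they simply cancel the UTF-8 marker bits that the loop folds into the accumulator alongside the payload bits. For the two-byte sequence 0xC2 0xA9 (U+00A9):

// Accumulated: (0xC2 << 6) + 0xA9 = 12585
// Marker bits: (0xC0 << 6) + 0x80 = 12416, hence bias[2] == -12416
// Result:      12585 - 12416      = 169 == U+00A9

The five- and six-byte biases are the same idea reduced modulo 2^32, since the true values don't fit in an s32; such sequences can't occur for valid Unicode input anyway.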
Working out how many UTF-8 bytes are required to store a UTF-32 codepoint is the job of "measureCodePoint()":
size_t measureCodePoint(c32 codepoint) {
  // Number of significant bits in the codepoint (zero for a zero codepoint)
  int msb = tea::bits::scanMostSetBit(u32(codepoint)) + 1;
  assert((msb >= 0) && (msb < TEA_NELEMS(msbEncodedSize)));
  return msbEncodedSize[msb];
}
This requires a lookup table "msbEncodedSize" and a fast way of detecting the most-significant set bit of a 32-bit value:
const u8 msbEncodedSize[33] = {
  1,1,1,1,1,1,1,1, // 0000000000000000000000000XXXXXX1: 0-7 : One byte
  2,2,2,2,         // 000000000000000000000XXX1xxxxxxx: 8-11: Two bytes
  3,3,3,3,3,       // 0000000000000000XXXX1xxxxxxxxxxx: 12-16: Three bytes
  4,4,4,4,4,       // 00000000000XXXX1xxxxxxxxxxxxxxxx: 17-21: Four bytes
  5,5,5,5,5,       // 000000XXXX1xxxxxxxxxxxxxxxxxxxxx: 22-26: Five bytes
  6,6,6,6,6,       // 0XXXX1xxxxxxxxxxxxxxxxxxxxxxxxxx: 27-31: Six bytes
  0                // 1xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx: 32 : Invalid
};
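As a sanity check, the boundaries line up with the well-known UTF-8 ranges (again, illustrative asserts rather than library code):

assert(tea::utf8::optimistic::measureCodePoint(0x00007F) == 1);
assert(tea::utf8::optimistic::measureCodePoint(0x000080) == 2);
assert(tea::utf8::optimistic::measureCodePoint(0x0007FF) == 2);
assert(tea::utf8::optimistic::measureCodePoint(0x000800) == 3);
assert(tea::utf8::optimistic::measureCodePoint(0x00FFFF) == 3);
assert(tea::utf8::optimistic::measureCodePoint(0x010000) == 4);
assert(tea::utf8::optimistic::measureCodePoint(0x10FFFF) == 4);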
Obviously, the implementation of "scanMostSetBit()" is platform-dependent. For Microsoft compilers it is:
#include <intrin.h> // for _BitScanReverse

int tea::bits::scanMostSetBit(u32 value) {
  // Returns the index (31..0) of the most-significant non-zero bit,
  // or -1 if the input is zero
  unsigned long bit;
  return _BitScanReverse(&bit, value) ? int(bit) : -1;
}
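On GCC and Clang the equivalent uses a builtin; an untested sketch on my part, selected with the appropriate preprocessor checks:

int tea::bits::scanMostSetBit(u32 value) {
  // __builtin_clz is undefined for a zero argument, so handle that case first
  return (value != 0) ? 31 - __builtin_clz(value) : -1;
}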
Finally, encoding a UTF-32 codepoint to a UTF-8 sequence is a case of measuring the size and storing the bytes:
void encodeCodePointMeasured(u8* ptr, c32 codepoint, size_t bytes) {
  // bias[n] pre-positions the lead-byte marker bits (0xC0, 0xE0, 0xF0, 0xF8,
  // 0xFC) above the payload bits for an n-byte sequence
  static const u64 bias[7] = {
    0, 0, 12288, 917504, 62914560, 4160749568, 270582939648
  };
  assert(codepoint >= 0);
  assert(ptr != nullptr);
  assert((bytes > 0) && (bytes < TEA_NELEMS(bias)));
  u8* p = ptr + bytes;
  u64 bits = bias[bytes] + codepoint;
  while (bits > 0xFF) {
    // Peel off continuation bytes from the least-significant end
    *(--p) = u8((bits & 0x3F) + 0x80);
    bits >>= 6;
  }
  // What remains is the lead byte: marker bits plus the top payload bits
  *(--p) = u8(bits);
  assert(p == ptr);
}

u8* encodeCodePoint(u8* ptr, c32 codepoint) {
  assert(codepoint >= 0);
  assert(ptr != nullptr);
  size_t bytes = measureCodePoint(codepoint);
  assert(bytes > 0);
  encodeCodePointMeasured(ptr, codepoint, bytes);
  return ptr + bytes;
}
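Putting the optimistic routines together, a round trip looks like this (once more, an illustrative fragment rather than library code):

tea::u8 buffer[4];
const tea::c32 original = 0x1F600; // U+1F600, a four-byte sequence
tea::u8* end = tea::utf8::optimistic::encodeCodePoint(buffer, original);
assert(end == buffer + 4);
const tea::u8* ptr = buffer;
tea::c32 decoded = tea::utf8::optimistic::decodeCodePoint(ptr, end);
assert((decoded == original) && (ptr == end));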
Obviously, our assumptions about valid UTF-8 sequences and the lack of checks for buffer over-runs make these "optimistic" routines unsuitable for production code, unless we've pre-validated the sequences.
Next time, we'll discuss routines that handle those error conditions gracefully.