Created
December 10, 2019 06:38
-
-
Save jkeiser/a3dcadd2770fb422d114baa510bbc6c2 to your computer and use it in GitHub Desktop.
UTF-8 lookup ARM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdint.h> | |
#include <arm_neon.h> | |
// | |
// Detect Unicode errors. | |
// | |
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic | |
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits | |
// are straight up concatenated into the final value. The first byte of a multibyte character is a | |
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte | |
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just | |
// start with 0, because that's what ASCII looks like. Here's what each size | |
// | |
// - ASCII (7 bits): 0_______ | |
// - 2 byte character (11 bits): 110_____ 10______ | |
// - 3 byte character (17 bits): 1110____ 10______ 10______ | |
// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______ | |
// - 5+ byte character (illegal): 11111___ <illegal> | |
// | |
// There are 5 classes of error that can happen in Unicode: | |
// | |
// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). | |
// We detect this by looking for new characters (lead bytes) inside the range of a multibyte | |
// character. | |
// | |
// e.g. 11000000 01100001 (2-byte character where second byte is ASCII) | |
// | |
// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). | |
// We detect this by requiring that the next byte after your multibyte character be a new | |
// character--so a continuation after your character is wrong. | |
// | |
// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) | |
// | |
// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. | |
// | |
// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). | |
// | |
// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have | |
// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is | |
// technically possible, but UTF-8 disallows it so that there is only one way to write an "a". | |
// | |
// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) | |
// | |
// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and | |
// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8. | |
// | |
// e.g. 11101101 10100000 10000000 (U+D800) | |
// | |
// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not | |
// support values with more than 23 bits (which a 4-byte character supports). | |
// | |
// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) | |
// | |
// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: | |
// | |
// Code Points 1st 2s 3s 4s | |
// U+0000..U+007F 00..7F | |
// U+0080..U+07FF C2..DF 80..BF | |
// U+0800..U+0FFF E0 A0..BF 80..BF | |
// U+1000..U+CFFF E1..EC 80..BF 80..BF | |
// U+D000..U+D7FF ED 80..9F 80..BF | |
// U+E000..U+FFFF EE..EF 80..BF 80..BF | |
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF | |
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF | |
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF | |
// | |
bool check_utf8(const char* buf, size_t len) { | |
uint8x16_t error; | |
uint8x16_t prev_input; | |
uint8x16_t prev_incomplete; | |
for (size_t i=0; i<len; i+=sizeof(uint8x16_t)) { | |
uint8x16_t input = vmovq_n_u8(&buf[i]); | |
// | |
// Check ASCII | |
// | |
if (vmaxvq_u8(input) < 0b10000000u) { | |
error = vorrq_u8(error, prev_incomplete); | |
continue; | |
} | |
// | |
// Match the "special case" exceptions like surrogates and overlong encodings using a trio | |
// of table lookups &'d together. | |
// | |
uint8x16_t prev1 = vextq_s8(prev_input, input, 16 - 1); | |
// These are the errors we're going to match for bytes 1-2, by looking at the first three | |
// nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2> | |
static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way) | |
static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________ | |
static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________ | |
static const int SURROGATE = 0x08; // 11101101 [101_]____ | |
static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____ | |
static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______ | |
// After processing the rest of byte 1 (the low bits), we're still not done--we have to check | |
// byte 2 to be sure which things are errors and which aren't. | |
// Since high_bits is byte 5, byte 2 is high_bits.prev<3> | |
static const int CARRY = OVERLONG_2 | TOO_LARGE_2; | |
uint8x16_t byte_2_high = vqtbl1q_u8( | |
uint8x16_t{ | |
// ASCII: ________ [0___]____ | |
CARRY, CARRY, CARRY, CARRY, | |
// ASCII: ________ [0___]____ | |
CARRY, CARRY, CARRY, CARRY, | |
// Continuations: ________ [10__]____ | |
CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____ | |
CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____ | |
CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____ | |
CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____ | |
// Multibyte Leads: ________ [11__]____ | |
CARRY, CARRY, CARRY, CARRY | |
}, | |
vshrq_n_u8(input, 4) | |
); | |
uint8x16_t byte_1_high = vqtbl1q_u8( | |
uint8x16_t{ | |
// [0___]____ (ASCII) | |
0, 0, 0, 0, | |
0, 0, 0, 0, | |
// [10__]____ (continuation) | |
0, 0, 0, 0, | |
// [11__]____ (2+-byte leads) | |
OVERLONG_2, 0, // [110_]____ (2-byte lead) | |
OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead) | |
OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead) | |
}, | |
vshrq_n_u8(prev1, 4) | |
); | |
uint8x16_t byte_1_low = vqtlb1q_u8( | |
uint8x16_t{ | |
// ____[00__] ________ | |
OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________ | |
OVERLONG_2, // ____[0001] ________ | |
0, 0, | |
// ____[01__] ________ | |
TOO_LARGE, // ____[0100] ________ | |
TOO_LARGE_2, | |
TOO_LARGE_2, | |
TOO_LARGE_2, | |
// ____[10__] ________ | |
TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, | |
// ____[11__] ________ | |
TOO_LARGE_2, | |
TOO_LARGE_2 | SURROGATE, // ____[1101] ________ | |
TOO_LARGE_2, TOO_LARGE_2 | |
}, | |
vandq_u8(prev1, vdupq_n_u8(0x0F)) | |
) | |
error = vorrq_u8(error, vandq_u8(byte_1_high, vandq_u8(byte_1_low, byte_2_high))); | |
// | |
// Validate that we have continuations iff we want them | |
// | |
uint8x16_t prev2 = vextq_s8(prev_input, input, 16 - 2); | |
uint8x16_t prev3 = vextq_s8(prev_input, input, 16 - 3); | |
// Cont is 10000000-101111111 (-65...-128). Use signed comparison. | |
uint8x16_t is_continuation = vcltq_s8(vreinterpretq_s8_u8(input), vdupq_n_s8(-64)); | |
uint8x16_t is_second_byte = vcgeq_u8(prev1, vdupq_n_u8(0b11000000u)); | |
uint8x16_t is_third_byte = vcgeq_u8(prev2, vdupq_n_u8(0b11100000u)); | |
uint8x16_t is_fourth_byte = vcgeq_u8(prev3, vdupq_n_u8(0b11110000u)); | |
// Use ^ instead of | for is_second_byte | is_third_byte | is_fourth_byte, because ^ is | |
// commutative. We can do this because we only have to report errors for cases with NO lead bytes, | |
// or exactly ONE, in which case ^ and | yield the same result. | |
error = vorrq_u8(error, | |
veorq_u8( | |
veorq_u8(is_continuation, is_second_byte), | |
veorq_u8(is_third_byte, is_fourth_byte) | |
) | |
); | |
// | |
// Prepare next iteration | |
// | |
// If there is an unfinished multibyte character, record that here (ASCII check will use it): | |
prev_incomplete = vcgt_u8(input, vuint8x16_t{ | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 | |
}); | |
prev_input = input; | |
} | |
// If any error bits are on, this will be > 0 | |
return vmaxvq_u8(error); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment