Created
April 28, 2020 22:47
-
-
Save henrygab/92a7077e4cc9b3e6b098ecaa5fa2d908 to your computer and use it in GitHub Desktop.
Brute-force validation of UTF-8 decoder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* The MIT License (MIT) | |
* | |
* Copyright (c) 2020 Henry Gabryjelski | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ | |
#include <inttypes.h> | |
#include <stdio.h> | |
#include <assert.h> | |
const uint32_t CODEPOINT_LARGEST_VALID_UTF8 = 0x10FFFF; | |
const uint32_t CODEPOINT_REPLACEMENT_CHARACTER = 0xFFFD; | |
const uint32_t CODEPOINT_WORD_JOINER = 0x2060; | |
const uint32_t CODEPOINT_LOWEST_SURROGATE_HALF = 0xD800; | |
const uint32_t CODEPOINT_HIGHEST_SURROGATE_HALF = 0xDFFF; | |
constexpr static inline bool isInvalidUtf8Octet(uint8_t t) { | |
// see bullets in https://tools.ietf.org/html/rfc3629#section-1 | |
return (t == 0xc0) || (t == 0xC1) || ((t >= 0xF5) && (t <= 0xFF)); | |
} | |
constexpr static inline bool isCodepointSurrogateHalf(uint32_t codepoint) { | |
// high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are | |
// not legal Unicode values ... see RFC 3629. | |
return ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF)); | |
} | |
constexpr static inline bool isValidCodepointForUTF8(uint32_t codepoint) { | |
if (isCodepointSurrogateHalf(codepoint)) { | |
return false; | |
} else if (codepoint > CODEPOINT_LARGEST_VALID_UTF8) { | |
return false; | |
} else { | |
return true; | |
} | |
} | |
constexpr static inline int expectedEncodingLengthForUTF8(uint32_t codepoint) { | |
if (!isValidCodepointForUTF8(codepoint)) { | |
return -1; | |
} else if (codepoint <= 0x00007F) { | |
return 1; | |
} else if (codepoint <= 0x0007FF) { | |
return 2; | |
} else if (codepoint <= 0x00FFFF) { | |
return 3; | |
} else if (codepoint <= 0x10FFFF) { | |
static_assert(0x10FFFF == CODEPOINT_LARGEST_VALID_UTF8); | |
return 4; | |
} else { | |
return -1; // belt and suspenders for validity | |
} | |
} | |
/* Functions under test */ | |
static int8_t utf8Codepoint(const uint8_t *utf8, uint32_t *codepointp) | |
{ | |
*codepointp = 0xFFFD; // always initialize output to known value ... 0xFFFD (REPLACEMENT CHARACTER) seems the natural choice | |
int codepoint; | |
int len; | |
// The upper bits define both the length of additional bytes for the multi-byte encoding, | |
// as well as defining how many bits of the first byte are included in the codepoint. | |
// Each additional byte starts with 0b10xxxxxx, encoding six additional bits for the codepoint. | |
// | |
// For key summary points, see: | |
// * https://tools.ietf.org/html/rfc3629#section-3 | |
// | |
// if ((utf8[0] == 0xEF) && (utf8[1] == 0xBB) && (utf8[2] == 0xBF)) { | |
// // If the caller does not exclude the BOM (which is legal for UTF-8), convert it to U+2060 WORD JOINER | |
// *codepointp = CODEPOINT_WORD_JOINER; | |
// return 3; | |
// } | |
if (isInvalidUtf8Octet(utf8[0])) { // do not allow illegal octet sequences (e.g., 0xC0 0x80 should NOT decode to NULL) | |
return -1; | |
} | |
if (utf8[0] < 0x80) { // characters 0000 0000..0000 007F (up to 7 significant bits) | |
len = 1; | |
codepoint = utf8[0]; | |
} else if ((utf8[0] & 0xe0) == 0xc0) { // characters 0000 0080..0000 07FF (up to 11 significant bits, so first byte encodes five bits) | |
len = 2; | |
codepoint = utf8[0] & 0x1f; | |
} else if ((utf8[0] & 0xf0) == 0xe0) { // characters 0000 8000..0000 FFFF (up to 16 significant bits, so first byte encodes four bits) | |
len = 3; | |
codepoint = utf8[0] & 0x0f; | |
} else if ((utf8[0] & 0xf8) == 0xf0) { // characters 0001 0000..0010 FFFF (up to 21 significant bits, so first byte encodes three bits) | |
len = 4; | |
codepoint = utf8[0] & 0x07; | |
} else { // UTF-8 is defined to only map to Unicode -- 0x00000000..0x0010FFFF | |
// 5-byte and 6-byte sequences are not legal per RFC3629 | |
return -1; | |
} | |
for (int i = 1; i < len; i++) { | |
if ((utf8[i] & 0xc0) != 0x80) { | |
// the additional bytes in a valid UTF-8 multi-byte encoding cannot have either of the top two bits set | |
// This is more restrictive than isInvalidUtf8Octet() | |
return -1; | |
} | |
codepoint <<= 6; // each continuation byte adds six bits to the codepoint | |
codepoint |= utf8[i] & 0x3f; // mask off the top two continuation bits, and add the six relevant bits | |
} | |
// explicit validation to prevent overlong encodings | |
if ( (len == 1) && ((codepoint < 0x000000) || (codepoint > 0x00007F))) { | |
return -1; | |
} else if ((len == 2) && ((codepoint < 0x000080) || (codepoint > 0x0007FF))) { | |
return -1; | |
} else if ((len == 3) && ((codepoint < 0x000800) || (codepoint > 0x00FFFF))) { | |
return -1; | |
} else if ((len == 4) && ((codepoint < 0x010000) || (codepoint > 0x10FFFF))) { | |
// "You might expect larger code points than U+10FFFF | |
// to be expressible, but Unicode is limited in Sections 12 | |
// of RFC3629 to match the limits of UTF-16." -- Wikipedia UTF-8 note | |
// See https://tools.ietf.org/html/rfc3629#section-12 | |
return -1; | |
} | |
// high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are | |
// not legal Unicode values ... see RFC 3629. | |
if ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF)) { | |
return -1; | |
} | |
*codepointp = codepoint; | |
return len; | |
} | |
/* How to test those? */ | |
/* | |
1. From i = 0 through at least 0x10FFFF + 1 .... | |
2. | |
*/ | |
uint8_t bufferUnderTest[8] = {0,0,0,0,0}; // up to four bytes + NULL | |
void PrintBufferUnderTest(int encodingLength, bool outputNewline = true) | |
{ | |
printf("Buffer under test (length %d): %02x %02x %02x %02x %02x", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]); | |
if (outputNewline) { printf("\n"); } | |
} | |
void ValidateDecoder() | |
{ | |
// Simple! Just increment the four-byte values, and get the last code point. | |
// The decoder should ONLY decode 0x10FFFF total codepoints, and each one | |
// decoded should be exactly one greater than the prior decoded value. | |
// Exceptions: | |
// * Surrogate halves | |
// * Byte order mark (0xEF 0xBB 0xBF) --> 0x8288 (WORD JOINER) | |
uint32_t lastDecodedCodePoint = 0xFFFFFFFF; // invalid, and +1 will be zero | |
int8_t encodingLength = 1; | |
bool finished = false; | |
bool anyFailure = false; | |
while (!finished) { | |
uint32_t currentCodepoint = 0xFFFFFFFF; | |
int8_t len = utf8Codepoint(bufferUnderTest, ¤tCodepoint); | |
// Check the result of attempt to decode | |
if (len < 0) { | |
// do nothing. This is a consequence of the optimized testing without having | |
// know if all inputs are valid. | |
} else if (len < encodingLength) { | |
// the reported encoded length is smaller ... this is not by itself a failure, and can be ignored | |
// as it was already handled by prior testing of the smaller length buffers. | |
} else if (len > encodingLength) { | |
assert(!"Reported success decoding, but length is greater than expected"); | |
} else { | |
// function reported successful decoding. | |
// This is a failure unless one of the following two conditions hold: | |
bool isActuallyFailure = true; | |
if (lastDecodedCodePoint == (CODEPOINT_LOWEST_SURROGATE_HALF-1) && (CODEPOINT_HIGHEST_SURROGATE_HALF+1) == currentCodepoint) { | |
isActuallyFailure = false; | |
} | |
if (lastDecodedCodePoint + 1 == currentCodepoint) { | |
isActuallyFailure = false; | |
} | |
//PrintBufferUnderTest(encodingLength, false); | |
//printf(" ==> %08x (%" PRIu32 ") %s\n", currentCodepoint, currentCodepoint, (isActuallyFailure ? "(failure)" : "")); | |
// if (currentCodepoint == CODEPOINT_WORD_JOINER && bufferUnderTest[0] == 0xEF && bufferUnderTest[1] == 0xBB && bufferUnderTest[2] == 0xBE) { | |
// // Special-case ... byte order mark is allowed to convert to CODEPOINT_WORD_JOINER | |
// // This is neither failure (assertion) nor success (update of last decoded code point) ... should be ignored. | |
// } else | |
if (isActuallyFailure) { | |
printf("Failure: prior decoded code point %08x (%" PRIu32 "), this decoded code point is %08x (%" PRIu32 ")\n", lastDecodedCodePoint, lastDecodedCodePoint, currentCodepoint, currentCodepoint); | |
PrintBufferUnderTest(encodingLength); | |
assert(!"failure vs. last decoded"); | |
anyFailure = true; | |
} else { | |
lastDecodedCodePoint = currentCodepoint; | |
} | |
} | |
if (true) { // update encodingLength / LOOP EXIT CONDITION | |
bool updateEncodingLength = true; | |
for (int i = 0; i < encodingLength; ++i) { | |
if (bufferUnderTest[i] != 0xFF) { | |
updateEncodingLength = false; | |
} | |
} | |
if (updateEncodingLength && encodingLength == 4) { | |
finished = true; | |
} | |
if (!finished && updateEncodingLength) { | |
if (anyFailure) { | |
finished = true; | |
} | |
printf("Updating encoding length from %d to %d\n", encodingLength, encodingLength+1); | |
bufferUnderTest[encodingLength] = 0xFF; // hack so incrementing uses common code | |
bufferUnderTest[0] = 0x00; // hack so does not test all the zero- | |
++encodingLength; | |
} | |
} | |
if (!finished) { // increment the byte(s) so all inputs are tested | |
bool showSomethingMore = false; | |
for (int i = encodingLength-1; (i >= 0); --i) { | |
bufferUnderTest[i] += 1; | |
if (bufferUnderTest[i] != 0x00) break; // only increment least significant byte(s) | |
if (encodingLength > 2 && i <= encodingLength-2) { | |
showSomethingMore = true; | |
} | |
} | |
if (showSomethingMore) { | |
printf("Now testing buffer of length %d: %02x %02x %02x %02x %02x\n", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]); | |
} | |
} | |
} | |
if (lastDecodedCodePoint != CODEPOINT_LARGEST_VALID_UTF8) { | |
printf("Last decoded code point did not reach expected value (%08x != %08x)\r\n", lastDecodedCodePoint, CODEPOINT_LARGEST_VALID_UTF8); | |
assert(false); | |
} | |
printf("\nSUCCESS!\n"); | |
} | |
void main(void) { | |
ValidateDecoder(); | |
} | |
/* The function utf8Codepoint() comes from https://github.com/adafruit/Adafruit_TinyUSB_ArduinoCore | |
and requires the following notice be present, if this is considered a modified copy. | |
In an abundance of caution, the text is thus provided for reference purposes below. | |
* The MIT License (MIT) | |
* | |
* Copyright (c) 2019 Ha Thach for Adafruit Industries | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment