henrygab · April 28, 2020 22:47
diff --git a/test.cpp b/test.cpp
 /* 
 * The MIT License (MIT)
 * 
 * Copyright (c) 2020 Henry Gabryjelski
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */


 #include <inttypes.h>
 #include <stdio.h>
 #include <assert.h>

 const uint32_t CODEPOINT_LARGEST_VALID_UTF8     = 0x10FFFF;
 const uint32_t CODEPOINT_REPLACEMENT_CHARACTER  =   0xFFFD;
 const uint32_t CODEPOINT_WORD_JOINER            =   0x2060;
 const uint32_t CODEPOINT_LOWEST_SURROGATE_HALF  =   0xD800;
 const uint32_t CODEPOINT_HIGHEST_SURROGATE_HALF =   0xDFFF;


 constexpr static inline bool isInvalidUtf8Octet(uint8_t t) {
  // see bullets in https://tools.ietf.org/html/rfc3629#section-1
  return (t == 0xc0) || (t == 0xC1) || ((t >= 0xF5) && (t <= 0xFF));
 }
 constexpr static inline bool isCodepointSurrogateHalf(uint32_t codepoint) {
  // high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are
  // not legal Unicode values ... see RFC 3629.
  return ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF));
 }
 constexpr static inline bool isValidCodepointForUTF8(uint32_t codepoint) {
  if (isCodepointSurrogateHalf(codepoint)) {
    return false;
  } else if (codepoint > CODEPOINT_LARGEST_VALID_UTF8) {
    return false;
  } else {
    return true;
  }
 }
 constexpr static inline int  expectedEncodingLengthForUTF8(uint32_t codepoint) {
  if (!isValidCodepointForUTF8(codepoint)) {
    return -1;
  } else if (codepoint <= 0x00007F) {
    return 1;
  } else if (codepoint <= 0x0007FF) {
    return 2;
  } else if (codepoint <= 0x00FFFF) {
    return 3;
  } else if (codepoint <= 0x10FFFF) {
    static_assert(0x10FFFF == CODEPOINT_LARGEST_VALID_UTF8);
    return 4;
  } else {
    return -1; // belt and suspenders for validity
  }
 }




 /* Functions under test */
 static int8_t utf8Codepoint(const uint8_t *utf8, uint32_t *codepointp)
 {
  *codepointp = 0xFFFD; // always initialize output to known value ... 0xFFFD (REPLACEMENT CHARACTER) seems the natural choice
  int codepoint;
  int len;

  // The upper bits define both the length of additional bytes for the multi-byte encoding,
  // as well as defining how many bits of the first byte are included in the codepoint.
  // Each additional byte starts with 0b10xxxxxx, encoding six additional bits for the codepoint.
  //
  // For key summary points, see:
  // * https://tools.ietf.org/html/rfc3629#section-3
  //
  // if ((utf8[0] == 0xEF) && (utf8[1] == 0xBB) && (utf8[2] == 0xBF)) {
  //   // If the caller does not exclude the BOM (which is legal for UTF-8), convert it to U+2060 WORD JOINER
  //   *codepointp = CODEPOINT_WORD_JOINER;
  //   return 3;
  // }

  if (isInvalidUtf8Octet(utf8[0])) { // do not allow illegal octet sequences (e.g., 0xC0 0x80 should NOT decode to NULL)
    return -1;
  }

  if (utf8[0] < 0x80) {                   // characters 0000 0000..0000 007F (up to  7 significant bits)
    len = 1;
    codepoint = utf8[0];
  } else if ((utf8[0] & 0xe0) == 0xc0) {  // characters 0000 0080..0000 07FF (up to 11 significant bits, so first byte encodes five bits)
    len = 2;
    codepoint = utf8[0] & 0x1f;
  } else if ((utf8[0] & 0xf0) == 0xe0) {  // characters 0000 8000..0000 FFFF (up to 16 significant bits, so first byte encodes four bits)
    len = 3;
    codepoint = utf8[0] & 0x0f;
  } else if ((utf8[0] & 0xf8) == 0xf0) {  // characters 0001 0000..0010 FFFF (up to 21 significant bits, so first byte encodes three bits)
    len = 4;
    codepoint = utf8[0] & 0x07;
  } else {                                // UTF-8 is defined to only map to Unicode -- 0x00000000..0x0010FFFF
    // 5-byte and 6-byte sequences are not legal per RFC3629
    return -1;
  }

  for (int i = 1; i < len; i++) {
    if ((utf8[i] & 0xc0) != 0x80) {
      // the additional bytes in a valid UTF-8 multi-byte encoding cannot have either of the top two bits set
      // This is more restrictive than isInvalidUtf8Octet()
      return -1;
    }
    codepoint <<= 6;             // each continuation byte adds six bits to the codepoint
    codepoint |= utf8[i] & 0x3f; // mask off the top two continuation bits, and add the six relevant bits
  }

  // explicit validation to prevent overlong encodings
  if (       (len == 1) && ((codepoint < 0x000000) || (codepoint > 0x00007F))) {
    return -1;
  } else if ((len == 2) && ((codepoint < 0x000080) || (codepoint > 0x0007FF))) {
    return -1;
  } else if ((len == 3) && ((codepoint < 0x000800) || (codepoint > 0x00FFFF))) {
    return -1;
  } else if ((len == 4) && ((codepoint < 0x010000) || (codepoint > 0x10FFFF))) {
    // "You might expect larger code points than U+10FFFF
    // to be expressible, but Unicode is limited in Sections 12
    // of RFC3629 to match the limits of UTF-16." -- Wikipedia UTF-8 note
    // See https://tools.ietf.org/html/rfc3629#section-12
    return -1;
  }

  // high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are
  // not legal Unicode values ... see RFC 3629.
  if ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF)) {
    return -1;
  }

  *codepointp = codepoint;
  return len;
 }

 /* How to test those? */
 /*
   1. From i = 0 through at least 0x10FFFF + 1 ....
   2. 
 */
 uint8_t bufferUnderTest[8] = {0,0,0,0,0}; // up to four bytes + NULL
 void PrintBufferUnderTest(int encodingLength, bool outputNewline = true)
 {
  printf("Buffer under test (length %d): %02x %02x %02x %02x %02x", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]);
  if (outputNewline) { printf("\n"); }
 }
 void ValidateDecoder()
 {
  // Simple!  Just increment the four-byte values, and get the last code point.
  // The decoder should ONLY decode 0x10FFFF total codepoints, and each one
  // decoded should be exactly one greater than the prior decoded value.
  // Exceptions:
  // * Surrogate halves
  // * Byte order mark (0xEF 0xBB 0xBF) --> 0x8288 (WORD JOINER)

  uint32_t lastDecodedCodePoint = 0xFFFFFFFF; // invalid, and +1 will be zero

  int8_t encodingLength = 1;
  bool finished = false;
  bool anyFailure = false;


  while (!finished) {
    uint32_t currentCodepoint = 0xFFFFFFFF;
    int8_t len = utf8Codepoint(bufferUnderTest, &currentCodepoint);

    // Check the result of attempt to decode
    if (len < 0) {
      // do nothing.  This is a consequence of the optimized testing without having
      // know if all inputs are valid.
    } else if (len < encodingLength) {
      // the reported encoded length is smaller ... this is not by itself a failure, and can be ignored
      // as it was already handled by prior testing of the smaller length buffers.
    } else if (len > encodingLength) {
      assert(!"Reported success decoding, but length is greater than expected");
    } else {
      // function reported successful decoding.
      // This is a failure unless one of the following two conditions hold:
      bool isActuallyFailure = true;
      if (lastDecodedCodePoint == (CODEPOINT_LOWEST_SURROGATE_HALF-1) && (CODEPOINT_HIGHEST_SURROGATE_HALF+1) == currentCodepoint) {
        isActuallyFailure = false;
      }
      if (lastDecodedCodePoint + 1 == currentCodepoint) {
        isActuallyFailure = false;
      }
      //PrintBufferUnderTest(encodingLength, false);
      //printf(" ==> %08x (%" PRIu32 ") %s\n", currentCodepoint, currentCodepoint, (isActuallyFailure ? "(failure)" : ""));
      // if (currentCodepoint == CODEPOINT_WORD_JOINER && bufferUnderTest[0] == 0xEF && bufferUnderTest[1] == 0xBB && bufferUnderTest[2] == 0xBE) {
      //   // Special-case ... byte order mark is allowed to convert to CODEPOINT_WORD_JOINER
      //   // This is neither failure (assertion) nor success (update of last decoded code point) ... should be ignored.
      // } else
      if (isActuallyFailure) {
        printf("Failure: prior decoded code point %08x (%" PRIu32 "), this decoded code point is %08x (%" PRIu32 ")\n", lastDecodedCodePoint, lastDecodedCodePoint, currentCodepoint, currentCodepoint);
        PrintBufferUnderTest(encodingLength);
        assert(!"failure vs. last decoded");
        anyFailure = true;
      } else {
        lastDecodedCodePoint = currentCodepoint;
      }
    }

    if (true) { // update encodingLength / LOOP EXIT CONDITION
      bool updateEncodingLength = true;
      for (int i = 0; i < encodingLength; ++i) {
        if (bufferUnderTest[i] != 0xFF) {
          updateEncodingLength = false;
        }
      }
      if (updateEncodingLength && encodingLength == 4) {
        finished = true;
      }
      
      if (!finished && updateEncodingLength) {
        if (anyFailure) {
          finished = true;
        }
        printf("Updating encoding length from %d to %d\n", encodingLength, encodingLength+1);
        bufferUnderTest[encodingLength] = 0xFF; // hack so incrementing uses common code
        bufferUnderTest[0]              = 0x00; // hack so does not test all the zero-
        ++encodingLength;
      }
    }

    if (!finished) { // increment the byte(s) so all inputs are tested
      bool showSomethingMore = false;
      for (int i = encodingLength-1; (i >= 0); --i) {
        bufferUnderTest[i] += 1;
        if (bufferUnderTest[i] != 0x00) break; // only increment least significant byte(s)
        if (encodingLength > 2 && i <= encodingLength-2) {
          showSomethingMore = true;
        }
      }
      if (showSomethingMore) {
        printf("Now testing buffer of length %d: %02x %02x %02x %02x %02x\n", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]);
      }
    }
  }

  if (lastDecodedCodePoint != CODEPOINT_LARGEST_VALID_UTF8) {
    printf("Last decoded code point did not reach expected value (%08x != %08x)\r\n", lastDecodedCodePoint, CODEPOINT_LARGEST_VALID_UTF8);
    assert(false);
  }
  printf("\nSUCCESS!\n");
 }

 void main(void) {
  ValidateDecoder();
 }


 /* The function utf8Codepoint() comes from https://github.com/adafruit/Adafruit_TinyUSB_ArduinoCore
   and requires the following notice be present, if this is considered a modified copy.
   In an abundance of caution, the text is thus provided for reference purposes below.

 * The MIT License (MIT)
 * 
 * Copyright (c) 2019 Ha Thach for Adafruit Industries
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
	/*
	* The MIT License (MIT)
	*
	* Copyright (c) 2020 Henry Gabryjelski
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*/


	#include <inttypes.h>
	#include <stdio.h>
	#include <assert.h>

	const uint32_t CODEPOINT_LARGEST_VALID_UTF8 = 0x10FFFF;
	const uint32_t CODEPOINT_REPLACEMENT_CHARACTER = 0xFFFD;
	const uint32_t CODEPOINT_WORD_JOINER = 0x2060;
	const uint32_t CODEPOINT_LOWEST_SURROGATE_HALF = 0xD800;
	const uint32_t CODEPOINT_HIGHEST_SURROGATE_HALF = 0xDFFF;


	constexpr static inline bool isInvalidUtf8Octet(uint8_t t) {
	// see bullets in https://tools.ietf.org/html/rfc3629#section-1
	return (t == 0xc0) \|\| (t == 0xC1) \|\| ((t >= 0xF5) && (t <= 0xFF));
	}
	constexpr static inline bool isCodepointSurrogateHalf(uint32_t codepoint) {
	// high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are
	// not legal Unicode values ... see RFC 3629.
	return ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF));
	}
	constexpr static inline bool isValidCodepointForUTF8(uint32_t codepoint) {
	if (isCodepointSurrogateHalf(codepoint)) {
	return false;
	} else if (codepoint > CODEPOINT_LARGEST_VALID_UTF8) {
	return false;
	} else {
	return true;
	}
	}
	constexpr static inline int expectedEncodingLengthForUTF8(uint32_t codepoint) {
	if (!isValidCodepointForUTF8(codepoint)) {
	return -1;
	} else if (codepoint <= 0x00007F) {
	return 1;
	} else if (codepoint <= 0x0007FF) {
	return 2;
	} else if (codepoint <= 0x00FFFF) {
	return 3;
	} else if (codepoint <= 0x10FFFF) {
	static_assert(0x10FFFF == CODEPOINT_LARGEST_VALID_UTF8);
	return 4;
	} else {
	return -1; // belt and suspenders for validity
	}
	}




	/* Functions under test */
	static int8_t utf8Codepoint(const uint8_t utf8, uint32_t codepointp)
	{
	*codepointp = 0xFFFD; // always initialize output to known value ... 0xFFFD (REPLACEMENT CHARACTER) seems the natural choice
	int codepoint;
	int len;

	// The upper bits define both the length of additional bytes for the multi-byte encoding,
	// as well as defining how many bits of the first byte are included in the codepoint.
	// Each additional byte starts with 0b10xxxxxx, encoding six additional bits for the codepoint.
	//
	// For key summary points, see:
	// * https://tools.ietf.org/html/rfc3629#section-3
	//
	// if ((utf8[0] == 0xEF) && (utf8[1] == 0xBB) && (utf8[2] == 0xBF)) {
	// // If the caller does not exclude the BOM (which is legal for UTF-8), convert it to U+2060 WORD JOINER
	// *codepointp = CODEPOINT_WORD_JOINER;
	// return 3;
	// }

	if (isInvalidUtf8Octet(utf8[0])) { // do not allow illegal octet sequences (e.g., 0xC0 0x80 should NOT decode to NULL)
	return -1;
	}

	if (utf8[0] < 0x80) { // characters 0000 0000..0000 007F (up to 7 significant bits)
	len = 1;
	codepoint = utf8[0];
	} else if ((utf8[0] & 0xe0) == 0xc0) { // characters 0000 0080..0000 07FF (up to 11 significant bits, so first byte encodes five bits)
	len = 2;
	codepoint = utf8[0] & 0x1f;
	} else if ((utf8[0] & 0xf0) == 0xe0) { // characters 0000 8000..0000 FFFF (up to 16 significant bits, so first byte encodes four bits)
	len = 3;
	codepoint = utf8[0] & 0x0f;
	} else if ((utf8[0] & 0xf8) == 0xf0) { // characters 0001 0000..0010 FFFF (up to 21 significant bits, so first byte encodes three bits)
	len = 4;
	codepoint = utf8[0] & 0x07;
	} else { // UTF-8 is defined to only map to Unicode -- 0x00000000..0x0010FFFF
	// 5-byte and 6-byte sequences are not legal per RFC3629
	return -1;
	}

	for (int i = 1; i < len; i++) {
	if ((utf8[i] & 0xc0) != 0x80) {
	// the additional bytes in a valid UTF-8 multi-byte encoding cannot have either of the top two bits set
	// This is more restrictive than isInvalidUtf8Octet()
	return -1;
	}
	codepoint <<= 6; // each continuation byte adds six bits to the codepoint
	codepoint \|= utf8[i] & 0x3f; // mask off the top two continuation bits, and add the six relevant bits
	}

	// explicit validation to prevent overlong encodings
	if ( (len == 1) && ((codepoint < 0x000000) \|\| (codepoint > 0x00007F))) {
	return -1;
	} else if ((len == 2) && ((codepoint < 0x000080) \|\| (codepoint > 0x0007FF))) {
	return -1;
	} else if ((len == 3) && ((codepoint < 0x000800) \|\| (codepoint > 0x00FFFF))) {
	return -1;
	} else if ((len == 4) && ((codepoint < 0x010000) \|\| (codepoint > 0x10FFFF))) {
	// "You might expect larger code points than U+10FFFF
	// to be expressible, but Unicode is limited in Sections 12
	// of RFC3629 to match the limits of UTF-16." -- Wikipedia UTF-8 note
	// See https://tools.ietf.org/html/rfc3629#section-12
	return -1;
	}

	// high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are
	// not legal Unicode values ... see RFC 3629.
	if ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF)) {
	return -1;
	}

	*codepointp = codepoint;
	return len;
	}

	/* How to test those? */
	/*
	1. From i = 0 through at least 0x10FFFF + 1 ....
	2.
	*/
	uint8_t bufferUnderTest[8] = {0,0,0,0,0}; // up to four bytes + NULL
	void PrintBufferUnderTest(int encodingLength, bool outputNewline = true)
	{
	printf("Buffer under test (length %d): %02x %02x %02x %02x %02x", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]);
	if (outputNewline) { printf("\n"); }
	}
	void ValidateDecoder()
	{
	// Simple! Just increment the four-byte values, and get the last code point.
	// The decoder should ONLY decode 0x10FFFF total codepoints, and each one
	// decoded should be exactly one greater than the prior decoded value.
	// Exceptions:
	// * Surrogate halves
	// * Byte order mark (0xEF 0xBB 0xBF) --> 0x8288 (WORD JOINER)

	uint32_t lastDecodedCodePoint = 0xFFFFFFFF; // invalid, and +1 will be zero

	int8_t encodingLength = 1;
	bool finished = false;
	bool anyFailure = false;


	while (!finished) {
	uint32_t currentCodepoint = 0xFFFFFFFF;
	int8_t len = utf8Codepoint(bufferUnderTest, &currentCodepoint);

	// Check the result of attempt to decode
	if (len < 0) {
	// do nothing. This is a consequence of the optimized testing without having
	// know if all inputs are valid.
	} else if (len < encodingLength) {
	// the reported encoded length is smaller ... this is not by itself a failure, and can be ignored
	// as it was already handled by prior testing of the smaller length buffers.
	} else if (len > encodingLength) {
	assert(!"Reported success decoding, but length is greater than expected");
	} else {
	// function reported successful decoding.
	// This is a failure unless one of the following two conditions hold:
	bool isActuallyFailure = true;
	if (lastDecodedCodePoint == (CODEPOINT_LOWEST_SURROGATE_HALF-1) && (CODEPOINT_HIGHEST_SURROGATE_HALF+1) == currentCodepoint) {
	isActuallyFailure = false;
	}
	if (lastDecodedCodePoint + 1 == currentCodepoint) {
	isActuallyFailure = false;
	}
	//PrintBufferUnderTest(encodingLength, false);
	//printf(" ==> %08x (%" PRIu32 ") %s\n", currentCodepoint, currentCodepoint, (isActuallyFailure ? "(failure)" : ""));
	// if (currentCodepoint == CODEPOINT_WORD_JOINER && bufferUnderTest[0] == 0xEF && bufferUnderTest[1] == 0xBB && bufferUnderTest[2] == 0xBE) {
	// // Special-case ... byte order mark is allowed to convert to CODEPOINT_WORD_JOINER
	// // This is neither failure (assertion) nor success (update of last decoded code point) ... should be ignored.
	// } else
	if (isActuallyFailure) {
	printf("Failure: prior decoded code point %08x (%" PRIu32 "), this decoded code point is %08x (%" PRIu32 ")\n", lastDecodedCodePoint, lastDecodedCodePoint, currentCodepoint, currentCodepoint);
	PrintBufferUnderTest(encodingLength);
	assert(!"failure vs. last decoded");
	anyFailure = true;
	} else {
	lastDecodedCodePoint = currentCodepoint;
	}
	}

	if (true) { // update encodingLength / LOOP EXIT CONDITION
	bool updateEncodingLength = true;
	for (int i = 0; i < encodingLength; ++i) {
	if (bufferUnderTest[i] != 0xFF) {
	updateEncodingLength = false;
	}
	}
	if (updateEncodingLength && encodingLength == 4) {
	finished = true;
	}

	if (!finished && updateEncodingLength) {
	if (anyFailure) {
	finished = true;
	}
	printf("Updating encoding length from %d to %d\n", encodingLength, encodingLength+1);
	bufferUnderTest[encodingLength] = 0xFF; // hack so incrementing uses common code
	bufferUnderTest[0] = 0x00; // hack so does not test all the zero-
	++encodingLength;
	}
	}

	if (!finished) { // increment the byte(s) so all inputs are tested
	bool showSomethingMore = false;
	for (int i = encodingLength-1; (i >= 0); --i) {
	bufferUnderTest[i] += 1;
	if (bufferUnderTest[i] != 0x00) break; // only increment least significant byte(s)
	if (encodingLength > 2 && i <= encodingLength-2) {
	showSomethingMore = true;
	}
	}
	if (showSomethingMore) {
	printf("Now testing buffer of length %d: %02x %02x %02x %02x %02x\n", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]);
	}
	}
	}

	if (lastDecodedCodePoint != CODEPOINT_LARGEST_VALID_UTF8) {
	printf("Last decoded code point did not reach expected value (%08x != %08x)\r\n", lastDecodedCodePoint, CODEPOINT_LARGEST_VALID_UTF8);
	assert(false);
	}
	printf("\nSUCCESS!\n");
	}

	void main(void) {
	ValidateDecoder();
	}


	/* The function utf8Codepoint() comes from https://github.com/adafruit/Adafruit_TinyUSB_ArduinoCore
	and requires the following notice be present, if this is considered a modified copy.
	In an abundance of caution, the text is thus provided for reference purposes below.

	* The MIT License (MIT)
	*
	* Copyright (c) 2019 Ha Thach for Adafruit Industries
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*/