Last active
November 17, 2025 04:30
-
-
Save MaskRay/80b705d903688870bac96da64e7e243b to your computer and use it in GitHub Desktop.
variants of Little Endian Base 128 where the length information is in the first byte
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <assert.h> | |
| #include <inttypes.h> | |
| #include <stdbit.h> | |
| #include <stdio.h> | |
| #include <string.h> | |
| /* | |
| Big-endian prefix varint encoding for 64-bit integers. | |
| The first byte determines the total length using most significant bits. | |
| Leading ones indicate continuation bytes that follow. | |
| Format: | |
| 0xxxxxxx: 7 value bits, 1 byte total | |
| 10xxxxxx xxxxxxxx: 14 value bits, 2 bytes total | |
| 110xxxxx xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes total | |
| 1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes total | |
| 11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes total | |
| 111110xx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes total | |
| 1111110x xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes total | |
| 11111110 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes total | |
| 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes total | |
| The remaining bits in the first byte, plus all subsequent bytes, contain the actual value in big-endian order. | |
| */ | |
| typedef uint64_t uu64 [[gnu::aligned(1)]]; | |
| // Normalize to big-endian, i.e. byte swap on little-endian. | |
| static inline uint64_t norm_be64(uint64_t val) { | |
| #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
| return __builtin_bswap64(val); | |
| #else | |
| return val; | |
| #endif | |
| } | |
| // Read variable-length unsigned 64-bit integer. | |
| [[gnu::noinline]] uint64_t get_be_prefix_varint(unsigned char **buf) { | |
| // Fast path for n == 1 | |
| uint8_t b0 = (*buf)[0]; | |
| if ((b0 & 0x80) == 0) { | |
| *buf += 1; | |
| return b0; | |
| } | |
| if (b0 == 0xff) { | |
| // 9 bytes: 11111111 xxxxxxxx ... | |
| *buf += 9; | |
| return norm_be64(*(uu64 *)(*buf - 8)); | |
| } | |
| unsigned n = stdc_leading_ones(b0) + 1; | |
| uint64_t x = 0; | |
| memcpy(&x, *buf, n); | |
| *buf += n; | |
| #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
| x = norm_be64(x << (64 - 8 * n)); | |
| #else | |
| x >>= 64 - 8 * n; | |
| #endif | |
| return x & (((uint64_t)1 << (7 * n)) - 1); | |
| } | |
| // The variant is suitable when reading to 6 bytes after the last byte is ok. | |
| [[gnu::noinline]] uint64_t get_be_prefix_varint_unsafe(unsigned char **buf) { | |
| // Fast path for n == 1 | |
| uint8_t b0 = (*buf)[0]; | |
| if ((b0 & 0x80) == 0) { | |
| *buf += 1; | |
| return b0; | |
| } | |
| if (b0 == 0xff) { | |
| // 9 bytes: 11111111 xxxxxxxx ... | |
| *buf += 9; | |
| return norm_be64(*(uu64 *)(*buf - 8)); | |
| } | |
| uint64_t x = norm_be64(*(uu64 *)*buf); | |
| unsigned n = stdc_leading_ones(b0) + 1; | |
| *buf += n; | |
| return x << n >> (64 - 7 * n); | |
| } | |
| // Write variable-length unsigned 64-bit integer. | |
| [[gnu::noinline]] unsigned put_be_prefix_varint(unsigned char *buf, uint64_t x) { | |
| // Fast path for n == 1 | |
| if (x < 128) { | |
| buf[0] = (uint8_t)x; | |
| return 1; | |
| } | |
| unsigned sig_bits = 64 - stdc_leading_zeros(x); | |
| unsigned n = (sig_bits + 6) / 7; | |
| if (n > 8) { | |
| // 9 bytes: 11111111 xxxxxxxx ... | |
| buf[0] = 0xff; | |
| *(uu64 *)(buf + 1) = norm_be64(x); | |
| return 9; | |
| } | |
| uint64_t tagged = norm_be64(((uint64_t)0xfe << (64 - n)) | (x << (64 - 8 * n))); | |
| memcpy(buf, &tagged, n); | |
| return n; | |
| } | |
| // Read variable-length signed 64-bit integer using sign extension. | |
| [[gnu::noinline]] int64_t get_be_prefix_varint_signed(unsigned char **buf) { | |
| // Fast path for n == 1 | |
| uint8_t b0 = (*buf)[0]; | |
| if ((b0 & 0x80) == 0) { | |
| *buf += 1; | |
| return (int64_t)((uint64_t)b0 << 57) >> 57; | |
| } | |
| if (b0 == 0xff) { | |
| // 9 bytes: 11111111 xxxxxxxx ... | |
| *buf += 9; | |
| return (int64_t)norm_be64(*(uu64 *)(*buf - 8)); | |
| } | |
| unsigned n = stdc_leading_ones(b0) + 1; | |
| uint64_t x = 0; | |
| memcpy(&x, *buf, n); | |
| *buf += n; | |
| #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
| x = norm_be64(x << (64 - 8 * n)); | |
| #else | |
| x >>= 64 - 8 * n; | |
| #endif | |
| return (int64_t)(x << (64 - 7 * n)) >> (64 - 7 * n); | |
| } | |
| // Write variable-length signed 64-bit integer using sign extension. | |
| [[gnu::noinline]] unsigned put_be_prefix_varint_signed(unsigned char *buf, int64_t x) { | |
| // Fast path for n == 1 | |
| if (x >= -64 && x < 64) { | |
| buf[0] = (uint8_t)((uint64_t)x & 0x7f); | |
| return 1; | |
| } | |
| // Determine the number of significant bits needed for the signed value | |
| uint64_t ux = (uint64_t)x; | |
| unsigned sig_bits = 64 - stdc_leading_zeros(x < 0 ? ~ux : ux) + 1; | |
| unsigned n = (sig_bits + 6) / 7; | |
| if (n > 8) { | |
| // 9 bytes: 11111111 xxxxxxxx ... | |
| buf[0] = 0xff; | |
| *(uu64 *)(buf + 1) = norm_be64(ux); | |
| return 9; | |
| } | |
| ux &= ((uint64_t)1 << (7 * n)) - 1; | |
| uint64_t tagged = norm_be64(((uint64_t)0xfe << (64 - n)) | (ux << (64 - 8 * n))); | |
| memcpy(buf, &tagged, n); | |
| return n; | |
| } | |
| static void print_hex(unsigned char *buf, unsigned bytes) { | |
| for (unsigned j = 0; j < bytes; j++) | |
| printf("%02x ", buf[j]); | |
| } | |
| #define DEFINE_TEST_FUNCTION(suffix, type, format_spec, put_func, get_func1, get_func2, test_values...) \ | |
| static int test_##suffix() { \ | |
| int ret = 0; \ | |
| unsigned char buf[16]; \ | |
| const type tests[] = { test_values }; \ | |
| \ | |
| printf("Testing " #suffix " 64-bit prefix varint:\n"); \ | |
| for (int i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) { \ | |
| type original = tests[i]; \ | |
| memset(buf, 0, sizeof(buf)); \ | |
| unsigned bytes = put_func(buf, original); \ | |
| \ | |
| unsigned char *ptr1 = buf; \ | |
| type decoded1 = get_func1(&ptr1); \ | |
| unsigned char *ptr2 = buf; \ | |
| type decoded2 = get_func2(&ptr2); \ | |
| \ | |
| printf("%16" format_spec " | Bytes: %u | Hex: ", original, bytes); \ | |
| print_hex(buf, bytes); \ | |
| \ | |
| int safe_ok = (original == decoded1); \ | |
| int unsafe_ok = (original == decoded2); \ | |
| ret |= !safe_ok || !unsafe_ok; \ | |
| \ | |
| if (decoded1 == decoded2) { \ | |
| printf("| Decoded: %16" format_spec " | %s\n", decoded1, (safe_ok && unsafe_ok) ? "✓" : "✗"); \ | |
| } else { \ | |
| printf("| Safe: %16" format_spec " | Unsafe: %16" format_spec " | %s %s\n", decoded1, decoded2, safe_ok ? "✓" : "✗", unsafe_ok ? "✓" : "✗"); \ | |
| } \ | |
| } \ | |
| return ret; \ | |
| } | |
| DEFINE_TEST_FUNCTION(unsigned, uint64_t, PRIx64, put_be_prefix_varint, get_be_prefix_varint, get_be_prefix_varint_unsafe, | |
| 0, 1, 127, 128, 301, 1000, 16383, 16384, | |
| 0x7fffffffffffffull, 0xffffffffffffffull, | |
| 0x7fffffffffffffffull, 0x8000000000000000ull, 0xffffffffffffffffull) | |
| DEFINE_TEST_FUNCTION(signed, int64_t, PRId64, put_be_prefix_varint_signed, get_be_prefix_varint_signed, get_be_prefix_varint_signed, | |
| 0, 1, -1, 63, 64, 127, -128, 128, -129, 255, -256, | |
| 8191, 8192, -16384, 16384, -16385, 0x7fffffffffffffffLL, | |
| -0x8000000000000000LL, -1, -2, -127, -32768, 32767) | |
| static int test_comprehensive() { | |
| unsigned char buf[16]; | |
| int errors = 0; | |
| for (uint64_t i = 0; i < 20000; i++) { | |
| memset(buf, 0, sizeof(buf)); | |
| unsigned bytes = put_be_prefix_varint(buf, i); | |
| unsigned char *ptr1 = buf; | |
| uint64_t decoded1 = get_be_prefix_varint(&ptr1); | |
| unsigned char *ptr2 = buf; | |
| uint64_t decoded2 = get_be_prefix_varint_unsafe(&ptr2); | |
| if (i != decoded1 || i != decoded2) | |
| errors++; | |
| } | |
| if (errors) | |
| printf("Comprehensive test failed: %d errors found\n", errors); | |
| return errors > 0; | |
| } | |
| int main() { | |
| return test_unsigned() | test_signed() | test_comprehensive(); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <assert.h> | |
| #include <inttypes.h> | |
| #include <stdbit.h> | |
| #include <stdio.h> | |
| #include <string.h> | |
| /* | |
| Little-endian prefix varint encoding for 64-bit integers. | |
| The first byte determines the total length using trailing bits. | |
| Trailing zeros indicate continuation bytes that follow. | |
| Format: | |
| xxxxxxx1: 7 value bits, 1 byte total | |
| xxxxxx10 xxxxxxxx: 14 value bits, 2 bytes total | |
| xxxxx100 xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes total | |
| xxxx1000 xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes total | |
| xxx10000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes total | |
| xx100000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes total | |
| x1000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes total | |
| 10000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes total | |
| 00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes total | |
| The remaining bits in the first byte, plus all subsequent bytes, contain the actual value in little-endian order. | |
| Example: 147 (decimal) = 10010011 (binary), with 8 significant bits | |
| Use 2-byte encoding: xxxxxx10 xxxxxxxx | |
| Byte 0: ((147 & 63) << 2) | 0x02 = 0x4e | |
| Byte 1: (147 >> 6) = 0x02 | |
| Encoded as: 0x4e 0x02 | |
| */ | |
| typedef uint64_t uu64 [[gnu::aligned(1)]]; | |
| // Normalize to little-endian, i.e. byte swap on big-endian. | |
| static inline uint64_t norm_le64(uint64_t val) { | |
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |
| return __builtin_bswap64(val); | |
| #else | |
| return val; | |
| #endif | |
| } | |
| // Read variable-length unsigned 64-bit integer. | |
| [[gnu::noinline]] uint64_t get_le_prefix_varint(unsigned char **buf) { | |
| // Fast path for n == 1 | |
| uint8_t b0 = (*buf)[0]; | |
| if (b0 & 1) { | |
| *buf += 1; | |
| return b0 >> 1; | |
| } | |
| if (b0 == 0x00) { | |
| // 9 bytes: 00000000 xxxxxxxx ... | |
| *buf += 9; | |
| return norm_le64(*(uu64 *)(*buf - 8)); | |
| } | |
| unsigned n = stdc_trailing_zeros(b0) + 1; | |
| uint64_t x = 0; | |
| memcpy(&x, *buf, n); | |
| *buf += n; | |
| x = (norm_le64(x) >> n) & (((uint64_t)1 << (7 * n)) - 1); | |
| return x; | |
| } | |
| // Read variable-length unsigned 64-bit integer. | |
| // The implementation is suitable when reading up to 6 bytes after the last byte is ok. | |
| [[gnu::noinline]] uint64_t get_le_prefix_varint_unsafe(unsigned char **buf) { | |
| // Fast path for n == 1 | |
| uint8_t b0 = (*buf)[0]; | |
| if (b0 & 1) { | |
| *buf += 1; | |
| return b0 >> 1; | |
| } | |
| if (b0 == 0x00) { | |
| // 9 bytes: 00000000 xxxxxxxx ... | |
| *buf += 9; | |
| return norm_le64(*(uu64 *)(*buf - 8)); | |
| } | |
| uint64_t x = norm_le64(*(uu64 *)*buf); | |
| unsigned n = stdc_trailing_zeros(b0) + 1; | |
| *buf += n; | |
| return x << (64 - 8 * n) >> (64 - 7 * n); | |
| } | |
| // Write variable-length unsigned 64-bit integer. | |
| [[gnu::noinline]] unsigned put_le_prefix_varint(unsigned char *buf, uint64_t x) { | |
| // Fast path for n == 1 | |
| if (x < 128) { | |
| buf[0] = (x << 1) | 1; | |
| return 1; | |
| } | |
| unsigned sig_bits = 64 - stdc_leading_zeros(x); | |
| unsigned n = (sig_bits + 6) / 7; | |
| if (n > 8) { | |
| // 9 bytes: 00000000 xxxxxxxx ... | |
| buf[0] = 0x00; | |
| *(uu64 *)(buf + 1) = norm_le64(x); | |
| return 9; | |
| } | |
| uint64_t tagged = norm_le64((x << n) | ((uint64_t)1 << (n - 1))); | |
| memcpy(buf, &tagged, n); | |
| return n; | |
| } | |
| // Read variable-length signed 64-bit integer using sign extension. | |
| // The implementation is suitable when reading up to 6 bytes after the last byte is ok. | |
| [[gnu::noinline]] int64_t get_le_prefix_varint_signed(unsigned char **buf) { | |
| // Fast path for n == 1 | |
| uint8_t b0 = (*buf)[0]; | |
| if (b0 & 1) { | |
| *buf += 1; | |
| return (int64_t)((uint64_t)(b0 >> 1) << 57) >> 57; | |
| } | |
| if ((*buf)[0] == 0x00) { | |
| // 9 bytes: 00000000 xxxxxxxx ... | |
| *buf += 9; | |
| return (int64_t)norm_le64(*(uu64 *)(*buf - 8)); | |
| } | |
| uint64_t x = norm_le64(*(uu64 *)*buf); | |
| unsigned n = stdc_trailing_zeros(b0) + 1; | |
| *buf += n; | |
| return (int64_t)(x << (64 - 8 * n)) >> (64 - 7 * n); | |
| } | |
| // Write variable-length signed 64-bit integer using sign extension. | |
| [[gnu::noinline]] unsigned put_le_prefix_varint_signed(unsigned char *buf, int64_t x) { | |
| // Fast path for n == 1 | |
| if (x >= -64 && x < 64) { | |
| buf[0] = ((uint64_t)x << 1) | 1; | |
| return 1; | |
| } | |
| // Determine the number of significant bits needed | |
| uint64_t ux = (uint64_t)x; | |
| unsigned sig_bits = 64 - stdc_leading_zeros(x < 0 ? ~ux : ux) + 1; | |
| unsigned n = (sig_bits + 6) / 7; | |
| if (n > 8) { | |
| // 9 bytes: 00000000 xxxxxxxx ... | |
| buf[0] = 0x00; | |
| *(uu64 *)(buf + 1) = norm_le64(ux); | |
| return 9; | |
| } | |
| ux &= ((uint64_t)1 << (7 * n)) - 1; | |
| uint64_t tagged = norm_le64((ux << n) | ((uint64_t)1 << (n - 1))); | |
| memcpy(buf, &tagged, n); | |
| return n; | |
| } | |
| static void print_hex(unsigned char *buf, unsigned bytes) { | |
| for (unsigned j = 0; j < bytes; j++) | |
| printf("%02x ", buf[j]); | |
| } | |
| #define DEFINE_TEST_FUNCTION(suffix, type, format_spec, put_func, get_func1, get_func2, test_values...) \ | |
| static int test_##suffix() { \ | |
| int ret = 0; \ | |
| unsigned char buf[16]; \ | |
| const type tests[] = { test_values }; \ | |
| \ | |
| printf("Testing " #suffix " 64-bit suffix varint:\n"); \ | |
| for (int i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) { \ | |
| type original = tests[i]; \ | |
| memset(buf, 0, sizeof(buf)); \ | |
| unsigned bytes = put_func(buf, original); \ | |
| \ | |
| unsigned char *ptr1 = buf; \ | |
| type decoded1 = get_func1(&ptr1); \ | |
| unsigned char *ptr2 = buf; \ | |
| type decoded2 = get_func2(&ptr2); \ | |
| \ | |
| printf("%16" format_spec " | Bytes: %u | Hex: ", original, bytes); \ | |
| print_hex(buf, bytes); \ | |
| \ | |
| int safe_ok = (original == decoded1); \ | |
| int unsafe_ok = (original == decoded2); \ | |
| ret |= !safe_ok || !unsafe_ok; \ | |
| \ | |
| if (decoded1 == decoded2) { \ | |
| printf("| Decoded: %16" format_spec " | %s\n", decoded1, (safe_ok && unsafe_ok) ? "✓" : "✗"); \ | |
| } else { \ | |
| printf("| Safe: %16" format_spec " | Unsafe: %16" format_spec " | %s %s\n", decoded1, decoded2, safe_ok ? "✓" : "✗", unsafe_ok ? "✓" : "✗"); \ | |
| } \ | |
| } \ | |
| return ret; \ | |
| } | |
| DEFINE_TEST_FUNCTION(unsigned, uint64_t, PRIx64, put_le_prefix_varint, get_le_prefix_varint, get_le_prefix_varint_unsafe, | |
| 0, 1, 127, 128, 147, 255, 1000, 16383, 16384, | |
| 0x7fffffffffffffull, 0xffffffffffffffull, | |
| 0x7fffffffffffffffull, 0x8000000000000000ull, 0xffffffffffffffffull) | |
| DEFINE_TEST_FUNCTION(signed, int64_t, PRId64, put_le_prefix_varint_signed, get_le_prefix_varint_signed, get_le_prefix_varint_signed, | |
| 0, 1, -1, 63, 64, 127, -128, 128, -129, 255, -256, | |
| 8191, 8192, -16384, 16384, -16385, 0x7fffffffffffffffLL, | |
| -0x8000000000000000LL, -1, -2, -127, -32768, 32767) | |
| static int test_comprehensive() { | |
| unsigned char buf[16]; | |
| int errors = 0; | |
| for (uint64_t i = 0; i < 20000; i++) { | |
| memset(buf, 0, sizeof(buf)); | |
| unsigned bytes = put_le_prefix_varint(buf, i); | |
| unsigned char *ptr1 = buf; | |
| uint64_t decoded1 = get_le_prefix_varint(&ptr1); | |
| unsigned char *ptr2 = buf; | |
| uint64_t decoded2 = get_le_prefix_varint_unsafe(&ptr2); | |
| if (i != decoded1 || i != decoded2) | |
| errors++; | |
| } | |
| if (errors) | |
| printf("Comprehensive test failed: %d errors found\n", errors); | |
| return errors > 0; | |
| } | |
| int main() { | |
| return test_unsigned() | test_signed() | test_comprehensive(); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/zsh -e | |
| clang -fsanitize=undefined -g be_prefix_varint.c -o be_prefix_varint && ./be_prefix_varint | |
| clang -fsanitize=undefined -g le_prefix_varint.c -o le_prefix_varint && ./le_prefix_varint | |
| s390x-linux-gnu-gcc-14 -static be_prefix_varint.c && ./a.out | |
| s390x-linux-gnu-gcc-14 -static le_prefix_varint.c && ./a.out |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment