Skip to content

Instantly share code, notes, and snippets.

@MaskRay
Last active November 17, 2025 04:30
Show Gist options
  • Select an option

  • Save MaskRay/80b705d903688870bac96da64e7e243b to your computer and use it in GitHub Desktop.

Select an option

Save MaskRay/80b705d903688870bac96da64e7e243b to your computer and use it in GitHub Desktop.
variants of Little Endian Base 128 where the length information is in the first byte
#include <assert.h>
#include <inttypes.h>
#include <stdbit.h>
#include <stdio.h>
#include <string.h>
/*
Big-endian prefix varint encoding for 64-bit integers.
The first byte determines the total length using most significant bits.
Leading ones indicate continuation bytes that follow.
Format:
0xxxxxxx: 7 value bits, 1 byte total
10xxxxxx xxxxxxxx: 14 value bits, 2 bytes total
110xxxxx xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes total
1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes total
11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes total
111110xx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes total
1111110x xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes total
11111110 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes total
11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes total
The remaining bits in the first byte, plus all subsequent bytes, contain the actual value in big-endian order.
*/
typedef uint64_t uu64 [[gnu::aligned(1)]];
// Normalize to big-endian, i.e. byte swap on little-endian.
static inline uint64_t norm_be64(uint64_t val) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return __builtin_bswap64(val);
#else
return val;
#endif
}
// Read variable-length unsigned 64-bit integer.
[[gnu::noinline]] uint64_t get_be_prefix_varint(unsigned char **buf) {
// Fast path for n == 1
uint8_t b0 = (*buf)[0];
if ((b0 & 0x80) == 0) {
*buf += 1;
return b0;
}
if (b0 == 0xff) {
// 9 bytes: 11111111 xxxxxxxx ...
*buf += 9;
return norm_be64(*(uu64 *)(*buf - 8));
}
unsigned n = stdc_leading_ones(b0) + 1;
uint64_t x = 0;
memcpy(&x, *buf, n);
*buf += n;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
x = norm_be64(x << (64 - 8 * n));
#else
x >>= 64 - 8 * n;
#endif
return x & (((uint64_t)1 << (7 * n)) - 1);
}
// The variant is suitable when reading to 6 bytes after the last byte is ok.
[[gnu::noinline]] uint64_t get_be_prefix_varint_unsafe(unsigned char **buf) {
// Fast path for n == 1
uint8_t b0 = (*buf)[0];
if ((b0 & 0x80) == 0) {
*buf += 1;
return b0;
}
if (b0 == 0xff) {
// 9 bytes: 11111111 xxxxxxxx ...
*buf += 9;
return norm_be64(*(uu64 *)(*buf - 8));
}
uint64_t x = norm_be64(*(uu64 *)*buf);
unsigned n = stdc_leading_ones(b0) + 1;
*buf += n;
return x << n >> (64 - 7 * n);
}
// Write variable-length unsigned 64-bit integer.
[[gnu::noinline]] unsigned put_be_prefix_varint(unsigned char *buf, uint64_t x) {
// Fast path for n == 1
if (x < 128) {
buf[0] = (uint8_t)x;
return 1;
}
unsigned sig_bits = 64 - stdc_leading_zeros(x);
unsigned n = (sig_bits + 6) / 7;
if (n > 8) {
// 9 bytes: 11111111 xxxxxxxx ...
buf[0] = 0xff;
*(uu64 *)(buf + 1) = norm_be64(x);
return 9;
}
uint64_t tagged = norm_be64(((uint64_t)0xfe << (64 - n)) | (x << (64 - 8 * n)));
memcpy(buf, &tagged, n);
return n;
}
// Read variable-length signed 64-bit integer using sign extension.
[[gnu::noinline]] int64_t get_be_prefix_varint_signed(unsigned char **buf) {
// Fast path for n == 1
uint8_t b0 = (*buf)[0];
if ((b0 & 0x80) == 0) {
*buf += 1;
return (int64_t)((uint64_t)b0 << 57) >> 57;
}
if (b0 == 0xff) {
// 9 bytes: 11111111 xxxxxxxx ...
*buf += 9;
return (int64_t)norm_be64(*(uu64 *)(*buf - 8));
}
unsigned n = stdc_leading_ones(b0) + 1;
uint64_t x = 0;
memcpy(&x, *buf, n);
*buf += n;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
x = norm_be64(x << (64 - 8 * n));
#else
x >>= 64 - 8 * n;
#endif
return (int64_t)(x << (64 - 7 * n)) >> (64 - 7 * n);
}
// Write variable-length signed 64-bit integer using sign extension.
[[gnu::noinline]] unsigned put_be_prefix_varint_signed(unsigned char *buf, int64_t x) {
// Fast path for n == 1
if (x >= -64 && x < 64) {
buf[0] = (uint8_t)((uint64_t)x & 0x7f);
return 1;
}
// Determine the number of significant bits needed for the signed value
uint64_t ux = (uint64_t)x;
unsigned sig_bits = 64 - stdc_leading_zeros(x < 0 ? ~ux : ux) + 1;
unsigned n = (sig_bits + 6) / 7;
if (n > 8) {
// 9 bytes: 11111111 xxxxxxxx ...
buf[0] = 0xff;
*(uu64 *)(buf + 1) = norm_be64(ux);
return 9;
}
ux &= ((uint64_t)1 << (7 * n)) - 1;
uint64_t tagged = norm_be64(((uint64_t)0xfe << (64 - n)) | (ux << (64 - 8 * n)));
memcpy(buf, &tagged, n);
return n;
}
static void print_hex(unsigned char *buf, unsigned bytes) {
for (unsigned j = 0; j < bytes; j++)
printf("%02x ", buf[j]);
}
#define DEFINE_TEST_FUNCTION(suffix, type, format_spec, put_func, get_func1, get_func2, test_values...) \
static int test_##suffix() { \
int ret = 0; \
unsigned char buf[16]; \
const type tests[] = { test_values }; \
\
printf("Testing " #suffix " 64-bit prefix varint:\n"); \
for (int i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) { \
type original = tests[i]; \
memset(buf, 0, sizeof(buf)); \
unsigned bytes = put_func(buf, original); \
\
unsigned char *ptr1 = buf; \
type decoded1 = get_func1(&ptr1); \
unsigned char *ptr2 = buf; \
type decoded2 = get_func2(&ptr2); \
\
printf("%16" format_spec " | Bytes: %u | Hex: ", original, bytes); \
print_hex(buf, bytes); \
\
int safe_ok = (original == decoded1); \
int unsafe_ok = (original == decoded2); \
ret |= !safe_ok || !unsafe_ok; \
\
if (decoded1 == decoded2) { \
printf("| Decoded: %16" format_spec " | %s\n", decoded1, (safe_ok && unsafe_ok) ? "✓" : "✗"); \
} else { \
printf("| Safe: %16" format_spec " | Unsafe: %16" format_spec " | %s %s\n", decoded1, decoded2, safe_ok ? "✓" : "✗", unsafe_ok ? "✓" : "✗"); \
} \
} \
return ret; \
}
DEFINE_TEST_FUNCTION(unsigned, uint64_t, PRIx64, put_be_prefix_varint, get_be_prefix_varint, get_be_prefix_varint_unsafe,
0, 1, 127, 128, 301, 1000, 16383, 16384,
0x7fffffffffffffull, 0xffffffffffffffull,
0x7fffffffffffffffull, 0x8000000000000000ull, 0xffffffffffffffffull)
DEFINE_TEST_FUNCTION(signed, int64_t, PRId64, put_be_prefix_varint_signed, get_be_prefix_varint_signed, get_be_prefix_varint_signed,
0, 1, -1, 63, 64, 127, -128, 128, -129, 255, -256,
8191, 8192, -16384, 16384, -16385, 0x7fffffffffffffffLL,
-0x8000000000000000LL, -1, -2, -127, -32768, 32767)
static int test_comprehensive() {
unsigned char buf[16];
int errors = 0;
for (uint64_t i = 0; i < 20000; i++) {
memset(buf, 0, sizeof(buf));
unsigned bytes = put_be_prefix_varint(buf, i);
unsigned char *ptr1 = buf;
uint64_t decoded1 = get_be_prefix_varint(&ptr1);
unsigned char *ptr2 = buf;
uint64_t decoded2 = get_be_prefix_varint_unsafe(&ptr2);
if (i != decoded1 || i != decoded2)
errors++;
}
if (errors)
printf("Comprehensive test failed: %d errors found\n", errors);
return errors > 0;
}
int main() {
return test_unsigned() | test_signed() | test_comprehensive();
}
#include <assert.h>
#include <inttypes.h>
#include <stdbit.h>
#include <stdio.h>
#include <string.h>
/*
Little-endian prefix varint encoding for 64-bit integers.
The first byte determines the total length using trailing bits.
Trailing zeros indicate continuation bytes that follow.
Format:
xxxxxxx1: 7 value bits, 1 byte total
xxxxxx10 xxxxxxxx: 14 value bits, 2 bytes total
xxxxx100 xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes total
xxxx1000 xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes total
xxx10000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes total
xx100000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes total
x1000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes total
10000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes total
00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes total
The remaining bits in the first byte, plus all subsequent bytes, contain the actual value in little-endian order.
Example: 147 (decimal) = 10010011 (binary), with 8 significant bits
Use 2-byte encoding: xxxxxx10 xxxxxxxx
Byte 0: ((147 & 63) << 2) | 0x02 = 0x4e
Byte 1: (147 >> 6) = 0x02
Encoded as: 0x4e 0x02
*/
typedef uint64_t uu64 [[gnu::aligned(1)]];
// Normalize to little-endian, i.e. byte swap on big-endian.
static inline uint64_t norm_le64(uint64_t val) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return __builtin_bswap64(val);
#else
return val;
#endif
}
// Read variable-length unsigned 64-bit integer.
[[gnu::noinline]] uint64_t get_le_prefix_varint(unsigned char **buf) {
// Fast path for n == 1
uint8_t b0 = (*buf)[0];
if (b0 & 1) {
*buf += 1;
return b0 >> 1;
}
if (b0 == 0x00) {
// 9 bytes: 00000000 xxxxxxxx ...
*buf += 9;
return norm_le64(*(uu64 *)(*buf - 8));
}
unsigned n = stdc_trailing_zeros(b0) + 1;
uint64_t x = 0;
memcpy(&x, *buf, n);
*buf += n;
x = (norm_le64(x) >> n) & (((uint64_t)1 << (7 * n)) - 1);
return x;
}
// Read variable-length unsigned 64-bit integer.
// The implementation is suitable when reading up to 6 bytes after the last byte is ok.
[[gnu::noinline]] uint64_t get_le_prefix_varint_unsafe(unsigned char **buf) {
// Fast path for n == 1
uint8_t b0 = (*buf)[0];
if (b0 & 1) {
*buf += 1;
return b0 >> 1;
}
if (b0 == 0x00) {
// 9 bytes: 00000000 xxxxxxxx ...
*buf += 9;
return norm_le64(*(uu64 *)(*buf - 8));
}
uint64_t x = norm_le64(*(uu64 *)*buf);
unsigned n = stdc_trailing_zeros(b0) + 1;
*buf += n;
return x << (64 - 8 * n) >> (64 - 7 * n);
}
// Write variable-length unsigned 64-bit integer.
[[gnu::noinline]] unsigned put_le_prefix_varint(unsigned char *buf, uint64_t x) {
// Fast path for n == 1
if (x < 128) {
buf[0] = (x << 1) | 1;
return 1;
}
unsigned sig_bits = 64 - stdc_leading_zeros(x);
unsigned n = (sig_bits + 6) / 7;
if (n > 8) {
// 9 bytes: 00000000 xxxxxxxx ...
buf[0] = 0x00;
*(uu64 *)(buf + 1) = norm_le64(x);
return 9;
}
uint64_t tagged = norm_le64((x << n) | ((uint64_t)1 << (n - 1)));
memcpy(buf, &tagged, n);
return n;
}
// Read variable-length signed 64-bit integer using sign extension.
// The implementation is suitable when reading up to 6 bytes after the last byte is ok.
[[gnu::noinline]] int64_t get_le_prefix_varint_signed(unsigned char **buf) {
// Fast path for n == 1
uint8_t b0 = (*buf)[0];
if (b0 & 1) {
*buf += 1;
return (int64_t)((uint64_t)(b0 >> 1) << 57) >> 57;
}
if ((*buf)[0] == 0x00) {
// 9 bytes: 00000000 xxxxxxxx ...
*buf += 9;
return (int64_t)norm_le64(*(uu64 *)(*buf - 8));
}
uint64_t x = norm_le64(*(uu64 *)*buf);
unsigned n = stdc_trailing_zeros(b0) + 1;
*buf += n;
return (int64_t)(x << (64 - 8 * n)) >> (64 - 7 * n);
}
// Write variable-length signed 64-bit integer using sign extension.
[[gnu::noinline]] unsigned put_le_prefix_varint_signed(unsigned char *buf, int64_t x) {
// Fast path for n == 1
if (x >= -64 && x < 64) {
buf[0] = ((uint64_t)x << 1) | 1;
return 1;
}
// Determine the number of significant bits needed
uint64_t ux = (uint64_t)x;
unsigned sig_bits = 64 - stdc_leading_zeros(x < 0 ? ~ux : ux) + 1;
unsigned n = (sig_bits + 6) / 7;
if (n > 8) {
// 9 bytes: 00000000 xxxxxxxx ...
buf[0] = 0x00;
*(uu64 *)(buf + 1) = norm_le64(ux);
return 9;
}
ux &= ((uint64_t)1 << (7 * n)) - 1;
uint64_t tagged = norm_le64((ux << n) | ((uint64_t)1 << (n - 1)));
memcpy(buf, &tagged, n);
return n;
}
static void print_hex(unsigned char *buf, unsigned bytes) {
for (unsigned j = 0; j < bytes; j++)
printf("%02x ", buf[j]);
}
#define DEFINE_TEST_FUNCTION(suffix, type, format_spec, put_func, get_func1, get_func2, test_values...) \
static int test_##suffix() { \
int ret = 0; \
unsigned char buf[16]; \
const type tests[] = { test_values }; \
\
printf("Testing " #suffix " 64-bit suffix varint:\n"); \
for (int i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) { \
type original = tests[i]; \
memset(buf, 0, sizeof(buf)); \
unsigned bytes = put_func(buf, original); \
\
unsigned char *ptr1 = buf; \
type decoded1 = get_func1(&ptr1); \
unsigned char *ptr2 = buf; \
type decoded2 = get_func2(&ptr2); \
\
printf("%16" format_spec " | Bytes: %u | Hex: ", original, bytes); \
print_hex(buf, bytes); \
\
int safe_ok = (original == decoded1); \
int unsafe_ok = (original == decoded2); \
ret |= !safe_ok || !unsafe_ok; \
\
if (decoded1 == decoded2) { \
printf("| Decoded: %16" format_spec " | %s\n", decoded1, (safe_ok && unsafe_ok) ? "✓" : "✗"); \
} else { \
printf("| Safe: %16" format_spec " | Unsafe: %16" format_spec " | %s %s\n", decoded1, decoded2, safe_ok ? "✓" : "✗", unsafe_ok ? "✓" : "✗"); \
} \
} \
return ret; \
}
DEFINE_TEST_FUNCTION(unsigned, uint64_t, PRIx64, put_le_prefix_varint, get_le_prefix_varint, get_le_prefix_varint_unsafe,
0, 1, 127, 128, 147, 255, 1000, 16383, 16384,
0x7fffffffffffffull, 0xffffffffffffffull,
0x7fffffffffffffffull, 0x8000000000000000ull, 0xffffffffffffffffull)
DEFINE_TEST_FUNCTION(signed, int64_t, PRId64, put_le_prefix_varint_signed, get_le_prefix_varint_signed, get_le_prefix_varint_signed,
0, 1, -1, 63, 64, 127, -128, 128, -129, 255, -256,
8191, 8192, -16384, 16384, -16385, 0x7fffffffffffffffLL,
-0x8000000000000000LL, -1, -2, -127, -32768, 32767)
static int test_comprehensive() {
unsigned char buf[16];
int errors = 0;
for (uint64_t i = 0; i < 20000; i++) {
memset(buf, 0, sizeof(buf));
unsigned bytes = put_le_prefix_varint(buf, i);
unsigned char *ptr1 = buf;
uint64_t decoded1 = get_le_prefix_varint(&ptr1);
unsigned char *ptr2 = buf;
uint64_t decoded2 = get_le_prefix_varint_unsafe(&ptr2);
if (i != decoded1 || i != decoded2)
errors++;
}
if (errors)
printf("Comprehensive test failed: %d errors found\n", errors);
return errors > 0;
}
int main() {
return test_unsigned() | test_signed() | test_comprehensive();
}
#!/bin/zsh -e
clang -fsanitize=undefined -g be_prefix_varint.c -o be_prefix_varint && ./be_prefix_varint
clang -fsanitize=undefined -g le_prefix_varint.c -o le_prefix_varint && ./le_prefix_varint
s390x-linux-gnu-gcc-14 -static be_prefix_varint.c && ./a.out
s390x-linux-gnu-gcc-14 -static le_prefix_varint.c && ./a.out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment