Skip to content

Instantly share code, notes, and snippets.

@jerch
Created February 1, 2026 20:51
Show Gist options
  • Select an option

  • Save jerch/2f9bc0391a11ecf0c1f5b3e51be860a7 to your computer and use it in GitHub Desktop.

Select an option

Save jerch/2f9bc0391a11ecf0c1f5b3e51be860a7 to your computer and use it in GitHub Desktop.
/**
* Copyright (c) 2023 The xterm.js authors. All rights reserved.
* @license MIT
*/
import { InWasm, OutputMode, OutputType } from 'inwasm-runtime';
// memory addresses in uint32
const enum P32 {
// 1. byte table
D0 = 256,
// 2. byte table
D1 = 512,
// 3. byte table
D2 = 768,
// 4. byte table
D3 = 1024,
// address of state struct
STATE = 1280,
// state.wp: end of base64 data
STATE_WP = 1280,
// state.sp: read position of base64 data
STATE_SP = 1281,
// state.dp: write position of decoded data
STATE_DP = 1282,
// state.e_size: max byte expected
STATE_ESIZE = 1283,
// state.data: data[0]
STATE_DATA = 1288 // 16 aligned
}
/**
* wasm base64 decoder.
*/
const wasmDecode = InWasm({
name: 'decode',
type: OutputType.INSTANCE,
mode: OutputMode.SYNC,
srctype: 'Clang-C',
imports: {
env: { memory: new WebAssembly.Memory({ initial: 1 }) }
},
exports: {
dec: () => 0,
end: () => 0
},
compile: {
switches: ['-Wl,-z,stack-size=0', '-Wl,--stack-first']
},
code: `
typedef struct {
unsigned int wp;
unsigned int sp;
unsigned int dp;
unsigned int e_size;
unsigned int dummy[4];
unsigned char data[0];
} State;
unsigned int *D0 = (unsigned int *) ${P32.D0*4};
unsigned int *D1 = (unsigned int *) ${P32.D1*4};
unsigned int *D2 = (unsigned int *) ${P32.D2*4};
unsigned int *D3 = (unsigned int *) ${P32.D3*4};
State *state = (State *) ${P32.STATE*4};
__attribute__((noinline)) int dec() {
unsigned int nsp = (state->wp - 1) & ~3;
unsigned char *src = state->data + state->sp;
unsigned char *end = state->data + nsp;
unsigned char *dst = state->data + state->dp;
unsigned int error = 0;
// 4x loop unrolling (~30% speedup)
// FIXME: investigate why zeroing the last bits of nsp does not always work here
// a temp fix is the line below, which always subtracts 16 from the end pointer
// this has a tiny penalty of always doing the last portion in 4-bytes, -16 is safe,
// as it is further guarded by the big chunk clause in Base64Decoder.put
unsigned char *end16 = state->data + (nsp & ~15);
//unsigned char *end16 = state->data + nsp - 16;
while (src < end16) {
error |= *((unsigned int *) dst ) = D0[src[ 0]] | D1[src[ 1]] | D2[src[ 2]] | D3[src[ 3]];
error |= *((unsigned int *) (dst+3)) = D0[src[ 4]] | D1[src[ 5]] | D2[src[ 6]] | D3[src[ 7]];
error |= *((unsigned int *) (dst+6)) = D0[src[ 8]] | D1[src[ 9]] | D2[src[10]] | D3[src[11]];
error |= *((unsigned int *) (dst+9)) = D0[src[12]] | D1[src[13]] | D2[src[14]] | D3[src[15]];
dst += 12;
src += 16;
}
// operate on 4-byte blocks
while (src < end) {
error |= *((unsigned int *) dst) = D0[src[0]] | D1[src[1]] | D2[src[2]] | D3[src[3]];
dst += 3;
src += 4;
}
// eval error state lazy (opportunistic, favors good over bad data)
if (error >> 24) return 1;
state->sp = nsp;
state->dp = dst - state->data;
return 0;
}
int end() {
int rem = state->wp - state->sp;
if (rem > 4 && dec()) return 1;
rem = state->wp - state->sp;
if (rem < 2) return 1;
unsigned char *src = state->data + state->sp;
if (rem == 4) {
if (src[3] == 61) rem--;
if (src[2] == 61) rem--;
}
unsigned int accu = D0[src[0]] | D1[src[1]];
int dp = 1;
if (rem > 2) {
accu |= D2[src[2]];
dp++;
if (rem == 4) {
accu |= D3[src[3]];
dp++;
}
}
if (accu >> 24) return 1;
*((unsigned int *) (state->data + state->dp)) = accu;
state->dp += dp;
return 0;
}
`
});
// SIMD version (speedup ~1.4x, not covered by tests yet)
/*
const wasmDecode = InWasm({
name: 'decode',
type: OutputType.INSTANCE,
mode: OutputMode.SYNC,
srctype: 'Clang-C',
imports: {
env: { memory: new WebAssembly.Memory({ initial: 1 }) }
},
exports: {
dec: () => 0,
end: () => 0
},
compile: {
switches: ['-msimd128', '-Wl,-z,stack-size=0', '-Wl,--stack-first']
},
code: `
#include <wasm_simd128.h>
typedef struct {
unsigned int wp;
unsigned int sp;
unsigned int dp;
unsigned int e_size;
unsigned int dummy[4];
unsigned char data[0];
} State;
unsigned int *D0 = (unsigned int *) ${P32.D0 * 4};
unsigned int *D1 = (unsigned int *) ${P32.D1 * 4};
unsigned int *D2 = (unsigned int *) ${P32.D2 * 4};
unsigned int *D3 = (unsigned int *) ${P32.D3 * 4};
State *state = (State *) ${P32.STATE * 4};
#define packed_byte(x) wasm_i8x16_splat((char) x)
#define packed_dword(x) wasm_i32x4_splat(x)
#define masked(x, mask) wasm_v128_and(x, wasm_i32x4_splat(mask))
__attribute__((noinline)) int dec() {
unsigned int nsp = (state->wp - 1) & ~3;
unsigned char *src = state->data + state->sp;
unsigned char *end = state->data + nsp;
unsigned char *dst = state->data + state->dp;
unsigned int error = 0;
v128_t err = wasm_i8x16_splat(0);
unsigned char *end16 = state->data + (nsp & ~15);
while (src < end16) {
v128_t data = wasm_v128_load((v128_t *) src);
// wasm-simd rewrite of http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html#vector-lookup-pshufb
const v128_t higher_nibble = wasm_u32x4_shr(data, 4) & packed_byte(0x0f);
const char linv = 1;
const char hinv = 0;
const v128_t lower_bound_LUT = wasm_i8x16_const(
// order: 0 1 2 3 4 5 6 7 8 9 a b c d e f
linv, linv, 0x2b, 0x30,
0x41, 0x50, 0x61, 0x70,
linv, linv, linv, linv,
linv, linv, linv, linv
);
const v128_t upper_bound_LUT = wasm_i8x16_const(
// order: 0 1 2 3 4 5 6 7 8 9 a b c d e f
hinv, hinv, 0x2b, 0x39,
0x4f, 0x5a, 0x6f, 0x7a,
hinv, hinv, hinv, hinv,
hinv, hinv, hinv, hinv
);
// the difference between the shift and lower bound
const v128_t shift_LUT = wasm_i8x16_const(
// order: 0 1 2 3 4 5 6 7 8 9 a b c d e f
0x00, 0x00, 0x3e - 0x2b, 0x34 - 0x30,
0x00 - 0x41, 0x0f - 0x50, 0x1a - 0x61, 0x29 - 0x70,
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00
);
const v128_t upper_bound = wasm_i8x16_swizzle(upper_bound_LUT, higher_nibble);
const v128_t lower_bound = wasm_i8x16_swizzle(lower_bound_LUT, higher_nibble);
const v128_t below = wasm_i8x16_lt(data, lower_bound);
const v128_t above = wasm_i8x16_gt(data, upper_bound);
const v128_t eq_2f = wasm_i8x16_eq(data, packed_byte(0x2f));
// in_range = not (below or above) or eq_2f
// outside = not in_range = below or above and not eq_2f (from deMorgan law)
const v128_t outside = wasm_v128_andnot(eq_2f, above | below);
err = wasm_v128_or(err, outside);
const v128_t shift = wasm_i8x16_swizzle(shift_LUT, higher_nibble);
const v128_t t0 = wasm_i8x16_add(data, shift);
v128_t v = wasm_i8x16_add(t0, wasm_v128_and(eq_2f, packed_byte(-3)));
// pack bytes
const v128_t ca = masked(v, 0x003f003f);
const v128_t db = masked(v, 0x3f003f00);
const v128_t t00 = wasm_v128_or(wasm_u32x4_shr(db, 8), wasm_i32x4_shl(ca, 6));
v128_t res = wasm_v128_or(wasm_u32x4_shr(t00, 16), wasm_i32x4_shl(t00, 12));
res = wasm_i8x16_swizzle(res, wasm_i8x16_const(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 16, 16, 16, 16));
wasm_v128_store((v128_t *) dst, res);
dst += 12;
src += 16;
}
//if (wasm_i8x16_bitmask(err) != 0) return 1;
if (wasm_v128_any_true(err)) return 1;
// operate on 4-byte blocks
while (src < end) {
error |= *((unsigned int *) dst) = D0[src[0]] | D1[src[1]] | D2[src[2]] | D3[src[3]];
dst += 3;
src += 4;
}
if (error >> 24) return 1;
state->sp = nsp;
state->dp = dst - state->data;
return 0;
}
int end() {
int rem = state->wp - state->sp;
if (rem > 4 && dec()) return 1;
rem = state->wp - state->sp;
if (rem < 2) return 1;
unsigned char *src = state->data + state->sp;
if (rem == 4) {
if (src[3] == 61) rem--;
if (src[2] == 61) rem--;
}
unsigned int accu = D0[src[0]] | D1[src[1]];
int dp = 1;
if (rem > 2) {
accu |= D2[src[2]];
dp++;
if (rem == 4) {
accu |= D3[src[3]];
dp++;
}
}
if (accu >> 24) return 1;
*((unsigned int *) (state->data + state->dp)) = accu;
state->dp += dp;
return 0;
}
`
});
*/
// base64 map
const MAP = new Uint8Array(
'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
.split('')
.map(el => el.charCodeAt(0))
);
// init decoder maps in LE order
const D = new Uint32Array(1024);
D.fill(0xFF000000);
for (let i = 0; i < MAP.length; ++i) D[MAP[i]] = i << 2;
for (let i = 0; i < MAP.length; ++i) D[256 + MAP[i]] = i >> 4 | ((i << 4) & 0xFF) << 8;
for (let i = 0; i < MAP.length; ++i) D[512 + MAP[i]] = (i >> 2) << 8 | ((i << 6) & 0xFF) << 16;
for (let i = 0; i < MAP.length; ++i) D[768 + MAP[i]] = i << 16;
const EMPTY = new Uint8Array(0);
/**
* base64 streamline inplace decoder.
*
* Features / assumptions:
* - lazy chunkwise decoding
* - errors out on any non base64 chars (no support for NL formatted base64)
* - decodes in wasm
* - inplace decoding to save memory
* - supports a keepSize for lazy memory release
*/
export default class Base64Decoder {
private _d!: Uint8Array;
private _m32!: Uint32Array;
private _inst!: ReturnType<typeof wasmDecode>;
private _mem!: WebAssembly.Memory;
constructor(public keepSize: number) { }
/**
* Currently decoded bytes (borrowed).
* Must be accessed before calling `release` or `init`.
*/
public get data8(): Uint8Array {
return this._inst ? this._d.subarray(0, this._m32[P32.STATE_DP]) : EMPTY;
}
/**
* Release memory conditionally based on `keepSize`.
* If memory gets released, also the wasm instance will be freed and recreated on next `init`,
* otherwise the instance will be reused.
*/
public release(): void {
if (!this._inst) return;
if (this._mem.buffer.byteLength > this.keepSize) {
this._inst = this._m32 = this._d = this._mem = null!;
} else {
this._m32[P32.STATE_WP] = 0;
this._m32[P32.STATE_SP] = 0;
this._m32[P32.STATE_DP] = 0;
}
}
/**
* Initializes the decoder for new base64 data.
* Must be called before doing any decoding attempts.
* `size` is the max amount of decoded bytes to be expected.
* The method will either spawn a new wasm instance or grow
* the needed memory of an existing instance.
*/
public init(size: number): void {
let m = this._m32;
const bytes = (Math.ceil(size / 3) + P32.STATE_DATA) * 4;
if (!this._inst) {
this._mem = new WebAssembly.Memory({ initial: Math.ceil(bytes / 65536) });
this._inst = wasmDecode({ env: { memory: this._mem } });
m = new Uint32Array(this._mem.buffer, 0);
m.set(D, P32.D0);
this._d = new Uint8Array(this._mem.buffer, P32.STATE_DATA * 4);
} else if (this._mem.buffer.byteLength < bytes) {
this._mem.grow(Math.ceil((bytes - this._mem.buffer.byteLength) / 65536));
m = new Uint32Array(this._mem.buffer, 0);
this._d = new Uint8Array(this._mem.buffer, P32.STATE_DATA * 4);
}
m[P32.STATE_ESIZE] = Math.ceil(size / 3) * 4;
m[P32.STATE_WP] = 0;
m[P32.STATE_SP] = 0;
m[P32.STATE_DP] = 0;
this._m32 = m;
}
/**
* Put bytes in `data` into the decoder.
* Additionally decodes the payload, if it reached 2^17 bytes.
* Returns 1 on error, else 0.
*/
public put(data: Uint8Array | Uint16Array | Uint32Array): number {
if (!this._inst) return 1;
const m = this._m32;
if (data.length + m[P32.STATE_WP] > m[P32.STATE_ESIZE]) return 1;
this._d.set(data, m[P32.STATE_WP]);
m[P32.STATE_WP] += data.length;
// max chunk in input handler is 2^17, try to run in "tandem mode"
// also assures that we dont run into illegal offsets in the wasm part
return m[P32.STATE_WP] - m[P32.STATE_SP] >= 131072 ? this._inst.exports.dec() : 0;
}
/**
* End the current decoding.
* Also decodes leftover payload from previous put calls.
* Returns 1 on error, else 0.
*/
public end(): number {
return this._inst ? this._inst.exports.end() : 1;
}
}
const DATA = new Uint8Array(1024*1024);
DATA.fill(66);
const dec = new Base64Decoder(5000000);
dec.init(4194304);
function m() {
const start = performance.now();
for (let i = 0; i < 1000; ++i) {
dec.init(4194304);
dec.put(DATA);
dec.end();
}
const end = performance.now();
const duration = end - start;
console.log(`duration: ${Math.round(duration)} ms, rate: ${Math.round(1024*1024/(end-start))} MB/s`);
}
m();
m();
setTimeout(() => m(), 1000);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment