Skip to content

Instantly share code, notes, and snippets.

@jerch
Created January 28, 2026 15:40
Show Gist options
  • Select an option

  • Save jerch/e2f7695b887228d9e703c53af23c0737 to your computer and use it in GitHub Desktop.

Select an option

Save jerch/e2f7695b887228d9e703c53af23c0737 to your computer and use it in GitHub Desktop.
import { InWasm, OutputMode, OutputType } from 'inwasm';
// Benchmark fixture. All sizes derive from the pixel count:
// packed RGB input needs 3 bytes per pixel, RGBA output needs 4.
const PIXELS = 262144 * 16;
const BYTES = new Uint8Array(PIXELS * 3);
const DATA = new Uint8Array(PIXELS * 4);

// Give every source pixel the same colour (R=50, G=100, B=200), so a correct
// conversion produces one single repeating RGBA value in DATA.
for (let pixel = 0, at = 0; pixel < PIXELS; pixel++, at += 3) {
  BYTES[at] = 50;
  BYTES[at + 1] = 100;
  BYTES[at + 2] = 200;
}
/**
 * Baseline variant (reported as 'original' in the benchmark output).
 *
 * Copies PIXELS packed RGB pixels from BYTES into DATA as RGBA, one byte at a
 * time, using named stride constants and an `isRgba` flag for the alpha byte.
 * The statement shape is deliberately left as-is: it is the code under test.
 *
 * @returns elapsed wall-clock time of the copy loop in milliseconds (Date.now() resolution)
 */
function m1(): number {
const start = Date.now();
// Source stride: 3 bytes per RGB pixel; destination stride: 4 bytes per RGBA pixel.
const bytesPerPixel = 3;
const BYTES_PER_PIXEL_RGBA = 4;
// Fixed to false here, so the alpha branch below always writes 255 (opaque).
const isRgba = false;
let dstOffset = 0;
let srcOffset = 0;
for (let i = 0; i < PIXELS; i++) {
DATA[dstOffset ] = BYTES[srcOffset ]; // R
DATA[dstOffset + 1] = BYTES[srcOffset + 1]; // G
DATA[dstOffset + 2] = BYTES[srcOffset + 2]; // B
DATA[dstOffset + 3] = isRgba ? BYTES[srcOffset + 3] : 255;
srcOffset += bytesPerPixel;
dstOffset += BYTES_PER_PIXEL_RGBA;
}
const end = Date.now();
return end - start;
}
/**
 * 'Slightly optimized' variant: the same byte-wise RGB -> RGBA copy as the
 * baseline, but with literal strides (3 / 4) and a constant alpha of 255
 * instead of stride constants and a per-pixel flag check.
 *
 * @returns elapsed wall-clock time of the copy loop in milliseconds (Date.now() resolution)
 */
function m2() {
const start = Date.now();
let dstOffset = 0;
let srcOffset = 0;
for (let i = 0; i < PIXELS; i++) {
DATA[dstOffset ] = BYTES[srcOffset ]; // R
DATA[dstOffset + 1] = BYTES[srcOffset + 1]; // G
DATA[dstOffset + 2] = BYTES[srcOffset + 2]; // B
DATA[dstOffset + 3] = 255; // A: always opaque
srcOffset += 3;
dstOffset += 4;
}
const end = Date.now();
return end - start;
}
/**
 * '4 byte blocks' variant: RGB -> RGBA expansion on 32-bit words.
 *
 * Each iteration reads three source words (12 bytes = 4 packed RGB pixels)
 * and writes four RGBA words. The bit layout relies on little-endian typed
 * array byte order: the first byte of a pixel sits in the low bits and the
 * alpha byte (0xFF) in the top bits of each output word.
 *
 * PIXELS is consumed in groups of four; the loop does not handle a trailing
 * partial group separately.
 *
 * @returns elapsed wall-clock time of the loop in milliseconds (Date.now() resolution)
 */
function m3(): number {
  const start = Date.now();
  const bytes32 = new Uint32Array(BYTES.buffer);
  const data32 = new Uint32Array(DATA.buffer);
  let dstOffset = 0;
  let srcOffset = 0;
  for (let i = 0; i < PIXELS; i += 4) {
    const bloc1 = bytes32[srcOffset++];
    const bloc2 = bytes32[srcOffset++];
    const bloc3 = bytes32[srcOffset++];
    // Right shifts must be unsigned (>>>). A sign-propagating >> first
    // converts the word to int32; whenever its top byte is >= 0x80 the
    // replicated sign bits would flood the neighbouring colour channels
    // with 0xFF (e.g. G/B of pixel 1, B of pixel 2).
    data32[dstOffset++] = bloc1 | 0xFF000000;                            // pixel 0: bytes 0..2
    data32[dstOffset++] = (bloc1 >>> 24) | (bloc2 << 8) | 0xFF000000;    // pixel 1: bytes 3..5
    data32[dstOffset++] = (bloc2 >>> 16) | (bloc3 << 16) | 0xFF000000;   // pixel 2: bytes 6..8
    data32[dstOffset++] = (bloc3 >>> 8) | 0xFF000000;                    // pixel 3: bytes 9..11
  }
  const end = Date.now();
  return end - start;
}
/**
 * Validates the conversion result and resets the output buffer.
 *
 * Every 32-bit word of DATA must equal 0xFFC86432 (A=0xFF, B=200, G=100,
 * R=50 when read as a little-endian RGBA word). Throws on the first mismatch;
 * otherwise DATA is zeroed so the next variant cannot pass on stale output.
 */
function check() {
  const words = new Uint32Array(DATA.buffer);
  const firstBad = words.findIndex(word => word !== 0xFFC86432);
  if (firstBad !== -1) {
    throw Error(`wrong value ${words[firstBad]} at index ${firstBad}`);
  }
  words.fill(0);
}
// WASM SIMD variant of the RGB -> RGBA expansion, declared as inline C source
// through InWasm (synchronous mode, instance output type).
//
// Memory: a fixed-size WebAssembly.Memory of 500 pages (initial == maximum,
// 64 KiB per page), exported together with the `interleave` function. The
// `(pixels: number) => 0` entry presumably only serves as the typed signature
// of the export -- confirm against the inwasm documentation.
//
// The C function works on hard-coded linear-memory addresses:
//   src = 1024      packed RGB input
//   dst = 12583936  RGBA output (= 1024 + 786432*16, i.e. right behind an
//                   input of 786432*16 bytes)
// Per iteration it loads 16 bytes, of which the first 12 (4 RGB pixels) are
// used: the swizzle spreads them into four 4-byte slots (lane index 16 is out
// of range and selects 0), the OR sets the alpha byte of each slot to 0xFF,
// and 16 bytes are stored. src advances by 12, dst by 16; `pixels` is consumed
// in steps of 4.
// NOTE(review): every load reads 4 bytes beyond the 12 it needs, so the last
// iteration touches the first bytes of the dst region -- harmless with this
// layout, but keep it in mind when changing the offsets.
// NOTE(review): the linker switches set a zero stack size; presumably fine
// because the function keeps everything in locals -- confirm.
const wasmInterleave = InWasm({
name: 'decode',
type: OutputType.INSTANCE,
mode: OutputMode.SYNC,
srctype: 'Clang-C',
exports: {
memory: new WebAssembly.Memory({initial: 500, maximum: 500}),
interleave: (pixels: number) => 0,
},
compile: {
switches: ['-msimd128', '-Wl,-z,stack-size=0', '-Wl,--stack-first']
},
code: `
#include <wasm_simd128.h>
int interleave(int pixels) {
// hardcoded mem offsets for simplicity
char *src = (char*) 1024;
char *dst = (char*) 12583936;
v128_t opaque = wasm_u32x4_splat(0xFF000000);
v128_t picker = wasm_i8x16_const(0, 1, 2, 16, 3, 4, 5, 16, 6, 7, 8, 16, 9, 10, 11, 16);
for (int i = 0; i < pixels; i += 4) {
v128_t data = wasm_v128_load((v128_t *) src);
// swizzle + or ist faster
data = wasm_i8x16_swizzle(data, picker);
data = wasm_v128_or(data, opaque);
// shuffle runs slower
//data = wasm_i8x16_shuffle(data, opaque, 0, 1, 2, 19, 3, 4, 5, 23, 6, 7, 8, 27, 9, 10, 11, 31);
wasm_v128_store((v128_t *) dst, data);
src += 12;
dst += 16;
}
return 0;
}
`
})
// Create the wasm instance and wrap its exported linear memory in a byte view.
const wasmInstance = wasmInterleave();
const wasmMemory = new Uint8Array(wasmInstance.exports.memory.buffer);
// Copy the packed RGB input to offset 1024, the source address hard-coded in the C code.
wasmMemory.set(BYTES, 1024);
/**
 * 'wasm simd' variant: runs the exported `interleave` function over PIXELS
 * pixels and takes a zero-copy view of the RGBA result inside wasm memory.
 *
 * Only the wasm call and the creation of the view are timed. Copying the
 * result into DATA happens after the clock is stopped and exists solely so
 * the shared validation can inspect the output.
 *
 * @returns elapsed wall-clock time in milliseconds (Date.now() resolution)
 */
function m4() {
  const t0 = Date.now();
  wasmInstance.exports.interleave(PIXELS);
  // RGBA output region: starts at the dst address hard-coded in the C code.
  const rgbaView = wasmMemory.subarray(12583936, 12583936 + 1048576 * 16);
  const elapsed = Date.now() - t0;
  DATA.set(rgbaView);
  return elapsed;
}
// Run each variant once (its timing is discarded) and validate the output
// it produced before any measurements are taken.
for (const variant of [m1, m2, m3, m4]) {
  variant();
  check();
}
/**
 * Runs `func` repeatedly and logs its average runtime and throughput.
 *
 * `func` reports its own runtime in milliseconds. With three or more samples
 * the single slowest and single fastest sample are discarded before
 * averaging; with fewer samples all of them are averaged, since trimming
 * would leave nothing to divide by.
 *
 * The throughput figure assumes a fixed payload of 786432*16 input bytes per
 * invocation and is logged in MB/s (bytes per millisecond divided by 1000).
 *
 * @param func benchmark body; returns the elapsed time of one run in ms
 * @param msg label printed in front of the result
 * @param runs number of invocations (positive integer), defaults to 22
 * @throws RangeError if `runs` is not a positive integer
 */
function measure(func: () => number, msg: string, runs: number = 22): void {
  if (!Number.isInteger(runs) || runs < 1) {
    throw new RangeError(`runs must be a positive integer, got ${runs}`);
  }
  const payloadBytes = 786432 * 16;
  // Start from the identity elements of max/min instead of magic sentinels,
  // so arbitrarily slow (or fast) samples are tracked correctly.
  let slowest = Number.NEGATIVE_INFINITY;
  let fastest = Number.POSITIVE_INFINITY;
  let time = 0;
  for (let i = 0; i < runs; ++i) {
    const runtime = func();
    time += runtime;
    slowest = Math.max(slowest, runtime);
    fastest = Math.min(fastest, runtime);
  }
  let samples = runs;
  if (runs > 2) {
    // Drop one outlier at each end.
    time -= slowest;
    time -= fastest;
    samples -= 2;
  }
  const duration = time / samples;
  console.log(msg, duration, 'ms,', Math.round(payloadBytes / duration / 1000), 'MB/s');
}
// Benchmark every variant with the default number of runs, in declaration order.
const benchmarks: Array<[() => number, string]> = [
  [m1, 'original'],
  [m2, 'slightly optimized'],
  [m3, '4 byte blocks'],
  [m4, 'wasm simd'],
];
for (const [variant, label] of benchmarks) {
  measure(variant, label);
}
@jerch
Copy link
Author

jerch commented Jan 28, 2026

Tested with: node v24
Build with: tsc -p . && inwasm 'lib/*.wasm.js'

Results:

$> node lib/test.wasm.js
original 50.4 ms, 250 MB/s
slightly optimized 23.9 ms, 526 MB/s
4 byte blocks 8.1 ms, 1553 MB/s
wasm simd 2.65 ms, 4748 MB/s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment