// Vectorized 3D Morton encoding for no reason
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
typedef uint32_t uint32;
typedef __m128i Vec128_U32;
// "Insert" two 0 bits after each of the 11 low bits of x
static uint32 Part1By2(uint32 x)
{
    x &= 0x000007ff;                  // x = ---- ---- ---- ---- ---- -a98 7654 3210
    x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- -a98 ---- ---- ---- ---- 7654 3210
    x = (x ^ (x << 8)) & 0x0700f00f;  // x = ---- -a98 ---- ---- 7654 ---- ---- 3210
    x = (x ^ (x << 4)) & 0x430c30c3;  // x = -a-- --98 ---- 76-- --54 ---- 32-- --10
    x = (x ^ (x << 2)) & 0x49249249;  // x = -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
    return x;
}
static uint32 MortonEncode3D(uint32 X, uint32 Y, uint32 Z)
{
    return Part1By2(X) | (Part1By2(Y) << 1) | (Part1By2(Z) << 2);
}
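// A few quick sanity checks for the scalar reference (an added sketch, not
// part of the original gist; the expected values just follow from the
// interleave order: bit 0 of the code is x0, bit 1 is y0, bit 2 is z0,
// bit 3 is x1, and so on):
static void SanityCheckScalar()
{
    assert(MortonEncode3D(1, 0, 0) == 1);    // x0 -> bit 0
    assert(MortonEncode3D(0, 1, 0) == 2);    // y0 -> bit 1
    assert(MortonEncode3D(0, 0, 1) == 4);    // z0 -> bit 2
    assert(MortonEncode3D(3, 0, 0) == 9);    // x0 -> bit 0, x1 -> bit 3
    assert(MortonEncode3D(7, 7, 3) == 0xff); // x0..x2, y0..y2, z0..z1 fill the low byte
}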
// The original idea was to do 3D Morton order encoding using VBMI and GFNI operations.
//
// Unlike in the example code above, here and in the following I'll write the
// bits from left to right, starting at bit 0. This is just a bit permutation
// on a vector, and I'm done pretending these are numbers, given how
// notationally inconvenient that turns out to be in the following (since we
// switch between working on bits and bytes within a little-endian word).
//
// Anyway, we start with two 11-bit vectors for x and y and one 10-bit vector for z,
// which I'll group into nibbles for clarity: (the | denotes byte boundaries)
//
// x0x1x2x3 x4x5x6x7|x8x9xa
// y0y1y2y3 y4y5y6y7|y8y9ya
// z0z1z2z3 z4z5z6z7|z8z9
//
// The result we want is this:
//
// x0y0z0x1 y1z1x2y2|z2x3y3z3 x4y4z4x5|y5z5x6y6 z6x7y7z7|x8y8z8x9 y9z9xaya
//
// and the underlying observation is that we have the same bit permutation
// in every byte, taking 3 bits, 3 bits, then 2 bits' worth from the first,
// second and third coordinate in that byte's rotation order, respectively.
//
// The first byte interleaves coordinates in the order x, y, z and starts at
// bit 0 in all components.
// The second byte goes z, x, y, starting at bit 2 in z and 3 in x and y.
// Third byte: y,z,x starting at 5,5,6 respectively.
// Fourth byte: x,y,z all starting at 8.
//
// This is a useful observation because it means that we can figure out how to
// do the bit permutation within a single byte (e.g. byte 0) and then use that
// same permutation for every byte of the output. Now all we need to do is to
// implement that permutation, and figure out how to set up its input!
//
// Implementing the permutation is easy with GFNI: GF2P8AFFINEQB is a full 8x8
// bit matrix multiply, which certainly includes all 8x8 permutation matrices,
// so we can trivially reorder the bits in a byte arbitrarily.
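//
// As an added sketch (not in the original gist) of how such a matrix constant
// can be built: if output bit i is to come from input bit Perm[i], then row i
// of the matrix is the byte (1 << Perm[i]), and GF2P8AFFINEQB expects row i in
// byte (7 - i) of each 64-bit lane of its matrix operand:
static Vec128_U32 MakeBytePermuteMatrix(const uint8_t Perm[8])
{
    uint64_t Matrix = 0;
    for (int i = 0; i < 8; i++)
        Matrix |= (uint64_t)(1u << Perm[i]) << (8 * (7 - i));
    return _mm_set1_epi64x((int64_t)Matrix);
}
// With Perm = {0,3,6,1,4,7,2,5} (the per-byte output order worked out below),
// this reproduces the PermuteMatrix constant in MortonEncode3D_Fancy.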
//
// Now, working backwards from that, we can apply the inverse permutation to
// the desired output to see what we need to set up:
// x0y0z0x1 y1z1x2y2|z2x3y3z3 x4y4z4x5|y5z5x6y6 z6x7y7z7|x8y8z8x9 y9z9xaya
//
// bytewise deinterleaves into:
// x0x1x2y0 y1y2z0z1|z2z3z4x3 x4x5y3y4|y5y6y7z5 z6z7x6x7|x8x9xay8 y9yaz8z9
//
// which we can then mask to just the X, Y and Z parts to make things more clear:
// x0x1x2-- --------|------x3 x4x5----|-------- ----x6x7|x8x9xa-- --------
// ------y0 y1y2----|-------- ----y3y4|y5y6y7-- --------|------y8 y9ya----
// -------- ----z0z1|z2z3z4-- --------|------z5 z6z7----|-------- ----z8z9
//
// as we can now see, each of these 3 individually consists of short contiguous
// runs of bits from its source coordinate, one run per byte, which would not
// be much of a problem to do directly using conventional bit shifts. But since
// every byte contains at most one such run (by construction!), we can grab the
// relevant bits for each byte via VPMULTISHIFTQB straight from a vector of
// source X/Y/Z coordinates (respectively).
// Then we need to mask to just the bits we care about and combine the three bit vectors
// to get our four bytes per 32b lane that are the input into our bit matrix op.
// The combining here can be done with two auxiliary masks and two VPTERNLOGD instructions.
// The end result is that (provided we have the right constants loaded) we can do a 3-way
// vector bit interleave (3D Morton encode) in 6 instructions using GFNI+VBMI. I illustrated
// this for 32-bit outputs, but 16/64-bit outputs work easily as well. (For anything larger,
// crossing 64-bit boundaries in VPMULTISHIFTQB would need some extra work.)
//
// This is implemented in MortonEncode3D_Fancy below.
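//
// As an added scalar model (not in the original gist) of what VPMULTISHIFTQB
// computes: every control byte selects a bit offset, and the corresponding
// result byte is the 8 consecutive bits of the matching 64-bit data lane
// starting at that offset (taken mod 64, wrapping within the lane):
static uint64_t MultishiftQB_Model(uint64_t Control, uint64_t Data)
{
    uint64_t Result = 0;
    for (int i = 0; i < 8; i++)
    {
        uint32 Offset = (uint32)(Control >> (8 * i)) & 63; // control byte i, mod 64
        uint64_t Rotated = (Data >> Offset) | (Data << ((64 - Offset) & 63));
        Result |= (Rotated & 0xff) << (8 * i);
    }
    return Result;
}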
//
// But as it usually goes when playing with fancy AVX512 operations, I soon realized that a
// similar approach is viable in older ISA variants as well. In this case, we can do a passable
// job with anything that has at least SSSE3 (PSHUFB is the key).
//
// In particular, we used GF2P8AFFINEQB for our bit reordering above, and that's
// nice when it's available, but it's absolutely not required.
//
// We can use PSHUFB to get a lookup into a table of 16 8b entries (standard technique) per
// byte. In our case, for the bit permutation, we do one table lookup to see what the low nibble
// should map to, one table lookup to see what the high nibble should map to, and then OR the
// two together.
//
// That's more expensive than a single instruction, but we only do this step once, at the very end,
// so it's not the end of the world.
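//
// For reference, the two 16-entry LUTs can be generated from the same bit
// permutation Perm used for the GFNI matrix (an added sketch, not in the
// original gist): permute each possible low nibble and each possible high
// nibble of a byte in scalar code.
static void MakeNibbleLUTs(const uint8_t Perm[8], uint8_t LoLUT[16], uint8_t HiLUT[16])
{
    for (uint32 n = 0; n < 16; n++)
    {
        uint8_t Lo = 0, Hi = 0;
        for (int i = 0; i < 8; i++)
        {
            Lo |= (((n     ) >> Perm[i]) & 1) << i; // input byte = low nibble n
            Hi |= (((n << 4) >> Perm[i]) & 1) << i; // input byte = high nibble n << 4
        }
        LoLUT[n] = Lo;
        HiLUT[n] = Hi;
    }
}
// With Perm = {0,3,6,1,4,7,2,5}, this produces exactly the LoLUT/HiLUT
// constants used in MortonEncode3D_Vec below.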
//
// That leaves the multishift action. That, too, simplifies considerably. Repeating the deinterleaved
// x values from above:
// x0x1x2-- --------|------x3 x4x5----|-------- ----x6x7|x8x9xa-- --------
//
// note that the x3 in byte 1 sits in exactly the same bit position within its
// byte as the actual x3 bit in our input number does. In fact, this is true
// for all of these values. We don't need
// arbitrary per-byte shift amounts at all. All we really need to do here is repeat the first
// (low-order, since we're little-endian) byte of the X coordinate 3 times, then the second byte
// of the X coordinate, and finally use an AND mask. This is just PSHUFB + AND.
//
// The Y and Z coordinates are slightly more complicated, but really only slightly:
// ------y0 y1y2----|-------- ----y3y4|y5y6y7-- --------|------y8 y9ya----
// -------- ----z0z1|z2z3z4-- --------|------z5 z6z7----|-------- ----z8z9
//
// Note that again the bit positions within bytes all line up. We just need different masks
// and it turns out to be advantageous to use the sequence PSHUFB -> PSLLD (with 3 or 6) -> PAND
// for Y and Z since it lets us reuse the same shuffle control vector for all three coordinates.
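//
// Putting the whole SSSE3 path together in scalar code for one 32-bit lane
// (an added model for exposition, not in the original gist; the vector
// version below does the same thing for four lanes at a time):
static uint32 MortonEncode3D_Model(uint32 X, uint32 Y, uint32 Z)
{
    // PSHUFB with ShufS: per lane, result bytes 0..3 = source bytes 0,0,0,1
    uint32 SX = (X & 0xff) * 0x00010101u + ((X & 0xff00) << 16);
    uint32 SY = (Y & 0xff) * 0x00010101u + ((Y & 0xff00) << 16);
    uint32 SZ = (Z & 0xff) * 0x00010101u + ((Z & 0xff00) << 16);
    // Shift Y/Z into place, mask each coordinate to its bits, merge
    uint32 Merged = (SX & 0x07c03807) | ((SY << 3) & 0x3807c038) | ((SZ << 6) & 0xc03807c0);
    // Per-byte bit permutation (the double nibble-LUT PSHUFB step below)
    static const uint8_t Perm[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
    uint32 Result = 0;
    for (int i = 0; i < 8; i++)
        Result |= ((Merged >> Perm[i]) & 0x01010101u) << i;
    return Result;
}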
// Input: vectors of 32-bit X/Y/Z coords
// Output: 32-bit Morton code
// (SSSE3 version)
static Vec128_U32 MortonEncode3D_Vec(Vec128_U32 X, Vec128_U32 Y, Vec128_U32 Z)
{
    // In a loop, all these constants only need to get loaded once:
    Vec128_U32 ShufS = _mm_setr_epi8(0,0,0,1, 4,4,4,5, 8,8,8,9, 12,12,12,13); // per lane: source bytes 0,0,0,1
    Vec128_U32 MaskX = _mm_set1_epi32(0x07c03807); // deinterleaved x bit positions
    Vec128_U32 MaskY = _mm_set1_epi32(0x3807c038); // deinterleaved y bit positions (after <<3)
    Vec128_U32 MaskZ = _mm_set1_epi32(0xc03807c0); // deinterleaved z bit positions (after <<6)
    Vec128_U32 MaskNib = _mm_set1_epi8(0xf);
    Vec128_U32 LoLUT = _mm_setr_epi8(0x00, 0x01, 0x08, 0x09, 0x40, 0x41, 0x48, 0x49, 0x02, 0x03, 0x0a, 0x0b, 0x42, 0x43, 0x4a, 0x4b);
    Vec128_U32 HiLUT = _mm_setr_epi8(0x00, 0x10, 0x80, 0x90, 0x04, 0x14, 0x84, 0x94, 0x20, 0x30, 0xa0, 0xb0, 0x24, 0x34, 0xa4, 0xb4);
    // Actual work: replicate source bytes, shift Y/Z into place, mask, merge
    Vec128_U32 ShiftedX = _mm_and_si128(_mm_shuffle_epi8(X, ShufS), MaskX);
    Vec128_U32 ShiftedY = _mm_and_si128(_mm_slli_epi32(_mm_shuffle_epi8(Y, ShufS), 3), MaskY);
    Vec128_U32 ShiftedZ = _mm_and_si128(_mm_slli_epi32(_mm_shuffle_epi8(Z, ShufS), 6), MaskZ);
    Vec128_U32 Merged = _mm_or_si128(_mm_or_si128(ShiftedX, ShiftedY), ShiftedZ);
    // Per-byte bit permutation via two nibble lookups, ORed together
    Vec128_U32 LoResult = _mm_shuffle_epi8(LoLUT, _mm_and_si128(Merged, MaskNib));
    Vec128_U32 HiResult = _mm_shuffle_epi8(HiLUT, _mm_and_si128(_mm_srli_epi16(Merged, 4), MaskNib));
    return _mm_or_si128(LoResult, HiResult);
}
// Input: vectors of 32-bit X/Y/Z coords
// Output: 32-bit Morton code
// (needs GFNI plus AVX-512 VBMI and VL)
static Vec128_U32 MortonEncode3D_Fancy(Vec128_U32 X, Vec128_U32 Y, Vec128_U32 Z)
{
    // In a loop, all these constants only need to get loaded once:
    Vec128_U32 ShufS = _mm_setr_epi8(0,0,0,1, 4,4,4,5, 8,8,8,9, 12,12,12,13);
    Vec128_U32 ShiftsY = _mm_setr_epi8(-3,-3,5,5, 29,29,37,37, -3,-3,5,5, 29,29,37,37); // multishift bit offsets, mod 64
    Vec128_U32 ShiftsZ = _mm_setr_epi8(-6,2,2,2, 26,34,34,34, -6,2,2,2, 26,34,34,34);
    Vec128_U32 MaskY = _mm_set1_epi32(0x3807c038);
    Vec128_U32 MaskZ = _mm_set1_epi32(0xc03807c0);
    Vec128_U32 PermuteMatrix = _mm_set_epi8(0x01, 0x08, 0x40, 0x02, 0x10, 0x80, 0x04, 0x20, 0x01, 0x08, 0x40, 0x02, 0x10, 0x80, 0x04, 0x20);
    // Set up 3 pre-shifted inputs
    Vec128_U32 ShiftedX = _mm_shuffle_epi8(X, ShufS);
    Vec128_U32 ShiftedY = _mm_multishift_epi64_epi8(ShiftsY, Y);
    Vec128_U32 ShiftedZ = _mm_multishift_epi64_epi8(ShiftsZ, Z);
    // Merge: the immediate works out to 0xD8, a bitwise select
    // (where mask C is set, take B, else take A)
    constexpr uint8_t TERNLOG_A = 0xF0;
    constexpr uint8_t TERNLOG_B = 0xCC;
    constexpr uint8_t TERNLOG_C = 0xAA;
    Vec128_U32 MergedXY = _mm_ternarylogic_epi32(ShiftedX, ShiftedY, MaskY, (TERNLOG_A & ~TERNLOG_C) | (TERNLOG_B & TERNLOG_C));
    Vec128_U32 Merged = _mm_ternarylogic_epi32(MergedXY, ShiftedZ, MaskZ, (TERNLOG_A & ~TERNLOG_C) | (TERNLOG_B & TERNLOG_C));
    // Bit permute within each byte via 8x8 bit matrix multiply
    return _mm_gf2p8affine_epi64_epi8(Merged, PermuteMatrix, 0x00);
}
static void BatchEncode_Ref(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    for (size_t i = 0; i < Count; i++)
    {
        Dest[i] = MortonEncode3D(X[i], Y[i], Z[i]);
    }
}
static inline Vec128_U32 Load128_U32(const uint32* Values)
{
    return _mm_loadu_si128((const __m128i *) Values);
}
static void Store128_U32(uint32* Dest, Vec128_U32 Vec)
{
    _mm_storeu_si128((__m128i *)Dest, Vec);
}
static void BatchEncode_Vec(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    assert(Count % 4 == 0);
    for (size_t i = 0; i < Count; i += 4)
    {
        Store128_U32(Dest + i, MortonEncode3D_Vec(Load128_U32(X + i), Load128_U32(Y + i), Load128_U32(Z + i)));
    }
}
static void BatchEncode_Fancy(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    assert(Count % 4 == 0);
    for (size_t i = 0; i < Count; i += 4)
    {
        Store128_U32(Dest + i, MortonEncode3D_Fancy(Load128_U32(X + i), Load128_U32(Y + i), Load128_U32(Z + i)));
    }
}
class Rng
{
    static const uint64_t MCG_MUL = 6364136223846793005ull;
    uint64_t state;

public:
    static Rng seed(uint32_t seed)
    {
        // State may not be 0 (MCG); that's why we flip the low bits.
        // Also do one multiply step in case the input is a small integer (which it often is).
        Rng r;
        r.state = ~(uint64_t)seed | (uint64_t(seed) << 32);
        r.state *= MCG_MUL;
        return r;
    }

    // Random 32-bit uint
    uint32_t random()
    {
        uint64_t oldstate = state;
        uint32_t rot_input = (uint32_t) (((oldstate >> 18) ^ oldstate) >> 27);
        uint32_t rot_amount = (uint32_t) (oldstate >> 59);
        uint32_t output = (rot_input >> rot_amount) | (rot_input << ((0u - rot_amount) & 31)); // rotr(rot_input, rot_amount)
        // Advance multiplicative congruential generator.
        // Constant from PCG reference impl.
        state = oldstate * MCG_MUL;
        return output;
    }
};
int main()
{
    Rng random = Rng::seed(12345);
    static const size_t Count = 4096;
    uint32 X[Count], Y[Count], Z[Count];
    uint32 Ref[Count], Tst[Count];
    for (size_t i = 0; i < Count; i++)
    {
        X[i] = random.random();
        Y[i] = random.random();
        Z[i] = random.random();
    }
    BatchEncode_Ref(Ref, X, Y, Z, Count);
    printf("Testing Vec:\n");
    BatchEncode_Vec(Tst, X, Y, Z, Count);
    for (size_t i = 0; i < Count; i++)
    {
        if (Ref[i] != Tst[i])
        {
            printf("Mismatch! i=%zu X=0x%08x Y=0x%08x Z=0x%08x Ref=0x%08x Tst=0x%08x\n", i, X[i], Y[i], Z[i], Ref[i], Tst[i]);
            return 1;
        }
    }
    printf("Testing Fancy:\n");
    BatchEncode_Fancy(Tst, X, Y, Z, Count);
    for (size_t i = 0; i < Count; i++)
    {
        if (Ref[i] != Tst[i])
        {
            printf("Mismatch! i=%zu X=0x%08x Y=0x%08x Z=0x%08x Ref=0x%08x Tst=0x%08x\n", i, X[i], Y[i], Z[i], Ref[i], Tst[i]);
            return 1;
        }
    }
    printf("all OK!\n");
    return 0;
}