Created
July 4, 2025 05:49
-
-
Save rygorous/b847ed5c91855c1bb51f0871283b9ac4 to your computer and use it in GitHub Desktop.
Vectorized 3D Morton encoding for no reason
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <assert.h> | |
using uint32 = uint32_t;
using Vec128_U32 = __m128i;

// "Insert" two 0 bits after each of the 11 low bits of x: bit k of the
// masked input ends up at bit 3*k of the result (scalar Morton helper).
static uint32 Part1By2(uint32 x)
{
    uint32 bits = x & 0x000007ffu;               // ---- ---- ---- ---- ---- -a98 7654 3210
    bits = (bits ^ (bits << 16)) & 0xff0000ffu;  // ---- -a98 ---- ---- ---- ---- 7654 3210
    bits = (bits ^ (bits << 8))  & 0x0700f00fu;  // ---- -a98 ---- ---- 7654 ---- ---- 3210
    bits = (bits ^ (bits << 4))  & 0x430c30c3u;  // -a-- --98 ---- 76-- --54 ---- 32-- --10
    bits = (bits ^ (bits << 2))  & 0x49249249u;  // -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
    return bits;
}
// Scalar 3D Morton (Z-order) encode: interleave the low 11/11/10 bits of
// X/Y/Z so that x lands in bit 0, y in bit 1, z in bit 2, and so on.
static uint32 MortonEncode3D(uint32 X, uint32 Y, uint32 Z)
{
    const uint32 XBits = Part1By2(X);
    const uint32 YBits = Part1By2(Y) << 1;
    const uint32 ZBits = Part1By2(Z) << 2;
    return XBits | YBits | ZBits;
}
// The original idea was for 3D Morton order encoding using VBMI and GFNI operations. | |
// | |
// Unlike the example code above, here and in the following I'll write the bits | |
// from left to right because this is just a bit permutation on a vector and I'm | |
// done pretending these are numbers given how notationally inconvenient it turns | |
// out to be in the following (since we switch between working on bits and bytes | |
// within a little-endian word). | |
// | |
// Anyway, we start with two 11-bit vectors for x and y and one 10-bit vector for z, | |
// which I'll group into nibbles for clarity: (the | denotes byte boundaries) | |
// | |
// x0x1x2x3 x4x5x6x7|x8x9xa | |
// y0y1y2y3 y4y5y6y7|y8y9ya | |
// z0z1z2z3 z4z5z6z7|z8z9 | |
// | |
// The result we want is this: | |
// | |
// x0y0z0x1 y1z1x2y2|z2x3y3z3 x4y4z4x5|y5z5x6y6 z6x7y7z7|x8y8z8x9 y9z9xaya | |
// | |
// and the underlying observation is that we have the same bit permutation | |
// in every byte, using 3 bits, 3 bits then 2 bits worth from the first, second | |
// and third coordinate respectively. | |
// | |
// The first byte interleaves coordinates in the order x, y, z and starts at | |
// bit 0 in all components. | |
// The second byte goes z, x, y, starting at bit 2 in z and 3 in x and y.
// Third byte: y,z,x starting at 5,5,6 respectively. | |
// Fourth byte: x,y,z all starting at 8. | |
// | |
// This is a useful observation because it means that we can figure out how to | |
// do the bit permutation within a single byte (e.g. byte 0) and then use that | |
// same permutation for every byte of the output. Now all we need to do is to | |
// implement that permutation, and figure out how to set up into it! | |
// | |
// Implementing the permutation is easy with GFNI: GF2P8AFFINEQB is a full 8x8 | |
// bit matrix multiply, which certainly includes all 8x8 permutation matrices, | |
// so we can trivially reorder the bits in a byte arbitrarily. | |
// | |
// Now, working backwards from that, we can apply the inverse permutation to | |
// the desired output to see what we need to set up: | |
// x0y0z0x1 y1z1x2y2|z2x3y3z3 x4y4z4x5|y5z5x6y6 z6x7y7z7|x8y8z8x9 y9z9xaya | |
// | |
// bytewise deinterleaves into: | |
// x0x1x2y0 y1y2z0z1|z2z3z4x3 x4x5y3y4|y5y6y7z5 z6z7x6x7|x8x9xay8 y9yaz8z9 | |
// | |
// which we can then mask to just the X, Y and Z parts to make things more clear: | |
// x0x1x2-- --------|------x3 x4x5----|-------- ----x6x7|x8x9xa-- -------- | |
// ------y0 y1y2----|-------- ----y3y4|y5y6y7-- --------|------y8 y9ya---- | |
// -------- ----z0z1|z2z3z4-- --------|------z5 z6z7----|-------- ----z8z9 | |
// | |
// as we can now see, each of these 3 individually consists of 3 contiguous runs of
// bits from the X, Y and Z coordinate numbers, which would not be much of a problem | |
// to do directly using conventional bit shifts. But since every byte contains at most | |
// one such run (by construction!), we can grab the relevant bits for each byte via | |
// VPMULTISHIFTQB straight from a vector of source X/Y/Z coordinates (respectively). | |
// Then we need to mask to just the bits we care about and combine the three bit vectors | |
// to get our four bytes per 32b lane that are the input into our bit matrix op. | |
// The combining here can be done using two auxiliary masks using two VPTERNLOGD instructions. | |
// The end result is that (provided we have the right constants loaded) we can do a 3-way | |
// vector bit interleave (3D Morton encode) in 6 instructions using GFNI+VBMI. I illustrated | |
// this for 32-bit outputs, but 16/64-bit outputs work easily as well. (For anything larger, | |
// crossing 64-bit boundaries in VPMULTISHIFTQB would need some extra work.) | |
// | |
// This is implemented in MortonEncode3D_Fancy below. | |
// | |
// But as it usually goes when playing with fancy AVX512 operations, I soon realize that a | |
// similar approach is viable in older ISA variants as well. In this case, we can do a passable | |
// job with anything that has at least SSSE3 (PSHUFB is the key). | |
// | |
// In particular, we use a GF2P8AFFINEQB for our bit reordering, and that's nice when it's | |
// available, but it's absolutely not required. | |
// | |
// We can use PSHUFB to get a lookup into a table of 16 8b entries (standard technique) per | |
// byte. In our case, for the bit permutation, we do one table lookup to see what the low nibble | |
// should map to, one table lookup to see what the high nibble should map to, and then OR the | |
// two together. | |
// | |
// That's more expensive than a single instruction, but we only do this step once, at the very end, | |
// so it's not the end of the world. | |
// | |
// That leaves the multishift action. That, too, simplifies considerably. Repeating the deinterleaved | |
// x values from above: | |
// x0x1x2-- --------|------x3 x4x5----|-------- ----x6x7|x8x9xa-- -------- | |
// | |
// note that the x3 in byte 1 is exactly in the same bit position within a byte as the actual x3 | |
// bit in our input number is. In fact, this is true for all of those values. We don't need | |
// arbitrary per-byte shift amounts at all. All we really need to do here is repeat the first | |
// (low-order, since we're little-endian) byte of the X coordinate 3 times, then the second byte | |
// of the X coordinate, and finally use an AND mask. This is just PSHUFB + AND. | |
// | |
// The Y and Z coordinates are slightly more complicated, but really only slightly: | |
// ------y0 y1y2----|-------- ----y3y4|y5y6y7-- --------|------y8 y9ya---- | |
// -------- ----z0z1|z2z3z4-- --------|------z5 z6z7----|-------- ----z8z9 | |
// | |
// Note that again the bit positions within bytes all line up. We just need different masks | |
// and it turns out to be advantageous to use the sequence PSHUFB -> PSLLD (with 3 or 6) -> PAND | |
// for Y and Z since it lets us reuse the same shuffle control vector for all three coordinates. | |
// Input: vectors of 32-bit X/Y/Z coords
// Output: 32-bit Morton code
// SSSE3 path: PSHUFB replicates each coordinate's low bytes into the right
// output byte slots, a shift + AND keeps just that coordinate's bits, and a
// pair of nibble LUTs (again via PSHUFB) performs the per-byte bit permutation.
static Vec128_U32 MortonEncode3D_Vec(Vec128_U32 X, Vec128_U32 Y, Vec128_U32 Z)
{
    // Constants; in a loop these load once and then stay in registers.
    const Vec128_U32 RepeatBytes = _mm_setr_epi8(0,0,0,1, 4,4,4,5, 8,8,8,9, 12,12,12,13); // low byte x3, then byte 1, per lane
    const Vec128_U32 KeepX       = _mm_set1_epi32(0x07c03807); // X's bits within each output byte
    const Vec128_U32 KeepY       = _mm_set1_epi32(0x3807c038); // Y's bits (after << 3)
    const Vec128_U32 KeepZ       = _mm_set1_epi32(0xc03807c0); // Z's bits (after << 6)
    const Vec128_U32 NibbleMask  = _mm_set1_epi8(0xf);
    // Per-byte bit permutation, split into the contribution of each nibble:
    const Vec128_U32 LoNibbleLUT = _mm_setr_epi8(0x00, 0x01, 0x08, 0x09, 0x40, 0x41, 0x48, 0x49, 0x02, 0x03, 0x0a, 0x0b, 0x42, 0x43, 0x4a, 0x4b);
    const Vec128_U32 HiNibbleLUT = _mm_setr_epi8(0x00, 0x10, 0x80, 0x90, 0x04, 0x14, 0x84, 0x94, 0x20, 0x30, 0xa0, 0xb0, 0x24, 0x34, 0xa4, 0xb4);
    // Byte-replicate each coordinate, align it, and mask to its bits.
    const Vec128_U32 PartX = _mm_and_si128(_mm_shuffle_epi8(X, RepeatBytes), KeepX);
    const Vec128_U32 PartY = _mm_and_si128(_mm_slli_epi32(_mm_shuffle_epi8(Y, RepeatBytes), 3), KeepY);
    const Vec128_U32 PartZ = _mm_and_si128(_mm_slli_epi32(_mm_shuffle_epi8(Z, RepeatBytes), 6), KeepZ);
    const Vec128_U32 Combined = _mm_or_si128(PartX, _mm_or_si128(PartY, PartZ));
    // Permute the bits inside every byte: look up what each nibble maps to, then merge.
    const Vec128_U32 FromLo = _mm_shuffle_epi8(LoNibbleLUT, _mm_and_si128(Combined, NibbleMask));
    const Vec128_U32 FromHi = _mm_shuffle_epi8(HiNibbleLUT, _mm_and_si128(_mm_srli_epi16(Combined, 4), NibbleMask));
    return _mm_or_si128(FromLo, FromHi);
}
// Input: vectors of 32-bit X/Y/Z coords
// Output: 32-bit Morton code
// AVX512 VBMI+GFNI path: VPMULTISHIFTQB (or PSHUFB for X, whose bits already
// line up bytewise) positions each coordinate's bits per output byte, two
// VPTERNLOGD selects merge the three streams, and a single GF2P8AFFINEQB
// applies the per-byte bit permutation.
static Vec128_U32 MortonEncode3D_Fancy(Vec128_U32 X, Vec128_U32 Y, Vec128_U32 Z)
{
    // Constants; in a loop these load once and then stay in registers.
    const Vec128_U32 ByteRepeat  = _mm_setr_epi8(0,0,0,1, 4,4,4,5, 8,8,8,9, 12,12,12,13);
    const Vec128_U32 MultishiftY = _mm_setr_epi8(-3,-3,5,5, 29,29,37,37, -3,-3,5,5, 29,29,37,37); // per-byte bit offsets into Y
    const Vec128_U32 MultishiftZ = _mm_setr_epi8(-6,2,2,2, 26,34,34,34, -6,2,2,2, 26,34,34,34);   // per-byte bit offsets into Z
    const Vec128_U32 SelectY     = _mm_set1_epi32(0x3807c038); // where Y's bits live in the output
    const Vec128_U32 SelectZ     = _mm_set1_epi32(0xc03807c0); // where Z's bits live in the output
    const Vec128_U32 PermBits    = _mm_set_epi8(0x01, 0x08, 0x40, 0x02, 0x10, 0x80, 0x04, 0x20, 0x01, 0x08, 0x40, 0x02, 0x10, 0x80, 0x04, 0x20);
    // Position every coordinate's source bits within each output byte.
    const Vec128_U32 AlignedX = _mm_shuffle_epi8(X, ByteRepeat);
    const Vec128_U32 AlignedY = _mm_multishift_epi64_epi8(MultishiftY, Y);
    const Vec128_U32 AlignedZ = _mm_multishift_epi64_epi8(MultishiftZ, Z);
    // Merge with two bitwise selects. Immediate 0xd8 == (A & ~C) | (B & C),
    // i.e. take the second operand where the mask (third operand) is set,
    // else the first. (A=0xF0, B=0xCC, C=0xAA: (0xF0&~0xAA)|(0xCC&0xAA) = 0xd8.)
    const Vec128_U32 WithY  = _mm_ternarylogic_epi32(AlignedX, AlignedY, SelectY, 0xd8);
    const Vec128_U32 Merged = _mm_ternarylogic_epi32(WithY, AlignedZ, SelectZ, 0xd8);
    // Final per-byte bit permutation via 8x8 bit-matrix multiply.
    return _mm_gf2p8affine_epi64_epi8(Merged, PermBits, 0x00);
}
// Scalar reference encoder: Dest[i] = Morton code of (X[i], Y[i], Z[i]).
static void BatchEncode_Ref(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    for (size_t Index = 0; Index < Count; ++Index)
        Dest[Index] = MortonEncode3D(X[Index], Y[Index], Z[Index]);
}
// Unaligned load of four consecutive uint32 values into a vector.
static inline Vec128_U32 Load128_U32(const uint32* Values)
{
    const __m128i* Src = reinterpret_cast<const __m128i*>(Values);
    return _mm_loadu_si128(Src);
}
// Unaligned store of a 4-lane vector to Dest.
static void Store128_U32(uint32* Dest, Vec128_U32 Vec)
{
    __m128i* Target = reinterpret_cast<__m128i*>(Dest);
    _mm_storeu_si128(Target, Vec);
}
// SSSE3 batch encoder. Count must be a multiple of 4 (the vector width).
static void BatchEncode_Vec(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    assert(Count % 4 == 0);
    for (size_t Base = 0; Base < Count; Base += 4)
    {
        const Vec128_U32 Vx = Load128_U32(X + Base);
        const Vec128_U32 Vy = Load128_U32(Y + Base);
        const Vec128_U32 Vz = Load128_U32(Z + Base);
        Store128_U32(Dest + Base, MortonEncode3D_Vec(Vx, Vy, Vz));
    }
}
// AVX512 VBMI+GFNI batch encoder. Count must be a multiple of 4.
static void BatchEncode_Fancy(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    assert(Count % 4 == 0);
    for (size_t Base = 0; Base < Count; Base += 4)
    {
        const Vec128_U32 Vx = Load128_U32(X + Base);
        const Vec128_U32 Vy = Load128_U32(Y + Base);
        const Vec128_U32 Vz = Load128_U32(Z + Base);
        Store128_U32(Dest + Base, MortonEncode3D_Fancy(Vx, Vy, Vz));
    }
}
// Small PCG-style generator: multiplicative congruential (MCG) state advance
// with an XSH-RR output permutation (xorshift-high, then random rotate).
class Rng
{
    // Multiplier from the PCG reference implementation.
    static const uint64_t MCG_MUL = 6364136223846793005ull;
    uint64_t state;
public:
    // Build a generator from a 32-bit seed value.
    static Rng seed(uint32_t seed)
    {
        // State may not be 0 (MCG); that's why we flip the low bits
        // also do one multiply step in case the input is a small integer (which it often is).
        Rng r;
        r.state = ~(uint64_t)seed | (uint64_t(seed) << 32);
        // The multiply step promised above was missing; MCG_MUL is odd, so a
        // nonzero state stays nonzero, and small seeds get diffused upward.
        r.state *= MCG_MUL;
        return r;
    }
    // Random 32-bit uint
    uint32_t random()
    {
        uint64_t oldstate = state;
        // XSH-RR output function: xorshift the high bits down, then rotate
        // by the top 5 bits of the state.
        uint32_t rot_input = (uint32_t) (((oldstate >> 18) ^ oldstate) >> 27);
        uint32_t rot_amount = (uint32_t) (oldstate >> 59);
        uint32_t output = (rot_input >> rot_amount) | (rot_input << ((0u - rot_amount) & 31)); // rotr(rot_input, rot_amount)
        // Advance multiplicative congruential generator
        // Constant from PCG reference impl.
        state = oldstate * MCG_MUL;
        return output;
    }
};
int main() | |
{ | |
Rng random = Rng::seed(12345); | |
static const size_t Count = 4096; | |
uint32 X[Count], Y[Count], Z[Count]; | |
uint32 Ref[Count], Tst[Count]; | |
for (size_t i = 0; i < Count; i++) | |
{ | |
X[i] = random.random(); | |
Y[i] = random.random(); | |
Z[i] = random.random(); | |
} | |
BatchEncode_Ref(Ref, X, Y, Z, Count); | |
printf("Testing Vec:\n"); | |
BatchEncode_Vec(Tst, X, Y, Z, Count); | |
for (size_t i = 0; i < Count; i++) | |
{ | |
if (Ref[i] != Tst[i]) | |
{ | |
printf("Mismatch! i=%zd X=0x%08x Y=0x%08x Z=0x%08x Ref=0x%08x Tst=0x%08x\n", i, X[i], Y[i], Z[i], Ref[i], Tst[i]); | |
return 1; | |
} | |
} | |
printf("Testing Fancy:\n"); | |
BatchEncode_Fancy(Tst, X, Y, Z, Count); | |
for (size_t i = 0; i < Count; i++) | |
{ | |
if (Ref[i] != Tst[i]) | |
{ | |
printf("Mismatch! i=%zd X=0x%08x Y=0x%08x Z=0x%08x Ref=0x%08x Tst=0x%08x\n", i, X[i], Y[i], Z[i], Ref[i], Tst[i]); | |
return 1; | |
} | |
} | |
printf("all OK!\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment