// Vectorized 3D Morton encoding for no reason
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
typedef uint32_t uint32;
typedef __m128i Vec128_U32;
// "Insert" two 0 bits after each of the 11 low bits of x
static uint32 Part1By2(uint32 x)
{
    x &= 0x000007ff;                  // x = ---- ---- ---- ---- ---- -a98 7654 3210
    x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- -a98 ---- ---- ---- ---- 7654 3210
    x = (x ^ (x << 8)) & 0x0700f00f;  // x = ---- -a98 ---- ---- 7654 ---- ---- 3210
    x = (x ^ (x << 4)) & 0x430c30c3;  // x = -a-- --98 ---- 76-- --54 ---- 32-- --10
    x = (x ^ (x << 2)) & 0x49249249;  // x = -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
    return x;
}
static uint32 MortonEncode3D(uint32 X, uint32 Y, uint32 Z)
{
    return Part1By2(X) | (Part1By2(Y) << 1) | (Part1By2(Z) << 2);
}
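// A few quick sanity checks for the scalar reference (an added sketch, not
// part of the original gist; the expected values just follow from the
// interleave order: bit 0 of the code is x0, bit 1 is y0, bit 2 is z0,
// bit 3 is x1, and so on):
static void SanityCheckScalar()
{
    assert(MortonEncode3D(1, 0, 0) == 1);    // x0 -> bit 0
    assert(MortonEncode3D(0, 1, 0) == 2);    // y0 -> bit 1
    assert(MortonEncode3D(0, 0, 1) == 4);    // z0 -> bit 2
    assert(MortonEncode3D(3, 0, 0) == 9);    // x0 -> bit 0, x1 -> bit 3
    assert(MortonEncode3D(7, 7, 3) == 0xff); // x0..x2, y0..y2, z0..z1 fill the low byte
}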
// The original idea was to do 3D Morton order encoding using VBMI and GFNI operations.
//
// Unlike in the example code above, here and in the following I'll write the
// bits from left to right, starting at bit 0. This is just a bit permutation
// on a vector, and I'm done pretending these are numbers, given how
// notationally inconvenient that turns out to be in the following (since we
// switch between working on bits and bytes within a little-endian word).
//
// Anyway, we start with two 11-bit vectors for x and y and one 10-bit vector for z,
// which I'll group into nibbles for clarity: (the | denotes byte boundaries)
//
// x0x1x2x3 x4x5x6x7|x8x9xa
// y0y1y2y3 y4y5y6y7|y8y9ya
// z0z1z2z3 z4z5z6z7|z8z9
//
// The result we want is this:
//
// x0y0z0x1 y1z1x2y2|z2x3y3z3 x4y4z4x5|y5z5x6y6 z6x7y7z7|x8y8z8x9 y9z9xaya
//
// and the underlying observation is that we have the same bit permutation
// in every byte, taking 3 bits, 3 bits, then 2 bits' worth from the first,
// second and third coordinate in that byte's rotation order, respectively.
//
// The first byte interleaves coordinates in the order x, y, z and starts at
// bit 0 in all components.
// The second byte goes z, x, y, starting at bit 2 in z and 3 in x and y.
// Third byte: y,z,x starting at 5,5,6 respectively.
// Fourth byte: x,y,z all starting at 8.
//
// This is a useful observation because it means that we can figure out how to
// do the bit permutation within a single byte (e.g. byte 0) and then use that
// same permutation for every byte of the output. Now all we need to do is to
// implement that permutation, and figure out how to set up its input!
//
// Implementing the permutation is easy with GFNI: GF2P8AFFINEQB is a full 8x8
// bit matrix multiply, which certainly includes all 8x8 permutation matrices,
// so we can trivially reorder the bits in a byte arbitrarily.
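//
// As an added sketch (not in the original gist) of how such a matrix constant
// can be built: if output bit i is to come from input bit Perm[i], then row i
// of the matrix is the byte (1 << Perm[i]), and GF2P8AFFINEQB expects row i in
// byte (7 - i) of each 64-bit lane of its matrix operand:
static Vec128_U32 MakeBytePermuteMatrix(const uint8_t Perm[8])
{
    uint64_t Matrix = 0;
    for (int i = 0; i < 8; i++)
        Matrix |= (uint64_t)(1u << Perm[i]) << (8 * (7 - i));
    return _mm_set1_epi64x((int64_t)Matrix);
}
// With Perm = {0,3,6,1,4,7,2,5} (the per-byte output order worked out below),
// this reproduces the PermuteMatrix constant in MortonEncode3D_Fancy.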
//
// Now, working backwards from that, we can apply the inverse permutation to
// the desired output to see what we need to set up:
// x0y0z0x1 y1z1x2y2|z2x3y3z3 x4y4z4x5|y5z5x6y6 z6x7y7z7|x8y8z8x9 y9z9xaya
//
// bytewise deinterleaves into:
// x0x1x2y0 y1y2z0z1|z2z3z4x3 x4x5y3y4|y5y6y7z5 z6z7x6x7|x8x9xay8 y9yaz8z9
//
// which we can then mask to just the X, Y and Z parts to make things more clear:
// x0x1x2-- --------|------x3 x4x5----|-------- ----x6x7|x8x9xa-- --------
// ------y0 y1y2----|-------- ----y3y4|y5y6y7-- --------|------y8 y9ya----
// -------- ----z0z1|z2z3z4-- --------|------z5 z6z7----|-------- ----z8z9
//
// as we can now see, each of these 3 individually consists of short contiguous
// runs of bits from its source coordinate, one run per byte, which would not
// be much of a problem to do directly using conventional bit shifts. But since
// every byte contains at most one such run (by construction!), we can grab the
// relevant bits for each byte via VPMULTISHIFTQB straight from a vector of
// source X/Y/Z coordinates (respectively).
// Then we need to mask to just the bits we care about and combine the three bit vectors
// to get our four bytes per 32b lane that are the input into our bit matrix op.
// The combining here can be done with two auxiliary masks and two VPTERNLOGD instructions.
// The end result is that (provided we have the right constants loaded) we can do a 3-way
// vector bit interleave (3D Morton encode) in 6 instructions using GFNI+VBMI. I illustrated
// this for 32-bit outputs, but 16/64-bit outputs work easily as well. (For anything larger,
// crossing 64-bit boundaries in VPMULTISHIFTQB would need some extra work.)
//
// This is implemented in MortonEncode3D_Fancy below.
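//
// As an added scalar model (not in the original gist) of what VPMULTISHIFTQB
// computes: every control byte selects a bit offset, and the corresponding
// result byte is the 8 consecutive bits of the matching 64-bit data lane
// starting at that offset (taken mod 64, wrapping within the lane):
static uint64_t MultishiftQB_Model(uint64_t Control, uint64_t Data)
{
    uint64_t Result = 0;
    for (int i = 0; i < 8; i++)
    {
        uint32 Offset = (uint32)(Control >> (8 * i)) & 63; // control byte i, mod 64
        uint64_t Rotated = (Data >> Offset) | (Data << ((64 - Offset) & 63));
        Result |= (Rotated & 0xff) << (8 * i);
    }
    return Result;
}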
//
// But as it usually goes when playing with fancy AVX512 operations, I soon realized that a
// similar approach is viable in older ISA variants as well. In this case, we can do a passable
// job with anything that has at least SSSE3 (PSHUFB is the key).
//
// In particular, we used GF2P8AFFINEQB for our bit reordering above, and that's
// nice when it's available, but it's absolutely not required.
//
// We can use PSHUFB to get a lookup into a table of 16 8b entries (standard technique) per
// byte. In our case, for the bit permutation, we do one table lookup to see what the low nibble
// should map to, one table lookup to see what the high nibble should map to, and then OR the
// two together.
//
// That's more expensive than a single instruction, but we only do this step once, at the very end,
// so it's not the end of the world.
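//
// For reference, the two 16-entry LUTs can be generated from the same bit
// permutation Perm used for the GFNI matrix (an added sketch, not in the
// original gist): permute each possible low nibble and each possible high
// nibble of a byte in scalar code.
static void MakeNibbleLUTs(const uint8_t Perm[8], uint8_t LoLUT[16], uint8_t HiLUT[16])
{
    for (uint32 n = 0; n < 16; n++)
    {
        uint8_t Lo = 0, Hi = 0;
        for (int i = 0; i < 8; i++)
        {
            Lo |= (((n     ) >> Perm[i]) & 1) << i; // input byte = low nibble n
            Hi |= (((n << 4) >> Perm[i]) & 1) << i; // input byte = high nibble n << 4
        }
        LoLUT[n] = Lo;
        HiLUT[n] = Hi;
    }
}
// With Perm = {0,3,6,1,4,7,2,5}, this produces exactly the LoLUT/HiLUT
// constants used in MortonEncode3D_Vec below.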
//
// That leaves the multishift action. That, too, simplifies considerably. Repeating the deinterleaved
// x values from above:
// x0x1x2-- --------|------x3 x4x5----|-------- ----x6x7|x8x9xa-- --------
//
// note that the x3 in byte 1 sits in exactly the same bit position within its
// byte as the actual x3 bit in our input number does. In fact, this is true
// for all of these values. We don't need
// arbitrary per-byte shift amounts at all. All we really need to do here is repeat the first
// (low-order, since we're little-endian) byte of the X coordinate 3 times, then the second byte
// of the X coordinate, and finally use an AND mask. This is just PSHUFB + AND.
//
// The Y and Z coordinates are slightly more complicated, but really only slightly:
// ------y0 y1y2----|-------- ----y3y4|y5y6y7-- --------|------y8 y9ya----
// -------- ----z0z1|z2z3z4-- --------|------z5 z6z7----|-------- ----z8z9
//
// Note that again the bit positions within bytes all line up. We just need different masks
// and it turns out to be advantageous to use the sequence PSHUFB -> PSLLD (with 3 or 6) -> PAND
// for Y and Z since it lets us reuse the same shuffle control vector for all three coordinates.
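//
// Putting the whole SSSE3 path together in scalar code for one 32-bit lane
// (an added model for exposition, not in the original gist; the vector
// version below does the same thing for four lanes at a time):
static uint32 MortonEncode3D_Model(uint32 X, uint32 Y, uint32 Z)
{
    // PSHUFB with ShufS: per lane, result bytes 0..3 = source bytes 0,0,0,1
    uint32 SX = (X & 0xff) * 0x00010101u + ((X & 0xff00) << 16);
    uint32 SY = (Y & 0xff) * 0x00010101u + ((Y & 0xff00) << 16);
    uint32 SZ = (Z & 0xff) * 0x00010101u + ((Z & 0xff00) << 16);
    // Shift Y/Z into place, mask each coordinate to its bits, merge
    uint32 Merged = (SX & 0x07c03807) | ((SY << 3) & 0x3807c038) | ((SZ << 6) & 0xc03807c0);
    // Per-byte bit permutation (the double nibble-LUT PSHUFB step below)
    static const uint8_t Perm[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
    uint32 Result = 0;
    for (int i = 0; i < 8; i++)
        Result |= ((Merged >> Perm[i]) & 0x01010101u) << i;
    return Result;
}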
// Input: vectors of 32-bit X/Y/Z coords
// Output: 32-bit Morton code
// (SSSE3 version)
static Vec128_U32 MortonEncode3D_Vec(Vec128_U32 X, Vec128_U32 Y, Vec128_U32 Z)
{
    // In a loop, all these constants only need to get loaded once:
    Vec128_U32 ShufS = _mm_setr_epi8(0,0,0,1, 4,4,4,5, 8,8,8,9, 12,12,12,13); // per lane: source bytes 0,0,0,1
    Vec128_U32 MaskX = _mm_set1_epi32(0x07c03807); // deinterleaved x bit positions
    Vec128_U32 MaskY = _mm_set1_epi32(0x3807c038); // deinterleaved y bit positions (after <<3)
    Vec128_U32 MaskZ = _mm_set1_epi32(0xc03807c0); // deinterleaved z bit positions (after <<6)
    Vec128_U32 MaskNib = _mm_set1_epi8(0xf);
    Vec128_U32 LoLUT = _mm_setr_epi8(0x00, 0x01, 0x08, 0x09, 0x40, 0x41, 0x48, 0x49, 0x02, 0x03, 0x0a, 0x0b, 0x42, 0x43, 0x4a, 0x4b);
    Vec128_U32 HiLUT = _mm_setr_epi8(0x00, 0x10, 0x80, 0x90, 0x04, 0x14, 0x84, 0x94, 0x20, 0x30, 0xa0, 0xb0, 0x24, 0x34, 0xa4, 0xb4);
    // Actual work: replicate source bytes, shift Y/Z into place, mask, merge
    Vec128_U32 ShiftedX = _mm_and_si128(_mm_shuffle_epi8(X, ShufS), MaskX);
    Vec128_U32 ShiftedY = _mm_and_si128(_mm_slli_epi32(_mm_shuffle_epi8(Y, ShufS), 3), MaskY);
    Vec128_U32 ShiftedZ = _mm_and_si128(_mm_slli_epi32(_mm_shuffle_epi8(Z, ShufS), 6), MaskZ);
    Vec128_U32 Merged = _mm_or_si128(_mm_or_si128(ShiftedX, ShiftedY), ShiftedZ);
    // Per-byte bit permutation via two nibble lookups, ORed together
    Vec128_U32 LoResult = _mm_shuffle_epi8(LoLUT, _mm_and_si128(Merged, MaskNib));
    Vec128_U32 HiResult = _mm_shuffle_epi8(HiLUT, _mm_and_si128(_mm_srli_epi16(Merged, 4), MaskNib));
    return _mm_or_si128(LoResult, HiResult);
}
// Input: vectors of 32-bit X/Y/Z coords
// Output: 32-bit Morton code
// (needs GFNI plus AVX-512 VBMI and VL)
static Vec128_U32 MortonEncode3D_Fancy(Vec128_U32 X, Vec128_U32 Y, Vec128_U32 Z)
{
    // In a loop, all these constants only need to get loaded once:
    Vec128_U32 ShufS = _mm_setr_epi8(0,0,0,1, 4,4,4,5, 8,8,8,9, 12,12,12,13);
    Vec128_U32 ShiftsY = _mm_setr_epi8(-3,-3,5,5, 29,29,37,37, -3,-3,5,5, 29,29,37,37); // multishift bit offsets, mod 64
    Vec128_U32 ShiftsZ = _mm_setr_epi8(-6,2,2,2, 26,34,34,34, -6,2,2,2, 26,34,34,34);
    Vec128_U32 MaskY = _mm_set1_epi32(0x3807c038);
    Vec128_U32 MaskZ = _mm_set1_epi32(0xc03807c0);
    Vec128_U32 PermuteMatrix = _mm_set_epi8(0x01, 0x08, 0x40, 0x02, 0x10, 0x80, 0x04, 0x20, 0x01, 0x08, 0x40, 0x02, 0x10, 0x80, 0x04, 0x20);
    // Set up 3 pre-shifted inputs
    Vec128_U32 ShiftedX = _mm_shuffle_epi8(X, ShufS);
    Vec128_U32 ShiftedY = _mm_multishift_epi64_epi8(ShiftsY, Y);
    Vec128_U32 ShiftedZ = _mm_multishift_epi64_epi8(ShiftsZ, Z);
    // Merge: the immediate works out to 0xD8, a bitwise select
    // (where mask C is set, take B, else take A)
    constexpr uint8_t TERNLOG_A = 0xF0;
    constexpr uint8_t TERNLOG_B = 0xCC;
    constexpr uint8_t TERNLOG_C = 0xAA;
    Vec128_U32 MergedXY = _mm_ternarylogic_epi32(ShiftedX, ShiftedY, MaskY, (TERNLOG_A & ~TERNLOG_C) | (TERNLOG_B & TERNLOG_C));
    Vec128_U32 Merged = _mm_ternarylogic_epi32(MergedXY, ShiftedZ, MaskZ, (TERNLOG_A & ~TERNLOG_C) | (TERNLOG_B & TERNLOG_C));
    // Bit permute within each byte via 8x8 bit matrix multiply
    return _mm_gf2p8affine_epi64_epi8(Merged, PermuteMatrix, 0x00);
}
static void BatchEncode_Ref(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    for (size_t i = 0; i < Count; i++)
    {
        Dest[i] = MortonEncode3D(X[i], Y[i], Z[i]);
    }
}
static inline Vec128_U32 Load128_U32(const uint32* Values)
{
    return _mm_loadu_si128((const __m128i *) Values);
}
static void Store128_U32(uint32* Dest, Vec128_U32 Vec)
{
    _mm_storeu_si128((__m128i *)Dest, Vec);
}
static void BatchEncode_Vec(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    assert(Count % 4 == 0);
    for (size_t i = 0; i < Count; i += 4)
    {
        Store128_U32(Dest + i, MortonEncode3D_Vec(Load128_U32(X + i), Load128_U32(Y + i), Load128_U32(Z + i)));
    }
}
static void BatchEncode_Fancy(uint32* Dest, const uint32* X, const uint32* Y, const uint32* Z, size_t Count)
{
    assert(Count % 4 == 0);
    for (size_t i = 0; i < Count; i += 4)
    {
        Store128_U32(Dest + i, MortonEncode3D_Fancy(Load128_U32(X + i), Load128_U32(Y + i), Load128_U32(Z + i)));
    }
}
class Rng
{
    static const uint64_t MCG_MUL = 6364136223846793005ull;
    uint64_t state;

public:
    static Rng seed(uint32_t seed)
    {
        // State may not be 0 (MCG); that's why we flip the low bits.
        // Also do one multiply step in case the input is a small integer (which it often is).
        Rng r;
        r.state = ~(uint64_t)seed | (uint64_t(seed) << 32);
        r.state *= MCG_MUL;
        return r;
    }

    // Random 32-bit uint
    uint32_t random()
    {
        uint64_t oldstate = state;
        uint32_t rot_input = (uint32_t) (((oldstate >> 18) ^ oldstate) >> 27);
        uint32_t rot_amount = (uint32_t) (oldstate >> 59);
        uint32_t output = (rot_input >> rot_amount) | (rot_input << ((0u - rot_amount) & 31)); // rotr(rot_input, rot_amount)
        // Advance multiplicative congruential generator.
        // Constant from PCG reference impl.
        state = oldstate * MCG_MUL;
        return output;
    }
};
int main()
{
    Rng random = Rng::seed(12345);
    static const size_t Count = 4096;
    uint32 X[Count], Y[Count], Z[Count];
    uint32 Ref[Count], Tst[Count];
    for (size_t i = 0; i < Count; i++)
    {
        X[i] = random.random();
        Y[i] = random.random();
        Z[i] = random.random();
    }
    BatchEncode_Ref(Ref, X, Y, Z, Count);
    printf("Testing Vec:\n");
    BatchEncode_Vec(Tst, X, Y, Z, Count);
    for (size_t i = 0; i < Count; i++)
    {
        if (Ref[i] != Tst[i])
        {
            printf("Mismatch! i=%zu X=0x%08x Y=0x%08x Z=0x%08x Ref=0x%08x Tst=0x%08x\n", i, X[i], Y[i], Z[i], Ref[i], Tst[i]);
            return 1;
        }
    }
    printf("Testing Fancy:\n");
    BatchEncode_Fancy(Tst, X, Y, Z, Count);
    for (size_t i = 0; i < Count; i++)
    {
        if (Ref[i] != Tst[i])
        {
            printf("Mismatch! i=%zu X=0x%08x Y=0x%08x Z=0x%08x Ref=0x%08x Tst=0x%08x\n", i, X[i], Y[i], Z[i], Ref[i], Tst[i]);
            return 1;
        }
    }
    printf("all OK!\n");
    return 0;
}