Created
June 14, 2015 02:43
-
-
Save jbarczak/bb3f34df8999c06a076b to your computer and use it in GitHub Desktop.
ray reordering with shuffle lut
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Tried this, and it was marginally slower.
//
// Some notes about this:
// 1. Separate hit/miss arrays force me to use a lot more stack than I did before, and
//    probably don't use the cache quite as well.
// 2. The prefetching of the rays doesn't fit in quite as neatly, and doesn't help anymore if I stick it in there.
//    It might make more sense to move that elsewhere anyway.
// 3. The LUT is 256 bytes. Not too bad, but it's probably knocking a few rays out of the cache.
// 4. Reordering can produce at least one packet that is partially miss and partially hit.
//    Separate arrays mean I need to split the ray movement into 2 passes, with a special case in between for the half-full one.
// Byte-shuffle controls for _mm_shuffle_epi8 that left-pack selected 32-bit lanes.
//
// The table is indexed by a 4-bit lane mask: bit k set means "keep 32-bit lane k"
// (source bytes 4k .. 4k+3).  Each entry lists the source bytes of the kept lanes
// in ascending lane order, packed towards byte 0.  The remaining control bytes are
// -1; a control byte with its high bit set makes _mm_shuffle_epi8 write zero to
// that destination byte, so the unused tail of the result is zero-filled.
//
// Size: 16 entries * 16 bytes = 256 bytes.
static const __m128i SHUFFLE_TABLE[16] = {
    _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1),   // 0000: no lanes
    _mm_setr_epi8( 0, 1, 2, 3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1),   // 0001: lane 0
    _mm_setr_epi8( 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1),   // 0010: lane 1
    _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1),   // 0011: lanes 0,1
    _mm_setr_epi8( 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1),   // 0100: lane 2
    _mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1),   // 0101: lanes 0,2
    _mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1),   // 0110: lanes 1,2
    _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1),   // 0111: lanes 0,1,2
    _mm_setr_epi8(12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1),   // 1000: lane 3
    _mm_setr_epi8( 0, 1, 2, 3,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1),   // 1001: lanes 0,3
    _mm_setr_epi8( 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1),   // 1010: lanes 1,3
    _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1),   // 1011: lanes 0,1,3
    _mm_setr_epi8( 8, 9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1),   // 1100: lanes 2,3
    _mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1),   // 1101: lanes 0,2,3
    _mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1),   // 1110: lanes 1,2,3
    _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15),   // 1111: all lanes
};
uint32 pHitIDs[MAX_TRACER_SIZE+8]; | |
uint32 pMissIDs[MAX_TRACER_SIZE+8]; | |
size_t nHits = 0; | |
size_t nMisses = 0; | |
const char* pRays = (const char*) frame.pRays; | |
for( size_t i=0; i<nGroups; i++ ) | |
{ | |
uint64 hits = frame.pMasks[i]; | |
uint64 misses = hits ^ 0xff; | |
uint64 hit_lo = (hits & 0x0f); | |
uint64 hit_hi = (hits & 0xf0)>>4; | |
uint64 miss_lo = (misses & 0x0f); | |
uint64 miss_hi = (misses & 0xf0)>>4; | |
// load lo/hi ID pairs | |
// NOTE: These are 32-bit, because they're byte offsets from start of ray array | |
// This enables the ray read to avoid using shifts to multiply by sizeof(Ray) | |
// Could try doing the shuffles with m256, but stores would need an extract, which defeats the purpose | |
uint32* __restrict pPacketRayIDs = pPackets[i]->RayOffsets; | |
__m128i id_lo = _mm_load_si128( (__m128i*) pPacketRayIDs ); | |
__m128i id_hi = _mm_load_si128( (__m128i*) (pPacketRayIDs+4) ); | |
// store hit/miss iDs | |
__m128i vhit_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[hit_lo] ); | |
__m128i vhit_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[hit_hi] ); | |
__m128i vmiss_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[miss_lo] ); | |
__m128i vmiss_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[miss_hi] ); | |
_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_lo ); | |
_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_lo ); | |
// NOTE: Tried replacing _mm_popcnts with an 8-bit LUT. This was even slower | |
nHits += _mm_popcnt_u64(hit_lo); | |
nMisses += _mm_popcnt_u64(miss_lo); | |
_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_hi ); | |
_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_hi ); | |
nHits += _mm_popcnt_u64(hit_hi); | |
nMisses += _mm_popcnt_u64(miss_hi); | |
} | |
// Probably going to try an unrolled variation of the prefix sum. You're right that I could use 16-bit for adds, but if I use 8-bit I can squeeze
// an extra ray group into the upper halves.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment