multigetbits
#include <stdint.h>
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3 (PSHUFB)

static inline __m128i prefix_sum_u8(__m128i x)
{
#if 1
    // alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
    // for us)
    x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
    x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
    x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
    x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
#else
    // x[0], x[1], x[2], x[3], ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 1));
    // x[0], sum(x[0:2]), sum(x[1:3]), sum(x[2:4]), ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 2));
    // x[0], sum(x[0:2]), sum(x[0:3]), sum(x[0:4]), sum(x[1:5]), sum(x[2:6]), ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 4));
    // longest group now sums over 8 elems
    x = _mm_add_epi8(x, _mm_slli_si128(x, 8));
#endif
    // and now we're done
    return x;
}
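
// Quick illustration (added; not part of the original gist): for input bytes
//   { 3, 5, 0, 7, 2, ... }
// prefix_sum_u8 returns the inclusive prefix sums
//   { 3, 8, 8, 15, 17, ... }
// i.e. lane i ends up holding (x[0] + ... + x[i]) mod 256, which is what
// multigetbits8b below uses to turn per-lane field widths into end bit positions.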
static inline __m128i big_endian_load_shift128(const uint8_t *in_ptr, uint32_t bit_basepos)
{
    // Grab 128 source bits starting "bit_basepos" bits into the bit stream
    __m128i src_byte0 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 0));
    __m128i src_byte1 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 1));
    // We need to consume the first (bit_basepos & 7) bits with a big-endian 128-bit
    // funnel shift, which we don't have ready at hand, so we need to get creative;
    // specifically, use 16-bit shifts by a single distance for all lanes (which we have)
    // and make sure to not grab any bits that crossed byte boundaries (which would be
    // taken from the wrong byte due to the endianness difference)
    uint32_t basepos7 = bit_basepos & 7;
    __m128i basepos_shiftamt = _mm_cvtsi32_si128(basepos7);
    // Combine to big-endian 16-bit words and shift those since we don't have 8-bit shifts
    // at hand
    __m128i merged0 = _mm_unpacklo_epi8(src_byte1, src_byte0);
    __m128i merged1 = _mm_unpackhi_epi8(src_byte1, src_byte0);
    __m128i shifted0 = _mm_sll_epi16(merged0, basepos_shiftamt);
    __m128i shifted1 = _mm_sll_epi16(merged1, basepos_shiftamt);
    __m128i reduced0 = _mm_srli_epi16(shifted0, 8);
    __m128i reduced1 = _mm_srli_epi16(shifted1, 8);
    __m128i shifted_src_bytes = _mm_packus_epi16(reduced0, reduced1);
    return shifted_src_bytes;
}
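
// Illustration (added; not part of the original gist): with bit_basepos = 13,
// this reads 16-byte vectors starting at in_ptr + 1 and shifts the whole thing
// left by 5 bits, so byte i of the result is
//   (in[1 + i] << 5 | in[2 + i] >> 3) & 0xff
// i.e. the 128 bits of the MSB-first stream starting at absolute bit 13,
// repacked into 16 whole bytes.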
// individual field_widths in [0,8]
// MSB-first bit packing convention, SSSE3+
//
// Once more, with feeling
// trying to figure out a niftier way to do this that'll also allow me to do
// wider variants that don't suck
static inline __m128i multigetbits8b(const uint8_t *in_ptr, uint32_t *pbit_basepos, __m128i field_widths)
{
    uint32_t bit_basepos = *pbit_basepos;
    // Prefix-sum the field widths and advance bit position pointer
    __m128i end_bit_index = prefix_sum_u8(field_widths);
    uint32_t total_width = (uint32_t)_mm_extract_epi16(end_bit_index, 7) >> 8; // no PEXTRB before SSE4.1, and this is the only place where SSE4.1+ helps
    *pbit_basepos = bit_basepos + total_width;
    // Doing this shift is a bit of a production, but it simplifies the rest.
    __m128i shifted_src_bytes = big_endian_load_shift128(in_ptr, bit_basepos);
    __m128i end_byte_index = _mm_and_si128(_mm_srli_epi16(end_bit_index, 3), _mm_set1_epi8(0x1f)); // no "shift bytes", sigh.
    // Grab first/second bytes for every lane
    __m128i byte1 = _mm_shuffle_epi8(shifted_src_bytes, end_byte_index);
    __m128i byte0 = _mm_shuffle_epi8(shifted_src_bytes, _mm_sub_epi8(end_byte_index, _mm_set1_epi8(1)));
    // Assemble big-endian words (byte0 << 8) | byte1
    __m128i words0 = _mm_unpacklo_epi8(byte1, byte0);
    __m128i words1 = _mm_unpackhi_epi8(byte1, byte0);
    // Now do a left shift by (end_bit_index & 7) via a multiply by 1 << (end_bit_index & 7),
    // putting the end of the bit field at the boundary between the low and high byte
    // in every word.
    __m128i end_bit_index7 = _mm_and_si128(end_bit_index, _mm_set1_epi8(7));
    __m128i left_shift_lut = _mm_setr_epi8(1,2,4,8, 16,32,64,-128, 1,2,4,8, 16,32,64,-128);
    __m128i shiftm = _mm_shuffle_epi8(left_shift_lut, end_bit_index7);
    __m128i shift_mul0 = _mm_unpacklo_epi8(shiftm, _mm_setzero_si128());
    __m128i shift_mul1 = _mm_unpackhi_epi8(shiftm, _mm_setzero_si128());
    __m128i shifted0 = _mm_mullo_epi16(words0, shift_mul0);
    __m128i shifted1 = _mm_mullo_epi16(words1, shift_mul1);
    // Grab the high byte of the results and pack back into bytes
    __m128i shifted_bytes = _mm_packus_epi16(_mm_srli_epi16(shifted0, 8), _mm_srli_epi16(shifted1, 8));
    // mask by field width, again using a PSHUFB LUT
    __m128i width_mask_lut = _mm_setr_epi8(0,1,3,7, 15,31,63,127, -1,-1,-1,-1, -1,-1,-1,-1);
    __m128i width_mask = _mm_shuffle_epi8(width_mask_lut, field_widths);
    __m128i result = _mm_and_si128(shifted_bytes, width_mask);
    return result;
}
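
// ---------------------------------------------------------------------------
// Usage sketch (added; not part of the original gist). It packs 16 MSB-first
// bit fields with a trivial scalar writer, decodes them in one call to
// multigetbits8b, and checks the results. The helper name put_bits_msb and the
// MULTIGETBITS_EXAMPLE guard are made up for this example.
#ifdef MULTIGETBITS_EXAMPLE

#include <stdio.h>
#include <string.h>

// Scalar MSB-first bit writer: appends the "width" low bits of "value",
// most significant bit first.
static void put_bits_msb(uint8_t *buf, uint32_t *bitpos, uint32_t value, uint32_t width)
{
    for (uint32_t i = 0; i < width; i++) {
        uint32_t bit = (value >> (width - 1 - i)) & 1;
        buf[*bitpos >> 3] |= (uint8_t)(bit << (7 - (*bitpos & 7)));
        (*bitpos)++;
    }
}

int main(void)
{
    // 16 field widths in [0,8] and matching test values
    static const uint8_t widths[16] = { 3, 5, 0, 7, 8, 1, 4, 6, 2, 8, 5, 3, 0, 7, 4, 1 };
    uint8_t values[16];
    uint8_t buf[64] = { 0 }; // padded so the unaligned 128-bit loads stay in bounds

    uint32_t wpos = 0;
    for (int i = 0; i < 16; i++) {
        uint32_t mask = (1u << widths[i]) - 1;
        values[i] = (uint8_t)((uint32_t)(i * 37 + 11) & mask);
        put_bits_msb(buf, &wpos, values[i], widths[i]);
    }

    uint32_t rpos = 0;
    __m128i field_widths = _mm_loadu_si128((const __m128i *)widths);
    __m128i decoded = multigetbits8b(buf, &rpos, field_widths);

    uint8_t out[16];
    _mm_storeu_si128((__m128i *)out, decoded);

    int ok = (rpos == wpos) && (memcmp(out, values, 16) == 0);
    printf("%s\n", ok ? "match" : "MISMATCH");
    return ok ? 0 : 1;
}

#endif // MULTIGETBITS_EXAMPLE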