multigetbits
#include <stdint.h>
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3 (PSHUFB)

static inline __m128i prefix_sum_u8(__m128i x)
{
#if 1
    // alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
    // for us)
    x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
    x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
    x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
    x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
#else
    // x[0], x[1], x[2], x[3], ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 1));
    // x[0], sum(x[0:2]), sum(x[1:3]), sum(x[2:4]), ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 2));
    // x[0], sum(x[0:2]), sum(x[0:3]), sum(x[0:4]), sum(x[1:5]), sum(x[2:6]), ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 4));
    // longest group now sums over 8 elems
    x = _mm_add_epi8(x, _mm_slli_si128(x, 8));
#endif
    // and now we're done
    return x;
}
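
// Quick illustration (added; not part of the original gist): for input bytes
//   { 3, 5, 0, 7, 2, ... }
// prefix_sum_u8 returns the inclusive prefix sums
//   { 3, 8, 8, 15, 17, ... }
// i.e. lane i ends up holding (x[0] + ... + x[i]) mod 256, which is what
// multigetbits8b below uses to turn per-lane field widths into end bit positions.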
static inline __m128i big_endian_load_shift128(const uint8_t *in_ptr, uint32_t bit_basepos)
{
    // Grab 128 source bits starting "bit_basepos" bits into the bit stream
    __m128i src_byte0 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 0));
    __m128i src_byte1 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 1));
    // We need to consume the first (bit_basepos & 7) bits with a big-endian 128-bit
    // funnel shift, which we don't have ready at hand, so we need to get creative;
    // specifically, use 16-bit shifts by a single distance for all lanes (which we have)
    // and make sure to not grab any bits that crossed byte boundaries (which would be
    // taken from the wrong byte due to the endianness difference)
    uint32_t basepos7 = bit_basepos & 7;
    __m128i basepos_shiftamt = _mm_cvtsi32_si128(basepos7);
    // Combine to big-endian 16-bit words and shift those since we don't have 8-bit shifts
    // at hand
    __m128i merged0 = _mm_unpacklo_epi8(src_byte1, src_byte0);
    __m128i merged1 = _mm_unpackhi_epi8(src_byte1, src_byte0);
    __m128i shifted0 = _mm_sll_epi16(merged0, basepos_shiftamt);
    __m128i shifted1 = _mm_sll_epi16(merged1, basepos_shiftamt);
    __m128i reduced0 = _mm_srli_epi16(shifted0, 8);
    __m128i reduced1 = _mm_srli_epi16(shifted1, 8);
    __m128i shifted_src_bytes = _mm_packus_epi16(reduced0, reduced1);
    return shifted_src_bytes;
}
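
// Illustration (added; not part of the original gist): with bit_basepos = 13,
// this reads 16-byte vectors starting at in_ptr + 1 and shifts the whole thing
// left by 5 bits, so byte i of the result is
//   (in[1 + i] << 5 | in[2 + i] >> 3) & 0xff
// i.e. the 128 bits of the MSB-first stream starting at absolute bit 13,
// repacked into 16 whole bytes.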
// individual field_widths in [0,8]
// MSB-first bit packing convention, SSSE3+
//
// Once more, with feeling
// trying to figure out a niftier way to do this that'll also allow me to do
// wider variants that don't suck
static inline __m128i multigetbits8b(const uint8_t *in_ptr, uint32_t *pbit_basepos, __m128i field_widths)
{
    uint32_t bit_basepos = *pbit_basepos;
    // Prefix-sum the field widths and advance bit position pointer
    __m128i end_bit_index = prefix_sum_u8(field_widths);
    uint32_t total_width = (uint32_t)_mm_extract_epi16(end_bit_index, 7) >> 8; // no PEXTRB before SSE4.1, and this is the only place where SSE4.1+ helps
    *pbit_basepos = bit_basepos + total_width;
    // Doing this shift is a bit of a production, but it simplifies the rest.
    __m128i shifted_src_bytes = big_endian_load_shift128(in_ptr, bit_basepos);
    __m128i end_byte_index = _mm_and_si128(_mm_srli_epi16(end_bit_index, 3), _mm_set1_epi8(0x1f)); // no "shift bytes", sigh.
    // Grab first/second bytes for every lane
    __m128i byte1 = _mm_shuffle_epi8(shifted_src_bytes, end_byte_index);
    __m128i byte0 = _mm_shuffle_epi8(shifted_src_bytes, _mm_sub_epi8(end_byte_index, _mm_set1_epi8(1)));
    // Assemble big-endian words (byte0 << 8) | byte1
    __m128i words0 = _mm_unpacklo_epi8(byte1, byte0);
    __m128i words1 = _mm_unpackhi_epi8(byte1, byte0);
    // Now do a left shift by (end_bit_index & 7) via a multiply by 1 << (end_bit_index & 7),
    // putting the end of the bit field at the boundary between the low and high byte
    // in every word.
    __m128i end_bit_index7 = _mm_and_si128(end_bit_index, _mm_set1_epi8(7));
    __m128i left_shift_lut = _mm_setr_epi8(1,2,4,8, 16,32,64,-128, 1,2,4,8, 16,32,64,-128);
    __m128i shiftm = _mm_shuffle_epi8(left_shift_lut, end_bit_index7);
    __m128i shift_mul0 = _mm_unpacklo_epi8(shiftm, _mm_setzero_si128());
    __m128i shift_mul1 = _mm_unpackhi_epi8(shiftm, _mm_setzero_si128());
    __m128i shifted0 = _mm_mullo_epi16(words0, shift_mul0);
    __m128i shifted1 = _mm_mullo_epi16(words1, shift_mul1);
    // Grab the high byte of the results and pack back into bytes
    __m128i shifted_bytes = _mm_packus_epi16(_mm_srli_epi16(shifted0, 8), _mm_srli_epi16(shifted1, 8));
    // mask by field width, again using a PSHUFB LUT
    __m128i width_mask_lut = _mm_setr_epi8(0,1,3,7, 15,31,63,127, -1,-1,-1,-1, -1,-1,-1,-1);
    __m128i width_mask = _mm_shuffle_epi8(width_mask_lut, field_widths);
    __m128i result = _mm_and_si128(shifted_bytes, width_mask);
    return result;
}
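
// ---------------------------------------------------------------------------
// Usage sketch (added; not part of the original gist). It packs 16 MSB-first
// bit fields with a trivial scalar writer, decodes them in one call to
// multigetbits8b, and checks the results. The helper name put_bits_msb and the
// MULTIGETBITS_EXAMPLE guard are made up for this example.
#ifdef MULTIGETBITS_EXAMPLE

#include <stdio.h>
#include <string.h>

// Scalar MSB-first bit writer: appends the "width" low bits of "value",
// most significant bit first.
static void put_bits_msb(uint8_t *buf, uint32_t *bitpos, uint32_t value, uint32_t width)
{
    for (uint32_t i = 0; i < width; i++) {
        uint32_t bit = (value >> (width - 1 - i)) & 1;
        buf[*bitpos >> 3] |= (uint8_t)(bit << (7 - (*bitpos & 7)));
        (*bitpos)++;
    }
}

int main(void)
{
    // 16 field widths in [0,8] and matching test values
    static const uint8_t widths[16] = { 3, 5, 0, 7, 8, 1, 4, 6, 2, 8, 5, 3, 0, 7, 4, 1 };
    uint8_t values[16];
    uint8_t buf[64] = { 0 }; // padded so the unaligned 128-bit loads stay in bounds

    uint32_t wpos = 0;
    for (int i = 0; i < 16; i++) {
        uint32_t mask = (1u << widths[i]) - 1;
        values[i] = (uint8_t)((uint32_t)(i * 37 + 11) & mask);
        put_bits_msb(buf, &wpos, values[i], widths[i]);
    }

    uint32_t rpos = 0;
    __m128i field_widths = _mm_loadu_si128((const __m128i *)widths);
    __m128i decoded = multigetbits8b(buf, &rpos, field_widths);

    uint8_t out[16];
    _mm_storeu_si128((__m128i *)out, decoded);

    int ok = (rpos == wpos) && (memcmp(out, values, 16) == 0);
    printf("%s\n", ok ? "match" : "MISMATCH");
    return ok ? 0 : 1;
}

#endif // MULTIGETBITS_EXAMPLE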