Created
December 11, 2016 12:29
-
-
Save primenumber/d9fac500ff88041be7c1cad6f541bda4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <x86intrin.h> | |
#include <boost/timer/timer.hpp> | |
/// Delta swap applied independently to each 64-bit lane of `a`.
///
/// For every bit position selected by `mask`, the bit at that position is
/// exchanged with the bit located `delta` positions higher in the same lane.
///
/// @param a     value whose bits are permuted
/// @param mask  selects the lower position of each bit pair to exchange
/// @param delta distance, in bits, between the two positions of a pair
/// @return `a` with the selected bit pairs exchanged
inline __m128i mm_delta_swap_epi64(__m128i a, __m128i mask, int delta) {
  // Marks the masked positions whose bit differs from its partner `delta` bits up.
  const __m128i partner = _mm_srli_epi64(a, delta);
  const __m128i differing = _mm_and_si128(mask, _mm_xor_si128(partner, a));
  // Toggling both ends of every differing pair is the same as swapping them.
  const __m128i toggles = _mm_xor_si128(differing, _mm_slli_epi64(differing, delta));
  return _mm_xor_si128(a, toggles);
}
inline __m128i mm_unpacklo_epb_unpack_dswap(__m128i a, __m128i b) { | |
__m128i unpack8 = _mm_unpacklo_epi8(a, b); | |
__m128i unpack4 = mm_delta_swap_epi64(unpack8, _mm_set1_epi16(0x00F0), 4); | |
__m128i unpack2 = mm_delta_swap_epi64(unpack4, _mm_set1_epi8(0x0C), 2); | |
return mm_delta_swap_epi64(unpack2, _mm_set1_epi8(0x22), 1); | |
} | |
inline __m128i mm_unpackhi_epb_unpack_dswap(__m128i a, __m128i b) { | |
__m128i unpack8 = _mm_unpackhi_epi8(a, b); | |
__m128i unpack4 = mm_delta_swap_epi64(unpack8, _mm_set1_epi16(0x00F0), 4); | |
__m128i unpack2 = mm_delta_swap_epi64(unpack4, _mm_set1_epi8(0x0C), 2); | |
return mm_delta_swap_epi64(unpack2, _mm_set1_epi8(0x22), 1); | |
} | |
inline __m128i mm_unpacklo_epb_pdep(__m128i a, __m128i b) { | |
uint64_t alo = _mm_cvtsi128_si64(a); | |
uint64_t blo = _mm_cvtsi128_si64(b); | |
return _mm_set_epi64x( | |
_pdep_u64(alo >> 32, UINT64_C(0x5555555555555555)) | _pdep_u64(blo >> 32, UINT64_C(0xAAAAAAAAAAAAAAAA)), | |
_pdep_u64(alo, UINT64_C(0x5555555555555555)) | _pdep_u64(blo, UINT64_C(0xAAAAAAAAAAAAAAAA))); | |
} | |
inline __m128i mm_unpackhi_epb_pdep(__m128i a, __m128i b) { | |
uint64_t ahi = _mm_extract_epi64(a, 1); | |
uint64_t bhi = _mm_extract_epi64(b, 1); | |
return _mm_set_epi64x( | |
_pdep_u64(ahi >> 32, UINT64_C(0x5555555555555555)) | _pdep_u64(bhi >> 32, UINT64_C(0xAAAAAAAAAAAAAAAA)), | |
_pdep_u64(ahi, UINT64_C(0x5555555555555555)) | _pdep_u64(bhi, UINT64_C(0xAAAAAAAAAAAAAAAA))); | |
} | |
inline __m128i mm_unpacklo_epb_pclmulqdq(__m128i a, __m128i b) { | |
return _mm_or_si128(_mm_clmulepi64_si128(a, a, 0x00), _mm_slli_epi32(_mm_clmulepi64_si128(b, b, 0x00), 1)); | |
} | |
inline __m128i mm_unpackhi_epb_pclmulqdq(__m128i a, __m128i b) { | |
return _mm_or_si128(_mm_clmulepi64_si128(a, a, 0x11), _mm_slli_epi32(_mm_clmulepi64_si128(b, b, 0x11), 1)); | |
} | |
/// Advances four independent 32-bit xorshift generators by one step.
///
/// Each 32-bit lane is updated with the shift triple (13, 17, 5):
/// `x ^= x << 13; x ^= x >> 17; x ^= x << 5`.
///
/// @param seeds current state of the four generators, one per 32-bit lane
/// @return the next state of the four generators
inline __m128i mm_xorshift32_epi32(__m128i seeds) {
  const __m128i after_left13 = _mm_xor_si128(seeds, _mm_slli_epi32(seeds, 13));
  const __m128i after_right17 =
      _mm_xor_si128(after_left13, _mm_srli_epi32(after_left13, 17));
  return _mm_xor_si128(after_right17, _mm_slli_epi32(after_right17, 5));
}
// Shared implementation of the benchmark generators below.
//
// Expands to `void bench_unpack<half>_epb_<name>()`, which:
//   1. prints "Bench <name>",
//   2. calls `mm_unpack<half>_epb_<name>` 2^30 times on inputs that are
//      advanced by `mm_xorshift32_epi32` every iteration, XOR-accumulating
//      the results,
//   3. prints both 64-bit halves of the accumulated value (presumably so the
//      optimizer cannot discard the loop), and
//   4. prints the wall-clock time of the loop with millisecond precision.
//
// `half` is `lo` or `hi`; `name` is the implementation suffix
// (e.g. `pdep`). Both are only used for token pasting / stringizing.
#define DEF_BENCH_UNPACK_EPB_IMPL(half, name) \
void bench_unpack##half##_epb_##name() { \
  std::cout << "Bench "#name << std::endl; \
  __m128i input1 = _mm_setr_epi32(1, 2, 3, 4); \
  __m128i input2 = _mm_setr_epi32(5, 6, 7, 8); \
  __m128i result = _mm_setzero_si128(); \
  boost::timer::cpu_timer timer; \
  for (int i = 0; i < 1 << 30; ++i) { \
    result = _mm_xor_si128(result, mm_unpack##half##_epb_##name(input1, input2)); \
    input1 = mm_xorshift32_epi32(input1); \
    input2 = mm_xorshift32_epi32(input2); \
  } \
  /* Freeze the timer before printing so console I/O and the stream flush */ \
  /* are not charged to the measured loop. */ \
  timer.stop(); \
  std::cout << _mm_extract_epi64(result, 0) << ' ' << _mm_extract_epi64(result, 1) << std::endl; \
  std::cout << timer.format(3, "elapsed: %ws") << std::endl; \
}

// Defines `bench_unpacklo_epb_<name>()`, benchmarking `mm_unpacklo_epb_<name>`.
#define DEF_BENCH_UNPACKLO_EPB(name) DEF_BENCH_UNPACK_EPB_IMPL(lo, name)

// Defines `bench_unpackhi_epb_<name>()`, benchmarking `mm_unpackhi_epb_<name>`.
#define DEF_BENCH_UNPACKHI_EPB(name) DEF_BENCH_UNPACK_EPB_IMPL(hi, name)
// Instantiate the lo/hi benchmark pair for each implementation strategy.

// Byte unpack followed by delta swaps.
DEF_BENCH_UNPACKLO_EPB(unpack_dswap);
DEF_BENCH_UNPACKHI_EPB(unpack_dswap);

// BMI2 parallel bit deposit.
DEF_BENCH_UNPACKLO_EPB(pdep);
DEF_BENCH_UNPACKHI_EPB(pdep);

// Carry-less multiplication.
DEF_BENCH_UNPACKLO_EPB(pclmulqdq);
DEF_BENCH_UNPACKHI_EPB(pclmulqdq);
/// Runs every benchmark: first the three "unpack low" implementations, then
/// the three "unpack high" implementations, each group under its own heading.
int main() {
  using BenchFn = void (*)();
  struct BenchGroup {
    const char* heading;
    BenchFn runs[3];
  };

  const BenchGroup groups[] = {
      {"Unpack Lo",
       {bench_unpacklo_epb_unpack_dswap, bench_unpacklo_epb_pdep, bench_unpacklo_epb_pclmulqdq}},
      {"Unpack Hi",
       {bench_unpackhi_epb_unpack_dswap, bench_unpackhi_epb_pdep, bench_unpackhi_epb_pclmulqdq}},
  };

  for (const BenchGroup& group : groups) {
    std::cout << group.heading << std::endl;
    for (BenchFn run : group.runs) {
      run();
    }
  }
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment