Last active
January 17, 2018 23:30
-
-
Save timshen91/0f321fe2c5cfb04015917c0529052158 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Micro-benchmark comparing three ways of summing a uint8_t buffer. The function
under test is selected at compile time with -DTEST_WITH_FUNC=<name>. To build
and time every variant with both compilers, run:

clang++ --version
clang++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2 && time ./a.out
clang++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2 && time ./a.out
clang++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2 && time ./a.out
g++ --version
g++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2 && time ./a.out
g++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2 && time ./a.out
g++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2 && time ./a.out

Output on x86_64-linux-gnu, SSE4.2:
+ clang++ --version | |
clang version 3.8.1-24 (tags/RELEASE_381/final) | |
Target: x86_64-pc-linux-gnu | |
Thread model: posix | |
InstalledDir: /usr/bin | |
+ clang++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2 | |
+ ./a.out | |
3276800000 | |
real 0m2.192s | |
user 0m2.188s | |
sys 0m0.000s | |
+ clang++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2 | |
+ ./a.out | |
3276800000 | |
real 0m0.125s | |
user 0m0.124s | |
sys 0m0.000s | |
+ clang++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2 | |
+ ./a.out | |
3276800000 | |
real 0m0.183s | |
user 0m0.180s | |
sys 0m0.000s | |
+ g++ --version | |
g++ (Debian 6.3.0-18) 6.3.0 20170516 | |
Copyright (C) 2016 Free Software Foundation, Inc. | |
This is free software; see the source for copying conditions. There is NO | |
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
+ g++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2 | |
+ ./a.out | |
3276800000 | |
real 0m1.583s | |
user 0m1.580s | |
sys 0m0.000s | |
+ g++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2 | |
+ ./a.out | |
3276800000 | |
real 0m0.196s | |
user 0m0.192s | |
sys 0m0.004s | |
+ g++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2 | |
+ ./a.out | |
3276800000 | |
real 0m0.250s | |
user 0m0.248s | |
sys 0m0.000s | |
*/ | |
#include <cassert> | |
#include <cstdint> | |
#include <emmintrin.h> | |
#include <vector> | |
#include <iostream> | |
// 128-bit vector of two signed 64-bit lanes (GCC/Clang vector extension).
// The byte width is spelled as lane-count * lane-size to make the layout explicit.
using V64 = int64_t __attribute__((vector_size(2 * sizeof(int64_t))));
// 128-bit vector of eight signed 16-bit lanes (GCC/Clang vector extension).
using V16 = int16_t __attribute__((vector_size(8 * sizeof(int16_t))));
// Sums the `n` bytes starting at `buffer` with a plain scalar loop, leaving any
// vectorization to the compiler. Serves as the baseline for the hand-written
// SIMD variants.
//
// buffer: pointer to at least `n` readable bytes (not dereferenced when n == 0).
// n:      number of bytes to add up.
// Returns the sum of all bytes, each treated as an unsigned value in [0, 255].
inline int64_t SumAutoVec(uint8_t *buffer, size_t n) {
  int64_t ret = 0;
  // The index has the same type as `n`: an `int` index would be compared
  // against a size_t bound and overflow (undefined behaviour) for n > INT_MAX.
  for (size_t i = 0; i < n; ++i) {
    ret += buffer[i];
  }
  return ret;
}
// Sums the `n` bytes starting at `buffer` using the SSE2 "sum of absolute
// differences" instruction: the SAD of 16 bytes against zero yields the sum of
// the low 8 bytes and the sum of the high 8 bytes as two 64-bit lanes.
//
// buffer: must be 16-byte aligned (aligned loads are used); checked by assert.
// n:      number of bytes; must be a multiple of 16; checked by assert.
// Returns the sum of all bytes, each treated as an unsigned value in [0, 255].
inline int64_t SumSad(uint8_t *buffer, size_t n) {
  assert(n % 16 == 0);
  assert(reinterpret_cast<uintptr_t>(buffer) % 16 == 0);
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  // size_t index: an `int` index compared against a size_t bound would overflow
  // (undefined behaviour) once n exceeds INT_MAX.
  for (size_t i = 0; i < n; i += 16) {
    const __m128i data =
        _mm_load_si128(reinterpret_cast<const __m128i *>(buffer + i));
    acc = _mm_add_epi64(_mm_sad_epu8(data, zero), acc);
  }
  // Spill the two 64-bit partial sums through memory instead of relying on a
  // compiler-specific vector cast, then add them.
  int64_t lanes[2];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(lanes), acc);
  return lanes[0] + lanes[1];
}
// Sums the `n` bytes starting at `buffer` by zero-extending each byte to 16 bits
// and accumulating element-wise in eight 16-bit SIMD lanes.
//
// A 16-bit lane cannot hold the sum of an arbitrarily long buffer, so the lanes
// are drained into a 64-bit scalar total at regular intervals; without that the
// result silently wraps once a lane exceeds INT16_MAX.
//
// buffer: must be 16-byte aligned (aligned loads are used); checked by assert.
// n:      number of bytes; must be a multiple of 16; checked by assert.
// Returns the sum of all bytes, each treated as an unsigned value in [0, 255].
inline int64_t SumElementWiseInt16Acc(uint8_t *buffer, size_t n) {
  assert(n % 16 == 0);
  assert(reinterpret_cast<uintptr_t>(buffer) % 16 == 0);
  // Every 16-byte step adds at most 2 * 255 = 510 to each lane, so a signed
  // 16-bit lane can safely absorb 64 steps (64 * 510 = 32640 <= 32767).
  constexpr size_t kBytesPerFlush = 64 * 16;
  const __m128i zero = _mm_setzero_si128();
  int64_t ret = 0;
  size_t i = 0;
  while (i < n) {
    const size_t remaining = n - i;
    const size_t end =
        i + (remaining < kBytesPerFlush ? remaining : kBytesPerFlush);
    __m128i acc = zero;
    for (; i < end; i += 16) {
      const __m128i data =
          _mm_load_si128(reinterpret_cast<const __m128i *>(buffer + i));
      // Interleaving with zero widens the low/high 8 bytes to 16-bit lanes.
      const __m128i lo = _mm_unpacklo_epi8(data, zero);
      const __m128i hi = _mm_unpackhi_epi8(data, zero);
      acc = _mm_add_epi16(_mm_add_epi16(lo, hi), acc);
    }
    // Drain the eight 16-bit partial sums into the wide total before they can
    // overflow.
    alignas(16) int16_t lanes[8];
    _mm_store_si128(reinterpret_cast<__m128i *>(lanes), acc);
    for (const int16_t lane : lanes) {
      ret += lane;
    }
  }
  return ret;
}
int main() { | |
int count = 100000; | |
std::vector<uint8_t> a(32768, 1); | |
int64_t acc = 0; | |
while (count--) { | |
acc += TEST_WITH_FUNC(a.data(), a.size()); | |
} | |
std::cout << acc << "\n"; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment