Created
October 6, 2014 11:05
-
-
Save Rod-Persky/5019f95630e0fede6629 to your computer and use it in GitHub Desktop.
Pure C vector reduce
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int step_8 = size / 8; int step_8_r = size % 8; | |
int step_4 = step_8_r / 4; int step_4_r = step_8_r % 4; | |
int step_2 = step_4_r / 2; int step_2_r = step_4_r % 2; | |
double value = inital_value; | |
for (int step_n = 0; step_n < step_8; step_n++){ | |
int offset = 8 * step_n; | |
__m256d a = _mm256_set_pd(values[offset], values[offset + 1], values[offset + 2], values[offset + 3]); | |
__m256d b = _mm256_set_pd(values[offset + 4], values[offset + 5], values[offset + 6], values[offset + 7]); | |
__m256d sum = _mm256_hadd_pd(a, b); | |
__m128d sum_high = _mm256_extractf128_pd(sum, 1); | |
__m128d result = _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum)); | |
value += result.m128d_f64[0] + result.m128d_f64[1]; | |
} | |
for (int step_n = 0; step_n < step_4; step_n++){ | |
int offset = 8 * step_8 + 4 * step_n; | |
__m128d a = _mm_set_pd(values[offset], values[offset + 1]); | |
__m128d b = _mm_set_pd(values[offset + 2], values[offset + 3]); | |
__m128d sum = _mm_hadd_pd(a, b); | |
value += sum.m128d_f64[0] + sum.m128d_f64[1]; | |
} | |
if (step_2 != 0) { | |
int offset = 8 * step_8 + 4 * step_4; | |
value += values[offset] + values[offset + 1]; | |
} | |
if (step_2_r != 0) { | |
value += values[8 * step_8 + 4 * step_4 + 2 * step_2]; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The step_8 stage requires AVX;
the step_4 stage requires SSE3.