Created
June 3, 2026 21:01
-
-
Save Hermann-SW/c4e40e823d274d03094d5e6d5071017d to your computer and use it in GitHub Desktop.
Demonstrate maximal "double sqrt" GFLOPS performance for Zen4 AMD 16C/32T 7950X CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX512_VNNI.vsqrtpd | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f | |
| Output: | |
| hermann@7950x:~$ ./$f | |
| Starting hardware-bound benchmark using 16 threads... | |
| ... [AVX512F] vsqrtpd(mm512d,mm512d) completed | |
| https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png | |
| ------------------------------------------- | |
| Execution Time: 4.75232 seconds | |
| Counter: 25,600,000,000 | |
| Total Compute: 204.8 double sqrt GFLOPS (counter * 8) | |
| Performance: 43.0947 GFLOPS | |
| ------------------------------------------- | |
| hermann@7950x:~$ | |
| */ | |
| #include <omp.h> | |
| #include <inttypes.h> | |
| #include <iostream> | |
| #include <chrono> // NOLINT [build/c++11] | |
| int main(int, char**) { | |
| const int iterations = 200000000; // 2*10^8 | |
| std::cout << "Starting hardware-bound benchmark using " | |
| << omp_get_max_threads() << " threads...\n"; | |
| auto start_time = std::chrono::high_resolution_clock::now(); | |
| #pragma omp parallel | |
| { | |
| for (int i = 0; i < iterations; ++i) { | |
| asm __volatile__ ("vsqrtpd %%zmm0, %%zmm0" : : : "zmm0"); | |
| asm __volatile__ ("vsqrtpd %%zmm1, %%zmm1" : : : "zmm1"); | |
| asm __volatile__ ("vsqrtpd %%zmm2, %%zmm2" : : : "zmm2"); | |
| asm __volatile__ ("vsqrtpd %%zmm3, %%zmm3" : : : "zmm3"); | |
| asm __volatile__ ("vsqrtpd %%zmm4, %%zmm4" : : : "zmm4"); | |
| asm __volatile__ ("vsqrtpd %%zmm5, %%zmm5" : : : "zmm5"); | |
| asm __volatile__ ("vsqrtpd %%zmm6, %%zmm6" : : : "zmm6"); | |
| asm __volatile__ ("vsqrtpd %%zmm7, %%zmm7" : : : "zmm7"); | |
| } | |
| } | |
| std::cout << "... [AVX512F] vsqrtpd(mm512d,mm512d) completed\n"; | |
| std::cout << | |
| "https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png\n"; | |
| std::chrono::duration<double> duration = | |
| std::chrono::high_resolution_clock::now() - start_time; | |
| int64_t ops_per_loop = 8 * 8 * omp_get_max_threads(); | |
| int64_t total_ops = ops_per_loop * iterations; | |
| int64_t giga_cnt = total_ops / 8; | |
| double giga_ops = total_ops / 1e9; | |
| double performance = giga_ops / duration.count(); | |
| std::cout << "-------------------------------------------\n"; | |
| std::cout << "Execution Time: " << duration.count() << " seconds\n"; | |
| std::cout.imbue(std::locale("")); | |
| std::cout << "Counter: " << giga_cnt << "\n"; | |
| std::cout << "Total Compute: " << giga_ops | |
| << " double sqrt GFLOPS (counter * 8)\n"; | |
| std::cout << "Performance: " << performance << " GFLOPS\n"; | |
| std::cout << "-------------------------------------------\n"; | |
| return 0; | |
| } |
Author
Author
Real world OpenMP program ml100K.cpp computing TSP tour length of 100,000 cities mona-lisa100K.tsp 500,000× (on each core) does 16*50*10^9 double sqrt and more, and reaches 16*50*10^9/20.1636s = 39.675 Gsqrt/s, very close to peak 43 double Gsqrt/s (>92%)!
C++ nested loops:
for (int i = 0; i < repeats; ++i) {
__m512i acc = _mm512_setzero_si512();
for (int i = 0; i < 2*N; i+=32) {
__m512i a = _mm512_load_si512((const __m512i*)(xy_even+i));
__m512i b = _mm512_load_si512((const __m512i*)(xy_odd+i));
__m512i dxy = _mm512_sub_epi16(a, b);
__m512i aux = DOT_PRODUCT_ACC(_mm512_setzero_si512(), dxy, dxy);
__m512d low_doubles = _mm512_cvtepi32_pd(_mm512_castsi512_si256(aux));
__m256i high_lanes = _mm512_extracti64x4_epi64(aux, 1);
__m512d high_doubles = _mm512_cvtepi32_pd(high_lanes);
__m512d sqrt_low = _mm512_sqrt_pd(low_doubles);
__m512d sqrt_high = _mm512_sqrt_pd(high_doubles);
__m512d res_low = _mm512_add_pd(sqrt_low, half_pd);
__m512d res_high = _mm512_add_pd(sqrt_high, half_pd);
__m256i int_low = _mm512_mask_cvtt_roundpd_epi32
(_mm256_undefined_si256(), 0xFF, res_low,
(_MM_FROUND_NO_EXC));
__m256i int_high = _mm512_mask_cvtt_roundpd_epi32
(_mm256_undefined_si256(), 0xFF, res_high,
(_MM_FROUND_NO_EXC));
__m512i euc_2d = _mm512_inserti64x4(_mm512_castsi256_si512(int_low),
int_high, 1);
acc = _mm512_add_epi32(acc, euc_2d);
}
sum += _mm512_reduce_add_epi32(acc);
}
same nested loop in "objdump -d":
1410: 31 c0 xor %eax,%eax <-------------------------------------+
1412: c5 e9 ef d2 vpxor %xmm2,%xmm2,%xmm2 |
1416: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) |
141d: 00 00 00 |
1420: 62 f1 7d 48 6f cc vmovdqa32 %zmm4,%zmm1 <----------------------------------+ |
1426: 62 f1 fd 48 6f 2c 01 vmovdqa64 (%rcx,%rax,1),%zmm5 | |
142d: 62 f1 55 48 f9 04 02 vpsubw (%rdx,%rax,1),%zmm5,%zmm0 | |
1434: 48 83 c0 40 add $0x40,%rax | |
1438: 62 f2 7d 48 52 c8 vpdpwssd %zmm0,%zmm0,%zmm1 16× 32bit "dx^2+dy^2" | |
143e: 62 f1 7e 48 e6 c1 vcvtdq2pd %ymm1,%zmm0 | |
1444: 62 f3 fd 48 3b c9 01 vextracti64x4 $0x1,%zmm1,%ymm1 | |
144b: 62 f1 7e 48 e6 c9 vcvtdq2pd %ymm1,%zmm1 | |
1451: 62 f1 fd 48 51 c0 vsqrtpd %zmm0,%zmm0 <==== 8× 64bit sqrt | |
1457: 62 f1 fd 48 58 c3 vaddpd %zmm3,%zmm0,%zmm0 | |
145d: 62 f1 fd 48 51 c9 vsqrtpd %zmm1,%zmm1 <==== 8× 64bit sqrt | |
1463: 62 f1 f5 48 58 cb vaddpd %zmm3,%zmm1,%zmm1 | |
1469: 62 f1 fd 18 e6 c0 vcvttpd2dq {sae},%zmm0,%ymm0 | |
146f: 62 f1 fd 18 e6 c9 vcvttpd2dq {sae},%zmm1,%ymm1 | |
1475: 62 f3 fd 48 3a c1 01 vinserti64x4 $0x1,%ymm1,%zmm0,%zmm0 | |
147c: 62 f1 6d 48 fe c0 vpaddd %zmm0,%zmm2,%zmm0 | |
1482: 62 f1 fd 48 6f d0 vmovdqa64 %zmm0,%zmm2 | |
1488: 48 3d 80 1a 06 00 cmp $0x61a80,%rax | |
148e: 75 90 jne 1420 <_Z6bench2v+0x60> -----------------------------------+ |
1490: 62 f3 fd 48 3b c1 01 vextracti64x4 $0x1,%zmm0,%ymm1 |
1497: c5 f5 fe c0 vpaddd %ymm0,%ymm1,%ymm0 |
149b: 62 f3 fd 28 39 c1 01 vextracti64x2 $0x1,%ymm0,%xmm1 |
14a2: c5 f1 fe c0 vpaddd %xmm0,%xmm1,%xmm0 |
14a6: c5 f9 70 c8 4e vpshufd $0x4e,%xmm0,%xmm1 |
14ab: c5 f1 fe c8 vpaddd %xmm0,%xmm1,%xmm1 |
14af: c5 f9 6f c1 vmovdqa %xmm1,%xmm0 |
14b3: c5 f9 70 c9 55 vpshufd $0x55,%xmm1,%xmm1 |
14b8: c5 f9 fe c1 vpaddd %xmm1,%xmm0,%xmm0 |
14bc: c5 f9 7e c0 vmovd %xmm0,%eax |
14c0: 48 98 cltq |
14c2: 49 01 c4 add %rax,%r12 |
14c5: ff ce dec %esi |
14c7: 0f 85 43 ff ff ff jne 1410 <_Z6bench2v+0x50> --------------------------------------+
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Main loop in C++ with inline assembly (allowing control over the 512bit register names):
Same loop with objdump: