Created
June 26, 2026 20:23
-
-
Save Hermann-SW/fdff0865b863b42e246427198bb3b4d6 to your computer and use it in GitHub Desktop.
Demonstrate maximal "double sqrt" GFLOPS performance for AVX2 CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX2.vsqrtpd | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e cycles,instructions,task-clock ./$f | |
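| Output (NOTE: the sample below appears to come from the AVX-512 variant of this benchmark; this AVX2 build prints "[AVX256] vsqrtpd(mm256d,mm256d)" and "(counter * 4)"): | |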
| hermann@7950x:~$ ./$f | |
| Starting hardware-bound benchmark using 16 threads... | |
| ... [AVX512F] vsqrtpd(mm512d,mm512d) completed | |
| https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png | |
| ------------------------------------------- | |
| Execution Time: 4.75232 seconds | |
| Counter: 25,600,000,000 | |
| Total Compute: 204.8 double sqrt GFLOPS (counter * 8) | |
| Performance: 43.0947 GFLOPS | |
| ------------------------------------------- | |
| hermann@7950x:~$ | |
| */ | |
| #include <omp.h> | |
| #include <inttypes.h> | |
| #include <iostream> | |
| #include <chrono> // NOLINT [build/c++11] | |
/**
 * Measures packed double-precision square-root throughput using AVX2.
 *
 * Every OpenMP thread runs `iterations` rounds of eight `vsqrtpd ymm, ymm`
 * instructions (one per register ymm0..ymm7, four doubles each). Afterwards
 * the elapsed wall time, the number of executed instructions and the derived
 * "double sqrt" rate are printed to stdout.
 *
 * @return 0 always.
 */
int main(int, char**) {
  const int iterations = 200000000;  // 2*10^8
  const int sqrt_instr_per_iteration = 8;  // one vsqrtpd per ymm0..ymm7
  const int doubles_per_instr = 4;         // 256-bit ymm / 64-bit double

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // Number of threads that really executed the loop. omp_get_max_threads()
  // is only an upper bound on the team size, so count the team members with
  // a reduction instead of assuming the maximum was granted.
  int threads_used = 0;

  // steady_clock is monotonic; high_resolution_clock may alias the
  // adjustable system clock and is therefore unsuitable for intervals.
  const auto start_time = std::chrono::steady_clock::now();
  #pragma omp parallel reduction(+ : threads_used)
  {
    ++threads_used;  // each thread contributes exactly one

    for (int i = 0; i < iterations; ++i) {
      // The registers are deliberately not loaded: the asm has no inputs or
      // outputs and only declares ymm0..ymm7 as clobbered. `volatile` keeps
      // the compiler from deleting or hoisting the otherwise "dead" block.
      asm __volatile__ (
        "vsqrtpd %%ymm0, %%ymm0 \n\t"
        "vsqrtpd %%ymm1, %%ymm1 \n\t"
        "vsqrtpd %%ymm2, %%ymm2 \n\t"
        "vsqrtpd %%ymm3, %%ymm3 \n\t"
        "vsqrtpd %%ymm4, %%ymm4 \n\t"
        "vsqrtpd %%ymm5, %%ymm5 \n\t"
        "vsqrtpd %%ymm6, %%ymm6 \n\t"
        "vsqrtpd %%ymm7, %%ymm7 \n\t"
        :  // No outputs
        :  // No inputs
        : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
      );
    }
  }
  // Stop the clock before any console I/O so printing is not measured.
  const std::chrono::duration<double> duration =
      std::chrono::steady_clock::now() - start_time;

  std::cout << "... [AVX256] vsqrtpd(mm256d,mm256d) completed\n";

  // Widen to 64 bit before multiplying: the totals exceed the int range.
  const int64_t ops_per_loop = static_cast<int64_t>(doubles_per_instr) *
                               sqrt_instr_per_iteration * threads_used;
  const int64_t total_ops = ops_per_loop * iterations;
  // "Counter" is the number of vsqrtpd instructions executed by all threads.
  const int64_t instruction_count = total_ops / doubles_per_instr;
  const double giga_ops = total_ops / 1e9;
  const double performance = giga_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";
  // Switch to the user's environment locale for the remaining numbers.
  std::cout.imbue(std::locale(""));
  std::cout << "Counter: " << instruction_count << "\n";
  std::cout << "Total Compute: " << giga_ops
            << " double sqrt GFLOPS (counter * 4)\n";
  std::cout << "Performance: " << performance << " GFLOPS\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
Author
Author
On an 8-socket server (E7-8890v4 CPUs, 192 cores in total) with SMT disabled:
hermann@x3950-X6:~$ nproc
192
hermann@x3950-X6:~$ ./$f
Starting hardware-bound benchmark using 192 threads...
... [AVX256] vsqrtpd(mm256d,mm256d) completed
-------------------------------------------
Execution Time: 12.0587 seconds
Counter: 307,200,000,000
Total Compute: 1,228.8 double sqrt GFLOPS (counter * 4)
Performance: 101.902 GFLOPS
-------------------------------------------
hermann@x3950-X6:~$
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On 2 socket 22C E5-2696v4 CPUs server with SMT disabled: