Created
June 26, 2026 20:23
-
-
Save Hermann-SW/fdff0865b863b42e246427198bb3b4d6 to your computer and use it in GitHub Desktop.
Demonstrate maximal "double sqrt" GFLOPS performance for AVX2 CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX2.vsqrtpd | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e cycles,instructions,task-clock ./$f | |
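| Output (sample captured from the AVX-512 variant; this AVX2 build prints "[AVX256] vsqrtpd(mm256d,mm256d)" and "counter * 4"): | |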
| hermann@7950x:~$ ./$f | |
| Starting hardware-bound benchmark using 16 threads... | |
| ... [AVX512F] vsqrtpd(mm512d,mm512d) completed | |
| https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png | |
| ------------------------------------------- | |
| Execution Time: 4.75232 seconds | |
| Counter: 25,600,000,000 | |
| Total Compute: 204.8 double sqrt GFLOPS (counter * 8) | |
| Performance: 43.0947 GFLOPS | |
| ------------------------------------------- | |
| hermann@7950x:~$ | |
| */ | |
#include <omp.h>
#include <inttypes.h>
#include <chrono>  // NOLINT [build/c++11]
#include <iostream>
#include <locale>
#include <stdexcept>
/**
 * Measures packed double-precision square-root throughput using 256-bit
 * `vsqrtpd` instructions on every OpenMP thread.
 *
 * Each thread of the parallel region executes the full loop of `iterations`
 * passes; one pass issues eight `vsqrtpd` instructions on ymm0..ymm7, each
 * operating on four doubles. The program then prints the elapsed wall time,
 * the number of `vsqrtpd` instructions issued ("Counter"), the total number
 * of square roots in units of 1e9, and the resulting rate per second.
 *
 * Both parameters (argc/argv) are unused.
 *
 * @return 0 always.
 */
int main(int, char**) {
  const int iterations = 200000000;  // 2*10^8

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // Number of threads that really executed the parallel region.
  // omp_get_max_threads() is only an upper bound on the team size, so the
  // operation count below is based on the value observed inside the region.
  int team_size = 1;

  // steady_clock is monotonic; high_resolution_clock may alias the system
  // clock, which can be adjusted while the benchmark is running.
  const auto start_time = std::chrono::steady_clock::now();

  #pragma omp parallel shared(team_size)
  {
    // Exactly one thread has thread number 0, so this write is race-free;
    // the value is only read after the region's closing barrier.
    if (omp_get_thread_num() == 0) {
      team_size = omp_get_num_threads();
    }

    for (int i = 0; i < iterations; ++i) {
      // The asm statement has no inputs, so the registers hold whatever
      // values happen to be in them. Each instruction reads and writes
      // only its own register, so the eight are mutually independent.
      // NOTE(review): register contents may influence timing on some
      // microarchitectures - confirm if exact numbers matter.
      asm __volatile__ (
        "vsqrtpd %%ymm0, %%ymm0 \n\t"
        "vsqrtpd %%ymm1, %%ymm1 \n\t"
        "vsqrtpd %%ymm2, %%ymm2 \n\t"
        "vsqrtpd %%ymm3, %%ymm3 \n\t"
        "vsqrtpd %%ymm4, %%ymm4 \n\t"
        "vsqrtpd %%ymm5, %%ymm5 \n\t"
        "vsqrtpd %%ymm6, %%ymm6 \n\t"
        "vsqrtpd %%ymm7, %%ymm7 \n\t"
        :  // No outputs
        :  // No inputs
        : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
      );
    }
  }

  // Stop the clock before any console output so I/O is not measured.
  const std::chrono::duration<double> duration =
      std::chrono::steady_clock::now() - start_time;

  std::cout << "... [AVX256] vsqrtpd(mm256d,mm256d) completed\n";

  // 4 doubles per ymm register * 8 instructions per pass * threads.
  const int64_t ops_per_loop = static_cast<int64_t>(4) * 8 * team_size;
  const int64_t total_ops = ops_per_loop * iterations;
  const int64_t giga_cnt = total_ops / 4;  // number of vsqrtpd instructions
  const double giga_ops = total_ops / 1e9;
  const double performance = giga_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";

  // Switch to the user's locale for the remaining numbers (e.g. thousands
  // separators). std::locale("") throws if the environment names a locale
  // that is not installed; fall back to the default formatting instead of
  // aborting after the measurement has already been taken.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    // Keep the stream's current locale.
  }

  std::cout << "Counter: " << giga_cnt << "\n";
  std::cout << "Total Compute: " << giga_ops
            << " double sqrt GFLOPS (counter * 4)\n";
  std::cout << "Performance: " << performance << " GFLOPS\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On 8-socket 192C E7-8890v4 CPUs server with SMT disabled: