Last active
May 26, 2026 19:31
-
-
Save Hermann-SW/9162a47fd59216aea77df90efc8e7ad2 to your computer and use it in GitHub Desktop.
Demonstrate multiple TOPS performance for Zen4 AMD CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX512_VNNI.DP2A_s32_s16_s16 | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f | |
| Outputs for "./f": | |
| Performance: 2.23851 TOPS (Tera-Ops/sec) on AMD 8840HS 8C laptop CPU (at 4.400 GHz) | |
| Performance: 5.12171 TOPS (Tera-Ops/sec) on AMD 7950X 16C CPU (at 5.013 GHz) | |
| Outputs for "./f byte": | |
| Performance: 4.17551 TOPS (Tera-Ops/sec) on AMD 8840HS 8C laptop CPU (at 4.116 GHz) | |
| Performance: 10.2413 TOPS (Tera-Ops/sec) on AMD 7950X 16C CPU (at 5.013 GHz) | |
| */ | |
#include <inttypes.h>
#include <omp.h>
#include <chrono>  // NOLINT [build/c++11]
#include <iostream>
#include <locale>
#include <stdexcept>
// Throughput benchmark for the AVX512_VNNI dot-product instructions.
//
// Every thread of one OpenMP parallel region runs `iterations` loop passes.
// Each pass issues five instructions; each instruction uses its own register
// (zmm0..zmm4) for all three operands. Register contents are never
// initialised or read back - only the instruction rate is measured.
//
// argc: without a command line argument the word variant (vpdpwssd) runs;
//       with any argument (argc > 1) the byte variant (vpdpbusd) runs.
// Returns 0.
int main(int argc, char**) {
  const int iterations = 2000000000;  // 2*10^9
  const bool byt = (argc > 1);

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // Number of threads that really execute the parallel region. It is
  // written by thread 0 only and read after the region has ended.
  int team_size = 1;

  const auto start_time = std::chrono::high_resolution_clock::now();
#pragma omp parallel
  {
    // omp_get_max_threads() is only an upper bound; use the real team size
    // so the operation count is not overstated.
    if (omp_get_thread_num() == 0) {
      team_size = omp_get_num_threads();
    }

    if (byt) {
      for (int i = 0; i < iterations; ++i) {
        asm __volatile__ ("vpdpbusd %%zmm0, %%zmm0, %%zmm0" : : : "zmm0");
        asm __volatile__ ("vpdpbusd %%zmm1, %%zmm1, %%zmm1" : : : "zmm1");
        asm __volatile__ ("vpdpbusd %%zmm2, %%zmm2, %%zmm2" : : : "zmm2");
        asm __volatile__ ("vpdpbusd %%zmm3, %%zmm3, %%zmm3" : : : "zmm3");
        asm __volatile__ ("vpdpbusd %%zmm4, %%zmm4, %%zmm4" : : : "zmm4");
      }
    } else {
      for (int i = 0; i < iterations; ++i) {
        asm __volatile__ ("vpdpwssd %%zmm0, %%zmm0, %%zmm0" : : : "zmm0");
        asm __volatile__ ("vpdpwssd %%zmm1, %%zmm1, %%zmm1" : : : "zmm1");
        asm __volatile__ ("vpdpwssd %%zmm2, %%zmm2, %%zmm2" : : : "zmm2");
        asm __volatile__ ("vpdpwssd %%zmm3, %%zmm3, %%zmm3" : : : "zmm3");
        asm __volatile__ ("vpdpwssd %%zmm4, %%zmm4, %%zmm4" : : : "zmm4");
      }
    }
  }
  // Stop the clock before any output so that console I/O is not part of the
  // measured interval.
  const std::chrono::duration<double> duration =
      std::chrono::high_resolution_clock::now() - start_time;

  // NOTE(review): label and link below describe the word variant (vpdpwssd);
  // they are printed unchanged in byte mode as well.
  std::cout << "... [AVX512_VNNI] DP2A(s32,s16,s16) completed\n";
  std::cout <<
    "https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22\n";

  // Per instruction: 32 multiplications and 16 + 16 additions in word mode;
  // both numbers are doubled in byte mode.
  const int64_t width_factor = byt ? 2 : 1;
  const int64_t mult_s16_per_vpdpwssd = 32 * width_factor;
  const int64_t add_s32_per_vpdpwssd = (16 + 16) * width_factor;
  const int64_t ops_per_vpdpwssd =
      mult_s16_per_vpdpwssd + add_s32_per_vpdpwssd;
  // Five instructions per loop pass on each participating thread.
  const int64_t ops_per_loop = 5 * ops_per_vpdpwssd * team_size;
  const int64_t total_ops = ops_per_loop * iterations;
  // Number of instructions executed over all threads.
  const int64_t giga_cnt = total_ops / ops_per_vpdpwssd;
  const double tera_ops = total_ops / 1e12;
  const double performance = tera_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";
  // The user's locale is wanted for digit grouping only. std::locale("")
  // throws std::runtime_error if the environment names a locale that is not
  // available; keep the current locale then so the results still get printed.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    // Intentionally empty: continue with the stream's current locale.
  }
  std::cout << "Counter: " << giga_cnt << "\n";
  std::cout << "Total Compute: " << tera_ops
            << " Tera-Operations (counter * (32+16+16) * "
            << (byt ? 2 : 1) << ")\n";
  std::cout << "Performance: " << performance << " TOPS (Tera-Ops/sec)\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
With new "byte" option TOPS numbers double (multiplying 64 byte pairs vs. 32 word pairs):