Last active
May 26, 2026 19:31
-
-
Save Hermann-SW/9162a47fd59216aea77df90efc8e7ad2 to your computer and use it in GitHub Desktop.
Demonstrate multiple TOPS performance for Zen4 AMD CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX512_VNNI.DP2A_s32_s16_s16 | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f | |
| Outputs for "./f": | |
| Performance: 2.23851 TOPS (Tera-Ops/sec) on AMD 8840HS 8C laptop CPU (at 4.400 GHz) | |
| Performance: 5.12171 TOPS (Tera-Ops/sec) on AMD 7950X 16C CPU (at 5.013 GHz) | |
| Outputs for "./f byte": | |
| Performance: 4.17551 TOPS (Tera-Ops/sec) on AMD 8840HS 8C laptop CPU (at 4.116 GHz) | |
| Performance: 10.2413 TOPS (Tera-Ops/sec) on AMD 7950X 16C CPU (at 5.013 GHz) | |
| */ | |
#include <inttypes.h>
#include <omp.h>

#include <chrono>  // NOLINT [build/c++11]
#include <iostream>
#include <locale>
#include <stdexcept>
/**
 * Throughput benchmark for the AVX512-VNNI dot-product-accumulate
 * instructions, executed concurrently on every thread of an OpenMP team.
 *
 * Mode selection:
 *   - no command line argument: vpdpwssd (16-bit word operands)
 *   - any command line argument: vpdpbusd (8-bit byte operands)
 *
 * Each loop trip issues five instructions, each reading and writing only its
 * own register (zmm0..zmm4), so the five form independent dependency chains
 * (presumably to hide instruction latency). The registers are never
 * initialised and the results are never read: only the instruction rate
 * matters.
 *
 * @param argc  only inspected for "argc > 1", which selects the byte variant
 * @return 0 after printing the elapsed time, instruction count, total
 *         operation count and resulting Tera-Ops/sec to stdout
 */
int main(int argc, char**) {
  const int iterations = 2000000000;  // 2*10^9 loop trips per thread
  const bool byt = (argc > 1);

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // omp_get_max_threads() is only an upper bound; the size of the team that
  // really executed the loops is recorded inside the parallel region.
  int team_size = 1;

  // steady_clock is monotonic, which is what an interval measurement needs.
  const auto start_time = std::chrono::steady_clock::now();
  #pragma omp parallel
  {
    // One thread stores the team size; "nowait" avoids an extra barrier, the
    // implicit barrier closing the parallel region publishes the value.
    #pragma omp single nowait
    team_size = omp_get_num_threads();

    if (byt) {
      for (int i = 0; i < iterations; ++i) {
        asm __volatile__ ("vpdpbusd %%zmm0, %%zmm0, %%zmm0" : : : "zmm0");
        asm __volatile__ ("vpdpbusd %%zmm1, %%zmm1, %%zmm1" : : : "zmm1");
        asm __volatile__ ("vpdpbusd %%zmm2, %%zmm2, %%zmm2" : : : "zmm2");
        asm __volatile__ ("vpdpbusd %%zmm3, %%zmm3, %%zmm3" : : : "zmm3");
        asm __volatile__ ("vpdpbusd %%zmm4, %%zmm4, %%zmm4" : : : "zmm4");
      }
    } else {
      for (int i = 0; i < iterations; ++i) {
        asm __volatile__ ("vpdpwssd %%zmm0, %%zmm0, %%zmm0" : : : "zmm0");
        asm __volatile__ ("vpdpwssd %%zmm1, %%zmm1, %%zmm1" : : : "zmm1");
        asm __volatile__ ("vpdpwssd %%zmm2, %%zmm2, %%zmm2" : : : "zmm2");
        asm __volatile__ ("vpdpwssd %%zmm3, %%zmm3, %%zmm3" : : : "zmm3");
        asm __volatile__ ("vpdpwssd %%zmm4, %%zmm4, %%zmm4" : : : "zmm4");
      }
    }
  }
  // Stop the clock before any console output so that I/O is not measured.
  const std::chrono::duration<double> duration =
      std::chrono::steady_clock::now() - start_time;

  // Name the instruction family that was actually executed.
  std::cout << (byt ? "... [AVX512_VNNI] DP4A(s32,u8,s8) completed\n"
                    : "... [AVX512_VNNI] DP2A(s32,s16,s16) completed\n");
  // The linked diagram illustrates the word variant (vpdpwssd).
  std::cout <<
      "https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22\n";

  // Operation accounting for one 512-bit instruction in word mode:
  // 32 multiplies, 16 adds combining the products and 16 accumulator adds.
  // Byte mode handles twice as many elements, hence the factor 2.
  const int64_t width_factor = byt ? 2 : 1;
  const int64_t mults_per_insn = 32 * width_factor;
  const int64_t adds_per_insn = (16 + 16) * width_factor;
  const int64_t ops_per_insn = mults_per_insn + adds_per_insn;

  // Five instructions per loop trip on every thread of the team.
  const int64_t insn_count = int64_t{5} * team_size * iterations;
  const int64_t total_ops = insn_count * ops_per_insn;
  const double tera_ops = total_ops / 1e12;
  const double performance = tera_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";
  // Switch to the user's locale for digit grouping. std::locale("") throws
  // when the environment names an unavailable locale; in that case keep the
  // stream's current locale instead of losing the result.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    // Intentionally empty: numbers are printed without locale formatting.
  }
  std::cout << "Counter: " << insn_count << "\n";
  std::cout << "Total Compute: " << tera_ops
            << " Tera-Operations (counter * (32+16+16) * "
            << (byt ? 2 : 1) << ")\n";
  std::cout << "Performance: " << performance << " TOPS (Tera-Ops/sec)\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
Author
Author
With the new "byte" option, the TOPS numbers double (each instruction multiplies 64 byte pairs instead of 32 word pairs):
hermann@7950x:~$ perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f byte
Starting hardware-bound benchmark using 16 threads...
... [AVX512_VNNI] DP2A(s32,s16,s16) completed
https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22
-------------------------------------------
Execution Time: 1.99975 seconds
Counter: 160,000,000,000
Total Compute: 20.48 Tera-Operations (counter * (32+16+16) * 2)
Performance: 10.2413 TOPS (Tera-Ops/sec)
-------------------------------------------
Performance counter stats for 'system wide':
160,000,038,892 fp_ops_retired_by_width.pack_512_uops_retired # 4.997 G/sec
160,518,614,310 cycles # 5.013 GHz
224,246,479,090 instructions # 1.40 insn per cycle
32,018.54 msec task-clock # 16.000 CPUs utilized
2.001118241 seconds time elapsed
hermann@7950x:~$
hermann@8840hs:~$ perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f byte
Starting hardware-bound benchmark using 8 threads...
... [AVX512_VNNI] DP2A(s32,s16,s16) completed
https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22
-------------------------------------------
Execution Time: 2.4524 seconds
Counter: 80,000,000,000
Total Compute: 10.24 Tera-Operations (counter * (32+16+16) * 2)
Performance: 4.17551 TOPS (Tera-Ops/sec)
-------------------------------------------
Performance counter stats for 'system wide':
80,001,396,997 fp_ops_retired_by_width.pack_512_uops_retired # 4.073 G/sec
80,848,144,482 cycles # 4.116 GHz
112,521,408,812 instructions # 1.39 insn per cycle
19,641,966,642 task-clock # 7.999 CPUs utilized
2.455592230 seconds time elapsed
hermann@8840hs:~$
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Main loop in C++ with inline assembly (allowing control over the 512bit register names):
Same loop with objdump: