Skip to content

Instantly share code, notes, and snippets.

@Hermann-SW
Created June 3, 2026 21:01
Show Gist options
  • Select an option

  • Save Hermann-SW/c4e40e823d274d03094d5e6d5071017d to your computer and use it in GitHub Desktop.

Select an option

Save Hermann-SW/c4e40e823d274d03094d5e6d5071017d to your computer and use it in GitHub Desktop.
Demonstrate maximal "double sqrt" GFLOPS performance for Zen4 AMD 16C/32T 7950X CPUs
/*
f=AVX512_VNNI.vsqrtpd
g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f
cpplint --filter=-legal/copyright $f.cpp
cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config
echo off | sudo tee /sys/devices/system/cpu/smt/control
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f
Output:
hermann@7950x:~$ ./$f
Starting hardware-bound benchmark using 16 threads...
... [AVX512F] vsqrtpd(mm512d,mm512d) completed
https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png
-------------------------------------------
Execution Time: 4.75232 seconds
Counter: 25,600,000,000
Total Compute: 204.8 double sqrt GFLOPS (counter * 8)
Performance: 43.0947 GFLOPS
-------------------------------------------
hermann@7950x:~$
*/
#include <omp.h>
#include <inttypes.h>
#include <iostream>
#include <chrono> // NOLINT [build/c++11]
int main(int, char**) {
const int iterations = 200000000; // 2*10^8
std::cout << "Starting hardware-bound benchmark using "
<< omp_get_max_threads() << " threads...\n";
auto start_time = std::chrono::high_resolution_clock::now();
#pragma omp parallel
{
for (int i = 0; i < iterations; ++i) {
asm __volatile__ ("vsqrtpd %%zmm0, %%zmm0" : : : "zmm0");
asm __volatile__ ("vsqrtpd %%zmm1, %%zmm1" : : : "zmm1");
asm __volatile__ ("vsqrtpd %%zmm2, %%zmm2" : : : "zmm2");
asm __volatile__ ("vsqrtpd %%zmm3, %%zmm3" : : : "zmm3");
asm __volatile__ ("vsqrtpd %%zmm4, %%zmm4" : : : "zmm4");
asm __volatile__ ("vsqrtpd %%zmm5, %%zmm5" : : : "zmm5");
asm __volatile__ ("vsqrtpd %%zmm6, %%zmm6" : : : "zmm6");
asm __volatile__ ("vsqrtpd %%zmm7, %%zmm7" : : : "zmm7");
}
}
std::cout << "... [AVX512F] vsqrtpd(mm512d,mm512d) completed\n";
std::cout <<
"https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png\n";
std::chrono::duration<double> duration =
std::chrono::high_resolution_clock::now() - start_time;
int64_t ops_per_loop = 8 * 8 * omp_get_max_threads();
int64_t total_ops = ops_per_loop * iterations;
int64_t giga_cnt = total_ops / 8;
double giga_ops = total_ops / 1e9;
double performance = giga_ops / duration.count();
std::cout << "-------------------------------------------\n";
std::cout << "Execution Time: " << duration.count() << " seconds\n";
std::cout.imbue(std::locale(""));
std::cout << "Counter: " << giga_cnt << "\n";
std::cout << "Total Compute: " << giga_ops
<< " double sqrt GFLOPS (counter * 8)\n";
std::cout << "Performance: " << performance << " GFLOPS\n";
std::cout << "-------------------------------------------\n";
return 0;
}
@Hermann-SW

Copy link
Copy Markdown
Author

One vsqrtpd does 8 double sqrt computations:
https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png

25600000000*8/(4.756737 * 10^9) = 43.0547243
hermann@7950x:~$ perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f
Starting hardware-bound benchmark using 16 threads...
... [AVX512F] vsqrtpd(mm512d,mm512d) completed
https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png
-------------------------------------------
Execution Time: 4.75516 seconds
Counter:        25,600,000,000
Total Compute:  204.8 double sqrt GFLOPS (counter * 8)
Performance:    43.069 GFLOPS
-------------------------------------------

 Performance counter stats for 'system wide':

    25,600,037,725      fp_ops_retired_by_width.pack_512_uops_retired #  336.365 M/sec                     
   410,578,408,395      cycles                           #    5.395 GHz                       
    32,537,023,597      instructions                     #    0.08  insn per cycle            
         76,107.81 msec task-clock                       #   16.000 CPUs utilized             

       4.756736921 seconds time elapsed

hermann@7950x:~$ 

@Hermann-SW

Copy link
Copy Markdown
Author

Main loop in C++ with inline assembly (allowing control over the 512bit register names):

hermann@7950x:~$ sed -n "/pragma/,/^  }/p" $f.cpp
  #pragma omp parallel
  {
    for (int i = 0; i < iterations; ++i) {
      asm __volatile__ ("vsqrtpd  %%zmm0,  %%zmm0"  : : :  "zmm0");
      asm __volatile__ ("vsqrtpd  %%zmm1,  %%zmm1"  : : :  "zmm1");
      asm __volatile__ ("vsqrtpd  %%zmm2,  %%zmm2"  : : :  "zmm2");
      asm __volatile__ ("vsqrtpd  %%zmm3,  %%zmm3"  : : :  "zmm3");
      asm __volatile__ ("vsqrtpd  %%zmm4,  %%zmm4"  : : :  "zmm4");
      asm __volatile__ ("vsqrtpd  %%zmm5,  %%zmm5"  : : :  "zmm5");
      asm __volatile__ ("vsqrtpd  %%zmm6,  %%zmm6"  : : :  "zmm6");
      asm __volatile__ ("vsqrtpd  %%zmm7,  %%zmm7"  : : :  "zmm7");
    }
  }
hermann@7950x:~$ 

Same loop with objdump:

hermann@7950x:~$ objdump -d AVX512_VNNI.vsqrtpd | sed -n "/vsqrtpd /,/jne/p" 
    1570:       62 f1 fd 48 51 c0       vsqrtpd %zmm0,%zmm0
    1576:       62 f1 fd 48 51 c9       vsqrtpd %zmm1,%zmm1
    157c:       62 f1 fd 48 51 d2       vsqrtpd %zmm2,%zmm2
    1582:       62 f1 fd 48 51 db       vsqrtpd %zmm3,%zmm3
    1588:       62 f1 fd 48 51 e4       vsqrtpd %zmm4,%zmm4
    158e:       62 f1 fd 48 51 ed       vsqrtpd %zmm5,%zmm5
    1594:       62 f1 fd 48 51 f6       vsqrtpd %zmm6,%zmm6
    159a:       62 f1 fd 48 51 ff       vsqrtpd %zmm7,%zmm7
    15a0:       83 e8 01                sub    $0x1,%eax
    15a3:       75 cb                   jne    1570 <main._omp_fn.0+0x10>
hermann@7950x:~$ 

@Hermann-SW

Hermann-SW commented Jun 3, 2026

Copy link
Copy Markdown
Author

Real world OpenMP program ml100K.cpp computing TSP tour length of 100,000 cities mona-lisa100K.tsp 500,000× (on each core) does 16*50*10^9 double sqrt and more, and reaches 16*50*10^9/20.1636s = 39.675 Gsqrt/s, very close to peak 43 double Gsqrt/s (>92%)!

C++ nested loops:

for (int i = 0; i < repeats; ++i) {
  __m512i acc = _mm512_setzero_si512();

  for (int i = 0; i < 2*N; i+=32) {
    __m512i a = _mm512_load_si512((const __m512i*)(xy_even+i));
    __m512i b = _mm512_load_si512((const __m512i*)(xy_odd+i));

    __m512i dxy = _mm512_sub_epi16(a, b);

    __m512i aux = DOT_PRODUCT_ACC(_mm512_setzero_si512(), dxy, dxy);

    __m512d low_doubles  = _mm512_cvtepi32_pd(_mm512_castsi512_si256(aux));
    __m256i high_lanes   = _mm512_extracti64x4_epi64(aux, 1);
    __m512d high_doubles = _mm512_cvtepi32_pd(high_lanes);

    __m512d sqrt_low  = _mm512_sqrt_pd(low_doubles);
    __m512d sqrt_high = _mm512_sqrt_pd(high_doubles);

    __m512d res_low  = _mm512_add_pd(sqrt_low, half_pd);
    __m512d res_high = _mm512_add_pd(sqrt_high, half_pd);

    __m256i int_low  = _mm512_mask_cvtt_roundpd_epi32
                         (_mm256_undefined_si256(), 0xFF, res_low,
                             (_MM_FROUND_NO_EXC));
    __m256i int_high = _mm512_mask_cvtt_roundpd_epi32
                         (_mm256_undefined_si256(), 0xFF, res_high,
                           (_MM_FROUND_NO_EXC));

    __m512i euc_2d = _mm512_inserti64x4(_mm512_castsi256_si512(int_low),
                                        int_high, 1);

    acc = _mm512_add_epi32(acc, euc_2d);
  }

  sum += _mm512_reduce_add_epi32(acc);
}

same nested loop in "objdump -d":

    1410:       31 c0                   xor    %eax,%eax                    <-------------------------------------+
    1412:       c5 e9 ef d2             vpxor  %xmm2,%xmm2,%xmm2                                                  |
    1416:       66 2e 0f 1f 84 00 00    cs nopw 0x0(%rax,%rax,1)                                                  |
    141d:       00 00 00                                                                                          |
    1420:       62 f1 7d 48 6f cc       vmovdqa32 %zmm4,%zmm1               <----------------------------------+  |
    1426:       62 f1 fd 48 6f 2c 01    vmovdqa64 (%rcx,%rax,1),%zmm5                                          |  |
    142d:       62 f1 55 48 f9 04 02    vpsubw (%rdx,%rax,1),%zmm5,%zmm0                                       |  |
    1434:       48 83 c0 40             add    $0x40,%rax                                                      |  |
    1438:       62 f2 7d 48 52 c8       vpdpwssd %zmm0,%zmm0,%zmm1              16× 32bit "dx^2+dy^2"          |  |
    143e:       62 f1 7e 48 e6 c1       vcvtdq2pd %ymm1,%zmm0                                                  |  |
    1444:       62 f3 fd 48 3b c9 01    vextracti64x4 $0x1,%zmm1,%ymm1                                         |  |
    144b:       62 f1 7e 48 e6 c9       vcvtdq2pd %ymm1,%zmm1                                                  |  |
    1451:       62 f1 fd 48 51 c0       vsqrtpd %zmm0,%zmm0                     <====  8× 64bit sqrt           |  |
    1457:       62 f1 fd 48 58 c3       vaddpd %zmm3,%zmm0,%zmm0                                               |  |
    145d:       62 f1 fd 48 51 c9       vsqrtpd %zmm1,%zmm1                     <====  8× 64bit sqrt           |  |
    1463:       62 f1 f5 48 58 cb       vaddpd %zmm3,%zmm1,%zmm1                                               |  |
    1469:       62 f1 fd 18 e6 c0       vcvttpd2dq {sae},%zmm0,%ymm0                                           |  |
    146f:       62 f1 fd 18 e6 c9       vcvttpd2dq {sae},%zmm1,%ymm1                                           |  |
    1475:       62 f3 fd 48 3a c1 01    vinserti64x4 $0x1,%ymm1,%zmm0,%zmm0                                    |  |
    147c:       62 f1 6d 48 fe c0       vpaddd %zmm0,%zmm2,%zmm0                                               |  |
    1482:       62 f1 fd 48 6f d0       vmovdqa64 %zmm0,%zmm2                                                  |  |
    1488:       48 3d 80 1a 06 00       cmp    $0x61a80,%rax                                                   |  |
    148e:       75 90                   jne    1420 <_Z6bench2v+0x60>       -----------------------------------+  |
    1490:       62 f3 fd 48 3b c1 01    vextracti64x4 $0x1,%zmm0,%ymm1                                            |
    1497:       c5 f5 fe c0             vpaddd %ymm0,%ymm1,%ymm0                                                  |
    149b:       62 f3 fd 28 39 c1 01    vextracti64x2 $0x1,%ymm0,%xmm1                                            |
    14a2:       c5 f1 fe c0             vpaddd %xmm0,%xmm1,%xmm0                                                  |
    14a6:       c5 f9 70 c8 4e          vpshufd $0x4e,%xmm0,%xmm1                                                 |
    14ab:       c5 f1 fe c8             vpaddd %xmm0,%xmm1,%xmm1                                                  |
    14af:       c5 f9 6f c1             vmovdqa %xmm1,%xmm0                                                       |
    14b3:       c5 f9 70 c9 55          vpshufd $0x55,%xmm1,%xmm1                                                 |
    14b8:       c5 f9 fe c1             vpaddd %xmm1,%xmm0,%xmm0                                                  |
    14bc:       c5 f9 7e c0             vmovd  %xmm0,%eax                                                         |
    14c0:       48 98                   cltq                                                                      |
    14c2:       49 01 c4                add    %rax,%r12                                                          |
    14c5:       ff ce                   dec    %esi                                                               |
    14c7:       0f 85 43 ff ff ff       jne    1410 <_Z6bench2v+0x50>       --------------------------------------+

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment