Hermann-SW · June 3, 2026 21:01 · Hermann-SW · Jun 3, 2026 · Hermann-SW · Jun 3, 2026
diff --git a/AVX512_VNNI.vsqrtpd.cpp b/AVX512_VNNI.vsqrtpd.cpp
 /*
 f=AVX512_VNNI.vsqrtpd
 g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f
 cpplint --filter=-legal/copyright $f.cpp
 cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config

 echo off | sudo tee /sys/devices/system/cpu/smt/control
 echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
 perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f

 Output:
 hermann@7950x:~$ ./$f
 Starting hardware-bound benchmark using 16 threads...
 ... [AVX512F] vsqrtpd(mm512d,mm512d) completed
 https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png
 -------------------------------------------
 Execution Time: 4.75232 seconds
 Counter:        25,600,000,000
 Total Compute:  204.8 double sqrt GFLOPS (counter * 8)
 Performance:    43.0947 GFLOPS
 -------------------------------------------
 hermann@7950x:~$ 
 */
 #include <inttypes.h>
 #include <omp.h>
 #include <chrono>   // NOLINT [build/c++11]
 #include <iostream>
 #include <locale>
 #include <stdexcept>

 // Benchmark driver: every OpenMP thread runs `iterations` rounds of eight
 // vsqrtpd instructions on zmm0..zmm7, then the elapsed time and the derived
 // packed-double square-root rate are printed.
 //
 // Each instruction uses the same register as source and destination, so the
 // eight registers form eight separate dependency chains.  The registers are
 // never loaded by this code; the operand values are whatever they contain.
 //
 // Returns 0.  Command-line arguments are ignored.
 int main(int, char**) {
  const int iterations = 200000000;  // 2*10^8
  const int sqrt_instr_per_iter = 8;  // one vsqrtpd per register zmm0..zmm7
  const int doubles_per_zmm = 8;      // 512-bit register / 64-bit double

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // omp_get_max_threads() is only an upper bound on the team size, so the
  // real thread count is recorded inside the parallel region and used for
  // the operation count below.
  int threads_used = 1;

  // steady_clock is monotonic; high_resolution_clock may be an alias of the
  // adjustable system clock.
  const auto start_time = std::chrono::steady_clock::now();

  #pragma omp parallel
  {
    #pragma omp single
    threads_used = omp_get_num_threads();

    for (int i = 0; i < iterations; ++i) {
      // volatile keeps the compiler from dropping or merging the statements;
      // the clobber tells it the register is overwritten.
      asm __volatile__ ("vsqrtpd  %%zmm0,  %%zmm0"  : : :  "zmm0");
      asm __volatile__ ("vsqrtpd  %%zmm1,  %%zmm1"  : : :  "zmm1");
      asm __volatile__ ("vsqrtpd  %%zmm2,  %%zmm2"  : : :  "zmm2");
      asm __volatile__ ("vsqrtpd  %%zmm3,  %%zmm3"  : : :  "zmm3");
      asm __volatile__ ("vsqrtpd  %%zmm4,  %%zmm4"  : : :  "zmm4");
      asm __volatile__ ("vsqrtpd  %%zmm5,  %%zmm5"  : : :  "zmm5");
      asm __volatile__ ("vsqrtpd  %%zmm6,  %%zmm6"  : : :  "zmm6");
      asm __volatile__ ("vsqrtpd  %%zmm7,  %%zmm7"  : : :  "zmm7");
    }
  }

  // Stop the clock before any console output so that I/O is not part of the
  // measured interval.
  const std::chrono::duration<double> duration =
    std::chrono::steady_clock::now() - start_time;

  std::cout << "... [AVX512F] vsqrtpd(mm512d,mm512d) completed\n";
  std::cout <<
    "https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png\n";

  // Element-wise square roots per loop iteration, summed over all threads.
  const int64_t ops_per_loop =
    static_cast<int64_t>(sqrt_instr_per_iter) * doubles_per_zmm * threads_used;
  const int64_t total_ops = ops_per_loop * iterations;
  // Number of vsqrtpd instructions executed (one per 8 doubles).
  const int64_t instr_count = total_ops / doubles_per_zmm;
  const double giga_ops = total_ops / 1e9;  // total square roots, in 1e9
  const double performance = giga_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";
  // Switch to the user's locale for digit grouping of the large counter.
  // std::locale("") throws std::runtime_error when the environment names a
  // locale that is not installed; in that case keep the default formatting
  // rather than aborting after the benchmark has already run.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    // Fall back to the stream's current locale.
  }
  std::cout << "Counter:        " << instr_count << "\n";
  std::cout << "Total Compute:  " << giga_ops
            << " double sqrt GFLOPS (counter * 8)\n";
  std::cout << "Performance:    " << performance << " GFLOPS\n";
  std::cout << "-------------------------------------------\n";

  return 0;
 }
	/*
	f=AVX512_VNNI.vsqrtpd
	g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f
	cpplint --filter=-legal/copyright $f.cpp
	cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config

	echo off | sudo tee /sys/devices/system/cpu/smt/control
	echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
	perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f

	Output:
	hermann@7950x:~$ ./$f
	Starting hardware-bound benchmark using 16 threads...
	... [AVX512F] vsqrtpd(mm512d,mm512d) completed
	https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png
	-------------------------------------------
	Execution Time: 4.75232 seconds
	Counter: 25,600,000,000
	Total Compute: 204.8 double sqrt GFLOPS (counter * 8)
	Performance: 43.0947 GFLOPS
	-------------------------------------------
	hermann@7950x:~$
	*/
	#include <omp.h>
	#include <inttypes.h>
	#include <iostream>
	#include <chrono> // NOLINT [build/c++11]

	// Benchmark driver: every OpenMP thread runs `iterations` rounds of eight
	// vsqrtpd instructions on zmm0..zmm7, then the elapsed time and the derived
	// packed-double square-root rate are printed.
	//
	// Each instruction uses the same register as source and destination, so the
	// eight registers form eight separate dependency chains.  The registers are
	// never loaded by this code; the operand values are whatever they contain.
	//
	// Returns 0.  Command-line arguments are ignored.
	int main(int, char**) {
	  const int iterations = 200000000;  // 2*10^8
	  const int sqrt_instr_per_iter = 8;  // one vsqrtpd per register zmm0..zmm7
	  const int doubles_per_zmm = 8;      // 512-bit register / 64-bit double

	  std::cout << "Starting hardware-bound benchmark using "
	            << omp_get_max_threads() << " threads...\n";

	  // omp_get_max_threads() is only an upper bound on the team size, so the
	  // real thread count is recorded inside the parallel region and used for
	  // the operation count below.
	  int threads_used = 1;

	  // steady_clock is monotonic; high_resolution_clock may be an alias of the
	  // adjustable system clock.
	  const auto start_time = std::chrono::steady_clock::now();

	  #pragma omp parallel
	  {
	    #pragma omp single
	    threads_used = omp_get_num_threads();

	    for (int i = 0; i < iterations; ++i) {
	      // volatile keeps the compiler from dropping or merging the
	      // statements; the clobber tells it the register is overwritten.
	      asm __volatile__ ("vsqrtpd %%zmm0, %%zmm0" : : : "zmm0");
	      asm __volatile__ ("vsqrtpd %%zmm1, %%zmm1" : : : "zmm1");
	      asm __volatile__ ("vsqrtpd %%zmm2, %%zmm2" : : : "zmm2");
	      asm __volatile__ ("vsqrtpd %%zmm3, %%zmm3" : : : "zmm3");
	      asm __volatile__ ("vsqrtpd %%zmm4, %%zmm4" : : : "zmm4");
	      asm __volatile__ ("vsqrtpd %%zmm5, %%zmm5" : : : "zmm5");
	      asm __volatile__ ("vsqrtpd %%zmm6, %%zmm6" : : : "zmm6");
	      asm __volatile__ ("vsqrtpd %%zmm7, %%zmm7" : : : "zmm7");
	    }
	  }

	  // Stop the clock before any console output so that I/O is not part of
	  // the measured interval.
	  const std::chrono::duration<double> duration =
	    std::chrono::steady_clock::now() - start_time;

	  std::cout << "... [AVX512F] vsqrtpd(mm512d,mm512d) completed\n";
	  std::cout <<
	    "https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png\n";

	  // Element-wise square roots per loop iteration, summed over all threads.
	  const int64_t ops_per_loop =
	    static_cast<int64_t>(sqrt_instr_per_iter) * doubles_per_zmm * threads_used;
	  const int64_t total_ops = ops_per_loop * iterations;
	  // Number of vsqrtpd instructions executed (one per 8 doubles).
	  const int64_t instr_count = total_ops / doubles_per_zmm;
	  const double giga_ops = total_ops / 1e9;  // total square roots, in 1e9
	  const double performance = giga_ops / duration.count();

	  std::cout << "-------------------------------------------\n";
	  std::cout << "Execution Time: " << duration.count() << " seconds\n";
	  // Switch to the user's locale for digit grouping of the large counter.
	  // std::locale("") throws std::runtime_error when the environment names a
	  // locale that is not installed; in that case keep the default formatting
	  // rather than aborting after the benchmark has already run.
	  try {
	    std::cout.imbue(std::locale(""));
	  } catch (const std::runtime_error&) {
	    // Fall back to the stream's current locale.
	  }
	  std::cout << "Counter: " << instr_count << "\n";
	  std::cout << "Total Compute: " << giga_ops
	            << " double sqrt GFLOPS (counter * 8)\n";
	  std::cout << "Performance: " << performance << " GFLOPS\n";
	  std::cout << "-------------------------------------------\n";

	  return 0;
	}
No results found