Hermann-SW · June 26, 2026 20:23 · Hermann-SW · Jun 26, 2026 · Hermann-SW · Jun 26, 2026
diff --git a/AVX2.vsqrtpd.cpp b/AVX2.vsqrtpd.cpp
 /*
 f=AVX2.vsqrtpd
 g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f
 cpplint --filter=-legal/copyright $f.cpp
 cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config

 echo off | sudo tee /sys/devices/system/cpu/smt/control
 echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
 perf stat -a -e cycles,instructions,task-clock ./$f

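 Output (sample below is from the AVX-512 variant; this AVX2 build
 prints "[AVX256] ..." and "(counter * 4)" instead):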
 hermann@7950x:~$ ./$f
 Starting hardware-bound benchmark using 16 threads...
 ... [AVX512F] vsqrtpd(mm512d,mm512d) completed
 https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png
 -------------------------------------------
 Execution Time: 4.75232 seconds
 Counter:        25,600,000,000
 Total Compute:  204.8 double sqrt GFLOPS (counter * 8)
 Performance:    43.0947 GFLOPS
 -------------------------------------------
 hermann@7950x:~$ 
 */
 #include <omp.h>
 #include <inttypes.h>
 #include <iostream>
 #include <chrono>   // NOLINT [build/c++11]

 // Benchmark entry point.
 //
 // Every OpenMP thread runs `iterations` rounds of eight independent 256-bit
 // vsqrtpd instructions (ymm0..ymm7). Afterwards the elapsed wall-clock time
 // and the throughput figures derived from it are written to stdout.
 // Command-line arguments are ignored; the function always returns 0.
 int main(int, char**) {
   const int iterations = 200000000;  // 2*10^8

   std::cout << "Starting hardware-bound benchmark using "
             << omp_get_max_threads() << " threads...\n";

   // Number of threads that really execute the parallel region. Written by
   // thread 0 only and read after the region's closing barrier.
   int team_size = 1;

   auto start_time = std::chrono::high_resolution_clock::now();

   #pragma omp parallel
   {
     // omp_get_max_threads() is only an upper bound on the team size, so
     // record the real team size for the throughput calculation below.
     if (omp_get_thread_num() == 0) {
       team_size = omp_get_num_threads();
     }

     for (int i = 0; i < iterations; ++i) {
       // Eight square roots on eight different registers, so no instruction
       // depends on another one within the same round.
       // NOTE(review): ymm0..ymm7 are never initialised here, so the operands
       // are whatever the registers hold on entry - confirm this does not
       // skew the timing (NaN/denormal inputs).
       asm __volatile__ (
           "vsqrtpd %%ymm0, %%ymm0 \n\t"
           "vsqrtpd %%ymm1, %%ymm1 \n\t"
           "vsqrtpd %%ymm2, %%ymm2 \n\t"
           "vsqrtpd %%ymm3, %%ymm3 \n\t"
           "vsqrtpd %%ymm4, %%ymm4 \n\t"
           "vsqrtpd %%ymm5, %%ymm5 \n\t"
           "vsqrtpd %%ymm6, %%ymm6 \n\t"
           "vsqrtpd %%ymm7, %%ymm7 \n\t"
           :  // No outputs
           :  // No inputs
           : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7");
     }
   }

   // Stop the clock before any console output so that stdout latency is not
   // counted as compute time.
   std::chrono::duration<double> duration =
       std::chrono::high_resolution_clock::now() - start_time;

   std::cout << "... [AVX256] vsqrtpd(mm256d,mm256d) completed\n";

   // 4 doubles per ymm register * 8 instructions per round * threads.
   int64_t ops_per_loop = 4 * 8 * static_cast<int64_t>(team_size);
   int64_t total_ops = ops_per_loop * iterations;
   int64_t giga_cnt = total_ops / 4;  // number of vsqrtpd instructions issued
   double giga_ops = total_ops / 1e9;
   double performance = giga_ops / duration.count();

   std::cout << "-------------------------------------------\n";
   std::cout << "Execution Time: " << duration.count() << " seconds\n";
   // Switch to the user's locale only now, so that just the figures printed
   // below pick up locale-specific digit grouping.
   std::cout.imbue(std::locale(""));
   std::cout << "Counter:        " << giga_cnt << "\n";
   std::cout << "Total Compute:  " << giga_ops
             << " double sqrt GFLOPS (counter * 4)\n";
   std::cout << "Performance:    " << performance << " GFLOPS\n";
   std::cout << "-------------------------------------------\n";

   return 0;
 }
	/*
	f=AVX2.vsqrtpd
	g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f
	cpplint --filter=-legal/copyright $f.cpp
	cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config

	echo off | sudo tee /sys/devices/system/cpu/smt/control
	echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
	perf stat -a -e cycles,instructions,task-clock ./$f

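	Output (sample below is from the AVX-512 variant; this AVX2 build
	prints "[AVX256] ..." and "(counter * 4)" instead):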
	hermann@7950x:~$ ./$f
	Starting hardware-bound benchmark using 16 threads...
	... [AVX512F] vsqrtpd(mm512d,mm512d) completed
	https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png
	-------------------------------------------
	Execution Time: 4.75232 seconds
	Counter: 25,600,000,000
	Total Compute: 204.8 double sqrt GFLOPS (counter * 8)
	Performance: 43.0947 GFLOPS
	-------------------------------------------
	hermann@7950x:~$
	*/
	#include <omp.h>
	#include <inttypes.h>
	#include <iostream>
	#include <chrono> // NOLINT [build/c++11]

	// Benchmark entry point.
	//
	// Every OpenMP thread runs `iterations` rounds of eight independent 256-bit
	// vsqrtpd instructions (ymm0..ymm7). Afterwards the elapsed wall-clock time
	// and the throughput figures derived from it are written to stdout.
	// Command-line arguments are ignored; the function always returns 0.
	int main(int, char**) {
	  const int iterations = 200000000;  // 2*10^8

	  std::cout << "Starting hardware-bound benchmark using "
	            << omp_get_max_threads() << " threads...\n";

	  // Number of threads that really execute the parallel region. Written by
	  // thread 0 only and read after the region's closing barrier.
	  int team_size = 1;

	  auto start_time = std::chrono::high_resolution_clock::now();

	  #pragma omp parallel
	  {
	    // omp_get_max_threads() is only an upper bound on the team size, so
	    // record the real team size for the throughput calculation below.
	    if (omp_get_thread_num() == 0) {
	      team_size = omp_get_num_threads();
	    }

	    for (int i = 0; i < iterations; ++i) {
	      // Eight square roots on eight different registers, so no instruction
	      // depends on another one within the same round.
	      // NOTE(review): ymm0..ymm7 are never initialised here, so the operands
	      // are whatever the registers hold on entry - confirm this does not
	      // skew the timing (NaN/denormal inputs).
	      asm __volatile__ (
	          "vsqrtpd %%ymm0, %%ymm0 \n\t"
	          "vsqrtpd %%ymm1, %%ymm1 \n\t"
	          "vsqrtpd %%ymm2, %%ymm2 \n\t"
	          "vsqrtpd %%ymm3, %%ymm3 \n\t"
	          "vsqrtpd %%ymm4, %%ymm4 \n\t"
	          "vsqrtpd %%ymm5, %%ymm5 \n\t"
	          "vsqrtpd %%ymm6, %%ymm6 \n\t"
	          "vsqrtpd %%ymm7, %%ymm7 \n\t"
	          :  // No outputs
	          :  // No inputs
	          : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7");
	    }
	  }

	  // Stop the clock before any console output so that stdout latency is not
	  // counted as compute time.
	  std::chrono::duration<double> duration =
	      std::chrono::high_resolution_clock::now() - start_time;

	  std::cout << "... [AVX256] vsqrtpd(mm256d,mm256d) completed\n";

	  // 4 doubles per ymm register * 8 instructions per round * threads.
	  int64_t ops_per_loop = 4 * 8 * static_cast<int64_t>(team_size);
	  int64_t total_ops = ops_per_loop * iterations;
	  int64_t giga_cnt = total_ops / 4;  // number of vsqrtpd instructions issued
	  double giga_ops = total_ops / 1e9;
	  double performance = giga_ops / duration.count();

	  std::cout << "-------------------------------------------\n";
	  std::cout << "Execution Time: " << duration.count() << " seconds\n";
	  // Switch to the user's locale only now, so that just the figures printed
	  // below pick up locale-specific digit grouping.
	  std::cout.imbue(std::locale(""));
	  std::cout << "Counter: " << giga_cnt << "\n";
	  std::cout << "Total Compute: " << giga_ops
	            << " double sqrt GFLOPS (counter * 4)\n";
	  std::cout << "Performance: " << performance << " GFLOPS\n";
	  std::cout << "-------------------------------------------\n";

	  return 0;
	}
No results found