IgorBaratta · January 5, 2022 12:44 · IgorBaratta · Jan 5, 2022 · IgorBaratta · Jan 5, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.18)

 # Build script for the BMcublas benchmark executable (main.cpp), which links
 # against Google Benchmark, the CUDA runtime and cuBLAS.
 project(BMcublas)

 # Fail with a readable message when no CUDA compiler is available instead of
 # letting enable_language() abort with a generic error.
 include(CheckLanguage)
 check_language(CUDA)
 if(NOT CMAKE_CUDA_COMPILER)
   message(FATAL_ERROR "BMcublas: no CUDA compiler found; set CMAKE_CUDA_COMPILER or CUDACXX")
 endif()
 enable_language(CUDA)

 set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)

 # Set C++ standard
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)

 find_package(benchmark REQUIRED)
 # FindCUDAToolkit (CMake >= 3.17) replaces the deprecated FindCUDA module and
 # provides imported targets that carry their own include directories.
 find_package(CUDAToolkit 11.0 REQUIRED)

 add_executable(${PROJECT_NAME} main.cpp)
 # The previous `target_include_directories(... PUBLIC SYSTEM ...)` call put
 # SYSTEM after the scope keyword, where it is parsed as a directory name; the
 # imported targets below make an explicit include-directory call unnecessary.
 target_link_libraries(${PROJECT_NAME}
   PRIVATE
     CUDA::cudart
     CUDA::cublas
     benchmark::benchmark
 )
diff --git a/main.cpp b/main.cpp
 #include "src.hpp"
 #include <benchmark/benchmark.h>
 #include <chrono>

 // Magnitude used to fill the benchmark vectors: the benchmarks below pass
 // +value for x and -value for y.
 double value = 1.0 / 3.0;

 // Make the std::chrono names visible without qualification in this file.
 using namespace std::chrono;

 // Benchmarks kernel::copy<T> on two device vectors of state.range(0) elements.
 //
 // Every iteration is bracketed by two CUDA events; the elapsed time between
 // them is divided by 1e3 and reported through SetIterationTime, so the
 // benchmark is expected to be registered with UseManualTime().
 template <typename T>
 static void BM_copy(benchmark::State& state) {
   const std::size_t n = state.range(0);

   cublasHandle_t handle;
   cublasCreate(&handle);

   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   T* x = nullptr;
   T* y = nullptr;
   bool ready = false;
   try {
     x = kernel::init_vector<T>(handle, n, value);
     y = kernel::init_vector<T>(handle, n, -value);
     ready = true;
   } catch (const std::exception& e) {
     // init_vector throws when a CUDA call fails (e.g. the largest sizes do not
     // fit on the device); skip this case instead of aborting the whole run.
     state.SkipWithError(e.what());
   }

   if (ready) {
     for (auto _ : state) {
       cudaEventRecord(start);
       kernel::copy<T>(handle, n, x, y);
       cudaEventRecord(stop);
       cudaEventSynchronize(stop);
       float t = 0;
       cudaEventElapsedTime(&t, start, stop);
       state.SetIterationTime(t / 1e3);
     }

     // One vector read (x) plus one vector written (y) per iteration.
     state.SetBytesProcessed(sizeof(T) * 2 * n * state.iterations());
     state.SetLabel("cublas");
   }

   cudaEventDestroy(start);
   cudaEventDestroy(stop);
   // Pass the device pointers themselves (not the address of the host
   // variables holding them); freeing a null pointer is a no-op.
   cudaFree(x);
   cudaFree(y);
   cublasDestroy(handle);
 }

 // Benchmarks kernel::axpy<T> (y <- alpha * x + y with alpha = 1) on two
 // device vectors of state.range(0) elements.
 //
 // Every iteration is bracketed by two CUDA events; the elapsed time between
 // them is divided by 1e3 and reported through SetIterationTime, so the
 // benchmark is expected to be registered with UseManualTime().
 template <typename T>
 static void BM_axpy(benchmark::State& state) {
   const std::size_t n = state.range(0);

   cublasHandle_t handle;
   cublasCreate(&handle);

   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   T* x = nullptr;
   T* y = nullptr;
   T alpha = 1.;
   bool ready = false;
   try {
     x = kernel::init_vector<T>(handle, n, value);
     y = kernel::init_vector<T>(handle, n, -value);
     ready = true;
   } catch (const std::exception& e) {
     // init_vector throws when a CUDA call fails (e.g. the largest sizes do not
     // fit on the device); skip this case instead of aborting the whole run.
     state.SkipWithError(e.what());
   }

   if (ready) {
     for (auto _ : state) {
       cudaEventRecord(start);
       kernel::axpy<T>(handle, n, alpha, x, y);
       cudaEventRecord(stop);
       cudaEventSynchronize(stop);
       float t = 0;
       cudaEventElapsedTime(&t, start, stop);
       state.SetIterationTime(t / 1e3);
     }

     // axpy reads x, reads y and writes y: three vector transfers per
     // iteration (the previous factor of 2 under-reported the traffic).
     state.SetBytesProcessed(sizeof(T) * 3 * n * state.iterations());
     state.SetLabel("cublas");
   }

   cudaEventDestroy(start);
   cudaEventDestroy(stop);
   // Pass the device pointers themselves (not the address of the host
   // variables holding them); freeing a null pointer is a no-op.
   cudaFree(x);
   cudaFree(y);
   cublasDestroy(handle);
 }

 // Benchmarks kernel::dot<T> on two device vectors of state.range(0) elements.
 //
 // Every iteration is bracketed by two CUDA events; the elapsed time between
 // them is divided by 1e3 and reported through SetIterationTime, so the
 // benchmark is expected to be registered with UseManualTime().
 template <typename T>
 static void BM_dot(benchmark::State& state) {
   const std::size_t n = state.range(0);

   cublasHandle_t handle;
   cublasCreate(&handle);

   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   T* x = nullptr;
   T* y = nullptr;
   bool ready = false;
   try {
     x = kernel::init_vector<T>(handle, n, value);
     y = kernel::init_vector<T>(handle, n, -value);
     ready = true;
   } catch (const std::exception& e) {
     // init_vector throws when a CUDA call fails (e.g. the largest sizes do not
     // fit on the device); skip this case instead of aborting the whole run.
     state.SkipWithError(e.what());
   }

   if (ready) {
     for (auto _ : state) {
       cudaEventRecord(start);
       T result = kernel::dot<T>(handle, n, x, y);
       // Keep the compiler from discarding the otherwise unused result.
       benchmark::DoNotOptimize(result);
       cudaEventRecord(stop);
       cudaEventSynchronize(stop);
       float t = 0;
       cudaEventElapsedTime(&t, start, stop);
       state.SetIterationTime(t / 1e3);
     }

     // Both input vectors are read once per iteration.
     state.SetBytesProcessed(sizeof(T) * 2 * n * state.iterations());
     state.SetLabel("cublas");
   }

   cudaEventDestroy(start);
   cudaEventDestroy(stop);
   // Pass the device pointers themselves (not the address of the host
   // variables holding them); freeing a null pointer is a no-op.
   cudaFree(x);
   cudaFree(y);
   cublasDestroy(handle);
 }

 // Register the three benchmarks for double precision. Each one is swept over
 // vector lengths from 2^8 to 2^29 in powers of two and uses the manually
 // reported iteration time.
 BENCHMARK_TEMPLATE(BM_copy, double)
     ->UseManualTime()
     ->RangeMultiplier(2)
     ->Range(1 << 8, 1 << 29);

 BENCHMARK_TEMPLATE(BM_axpy, double)
     ->UseManualTime()
     ->RangeMultiplier(2)
     ->Range(1 << 8, 1 << 29);

 BENCHMARK_TEMPLATE(BM_dot, double)
     ->UseManualTime()
     ->RangeMultiplier(2)
     ->Range(1 << 8, 1 << 29);

 // Entry point: prints the CUDA device summary, lets the benchmark library
 // consume its command-line flags, and runs the selected benchmarks.
 // Returns 1 when unrecognised arguments remain, 0 otherwise.
 int main(int argc, char** argv) {
   kernel::print_device_info();

   ::benchmark::Initialize(&argc, argv);
   const bool has_unknown_args = ::benchmark::ReportUnrecognizedArguments(argc, argv);
   if (!has_unknown_args) {
     ::benchmark::RunSpecifiedBenchmarks();
     return 0;
   }
   return 1;
 }
diff --git a/src.hpp b/src.hpp
 #include <cuda_runtime.h>

 #include <algorithm>
 #include <cstddef>
 #include <iostream>
 #include <limits>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
 #include <vector>

 #include "cublas_v2.h"

 // Trait whose `value` is false for every T. Because it depends on T it is
 // only evaluated on instantiation; presumably intended for a static_assert
 // rejecting unsupported element types (not used that way in this file yet).
 template <typename T>
 struct unrecognized_type : std::integral_constant<bool, false> {};

 namespace kernel {

 void print_device_info() {
  int num_devices;

  cudaGetDeviceCount(&num_devices);
  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    double bandwidth = 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6;
    std::cout << "Device Number: " << i << std::endl;
    std::cout << "\tDevice name: " << prop.name << std::endl;
    std::cout << "\tShared memory available per block (kB): "
              << prop.sharedMemPerBlock / 1e3 << std::endl;
    std::cout << "\tGlobal memory available (GB): " << prop.totalGlobalMem / 1e9
              << std::endl;
    std::cout << "\tPeak Memory Bandwidth (GB/s): " << bandwidth << std::endl;
  }
  std::cout << std::endl;
 }

 void assert_cuda(cudaError_t e) {
  if (e != cudaSuccess)
    throw std::runtime_error(" Unable to allocate memoy - cublas error");
 }

 template <typename T>
 T* init_vector(cublasHandle_t handle, std::size_t n, T value) {
  T* x;
  assert_cuda(cudaMalloc(&x, n * sizeof(T)));
  assert_cuda(cudaMemset(x, value, n * sizeof(T)));
  return x;
 }

 template <typename T>
 void copy(cublasHandle_t handle, std::size_t n, T* x, T* y) {
  if constexpr (std::is_same<T, double>())
    cublasDcopy(handle, n, x, 1, y, 1);
  else if constexpr (std::is_same<T, float>())
    cublasScopy(handle, n, x, 1, y, 1);
 }

 template <typename T>
 void axpy(cublasHandle_t handle, std::size_t n, T alpha, T* x, T* y) {
  if constexpr (std::is_same<T, double>())
    cublasDaxpy(handle, n, &alpha, x, 1, y, 1);
  else if constexpr (std::is_same<T, float>())
    cublasSaxpy(handle, n, &alpha, x, 1, y, 1);
 }

 template <typename T>
 T dot(cublasHandle_t handle, std::size_t n, T* x, T* y) {
  T result = 0;
  if constexpr (std::is_same<T, double>())
    cublasDdot(handle, n, x, 1, y, 1, &result);
  else if constexpr (std::is_same<T, float>())
    cublasSdot(handle, n, x, 1, y, 1, &result);

  return result;
 }

 } // namespace kernel
	cmake_minimum_required(VERSION 3.18)

	# Build script for the BMcublas benchmark executable (main.cpp), which links
	# against Google Benchmark, the CUDA runtime and cuBLAS.
	project(BMcublas)

	# Fail with a readable message when no CUDA compiler is available instead of
	# letting enable_language() abort with a generic error.
	include(CheckLanguage)
	check_language(CUDA)
	if(NOT CMAKE_CUDA_COMPILER)
	  message(FATAL_ERROR "BMcublas: no CUDA compiler found; set CMAKE_CUDA_COMPILER or CUDACXX")
	endif()
	enable_language(CUDA)

	set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)

	# Set C++ standard
	set(CMAKE_CXX_STANDARD 17)
	set(CMAKE_CXX_STANDARD_REQUIRED ON)

	find_package(benchmark REQUIRED)
	# FindCUDAToolkit (CMake >= 3.17) replaces the deprecated FindCUDA module and
	# provides imported targets that carry their own include directories.
	find_package(CUDAToolkit 11.0 REQUIRED)

	add_executable(${PROJECT_NAME} main.cpp)
	# The previous `target_include_directories(... PUBLIC SYSTEM ...)` call put
	# SYSTEM after the scope keyword, where it is parsed as a directory name; the
	# imported targets below make an explicit include-directory call unnecessary.
	target_link_libraries(${PROJECT_NAME}
	  PRIVATE
	    CUDA::cudart
	    CUDA::cublas
	    benchmark::benchmark
	)
	#include "src.hpp"
	#include <benchmark/benchmark.h>
	#include <chrono>

	// Magnitude used to fill the benchmark vectors: the benchmarks below pass
	// +value for x and -value for y.
	double value = 1.0 / 3.0;

	// Make the std::chrono names visible without qualification in this file.
	using namespace std::chrono;

	// Benchmarks kernel::copy<T> on two device vectors of state.range(0) elements.
	//
	// Every iteration is bracketed by two CUDA events; the elapsed time between
	// them is divided by 1e3 and reported through SetIterationTime, so the
	// benchmark is expected to be registered with UseManualTime().
	template <typename T>
	static void BM_copy(benchmark::State& state) {
	  const std::size_t n = state.range(0);

	  cublasHandle_t handle;
	  cublasCreate(&handle);

	  cudaEvent_t start, stop;
	  cudaEventCreate(&start);
	  cudaEventCreate(&stop);

	  T* x = nullptr;
	  T* y = nullptr;
	  bool ready = false;
	  try {
	    x = kernel::init_vector<T>(handle, n, value);
	    y = kernel::init_vector<T>(handle, n, -value);
	    ready = true;
	  } catch (const std::exception& e) {
	    // init_vector throws when a CUDA call fails (e.g. the largest sizes do not
	    // fit on the device); skip this case instead of aborting the whole run.
	    state.SkipWithError(e.what());
	  }

	  if (ready) {
	    for (auto _ : state) {
	      cudaEventRecord(start);
	      kernel::copy<T>(handle, n, x, y);
	      cudaEventRecord(stop);
	      cudaEventSynchronize(stop);
	      float t = 0;
	      cudaEventElapsedTime(&t, start, stop);
	      state.SetIterationTime(t / 1e3);
	    }

	    // One vector read (x) plus one vector written (y) per iteration.
	    state.SetBytesProcessed(sizeof(T) * 2 * n * state.iterations());
	    state.SetLabel("cublas");
	  }

	  cudaEventDestroy(start);
	  cudaEventDestroy(stop);
	  // Pass the device pointers themselves (not the address of the host
	  // variables holding them); freeing a null pointer is a no-op.
	  cudaFree(x);
	  cudaFree(y);
	  cublasDestroy(handle);
	}

	// Benchmarks kernel::axpy<T> (y <- alpha * x + y with alpha = 1) on two
	// device vectors of state.range(0) elements.
	//
	// Every iteration is bracketed by two CUDA events; the elapsed time between
	// them is divided by 1e3 and reported through SetIterationTime, so the
	// benchmark is expected to be registered with UseManualTime().
	template <typename T>
	static void BM_axpy(benchmark::State& state) {
	  const std::size_t n = state.range(0);

	  cublasHandle_t handle;
	  cublasCreate(&handle);

	  cudaEvent_t start, stop;
	  cudaEventCreate(&start);
	  cudaEventCreate(&stop);

	  T* x = nullptr;
	  T* y = nullptr;
	  T alpha = 1.;
	  bool ready = false;
	  try {
	    x = kernel::init_vector<T>(handle, n, value);
	    y = kernel::init_vector<T>(handle, n, -value);
	    ready = true;
	  } catch (const std::exception& e) {
	    // init_vector throws when a CUDA call fails (e.g. the largest sizes do not
	    // fit on the device); skip this case instead of aborting the whole run.
	    state.SkipWithError(e.what());
	  }

	  if (ready) {
	    for (auto _ : state) {
	      cudaEventRecord(start);
	      kernel::axpy<T>(handle, n, alpha, x, y);
	      cudaEventRecord(stop);
	      cudaEventSynchronize(stop);
	      float t = 0;
	      cudaEventElapsedTime(&t, start, stop);
	      state.SetIterationTime(t / 1e3);
	    }

	    // axpy reads x, reads y and writes y: three vector transfers per
	    // iteration (the previous factor of 2 under-reported the traffic).
	    state.SetBytesProcessed(sizeof(T) * 3 * n * state.iterations());
	    state.SetLabel("cublas");
	  }

	  cudaEventDestroy(start);
	  cudaEventDestroy(stop);
	  // Pass the device pointers themselves (not the address of the host
	  // variables holding them); freeing a null pointer is a no-op.
	  cudaFree(x);
	  cudaFree(y);
	  cublasDestroy(handle);
	}

	// Benchmarks kernel::dot<T> on two device vectors of state.range(0) elements.
	//
	// Every iteration is bracketed by two CUDA events; the elapsed time between
	// them is divided by 1e3 and reported through SetIterationTime, so the
	// benchmark is expected to be registered with UseManualTime().
	template <typename T>
	static void BM_dot(benchmark::State& state) {
	  const std::size_t n = state.range(0);

	  cublasHandle_t handle;
	  cublasCreate(&handle);

	  cudaEvent_t start, stop;
	  cudaEventCreate(&start);
	  cudaEventCreate(&stop);

	  T* x = nullptr;
	  T* y = nullptr;
	  bool ready = false;
	  try {
	    x = kernel::init_vector<T>(handle, n, value);
	    y = kernel::init_vector<T>(handle, n, -value);
	    ready = true;
	  } catch (const std::exception& e) {
	    // init_vector throws when a CUDA call fails (e.g. the largest sizes do not
	    // fit on the device); skip this case instead of aborting the whole run.
	    state.SkipWithError(e.what());
	  }

	  if (ready) {
	    for (auto _ : state) {
	      cudaEventRecord(start);
	      T result = kernel::dot<T>(handle, n, x, y);
	      // Keep the compiler from discarding the otherwise unused result.
	      benchmark::DoNotOptimize(result);
	      cudaEventRecord(stop);
	      cudaEventSynchronize(stop);
	      float t = 0;
	      cudaEventElapsedTime(&t, start, stop);
	      state.SetIterationTime(t / 1e3);
	    }

	    // Both input vectors are read once per iteration.
	    state.SetBytesProcessed(sizeof(T) * 2 * n * state.iterations());
	    state.SetLabel("cublas");
	  }

	  cudaEventDestroy(start);
	  cudaEventDestroy(stop);
	  // Pass the device pointers themselves (not the address of the host
	  // variables holding them); freeing a null pointer is a no-op.
	  cudaFree(x);
	  cudaFree(y);
	  cublasDestroy(handle);
	}

	// Register the three benchmarks for double precision. Each one is swept over
	// vector lengths from 2^8 to 2^29 in powers of two and uses the manually
	// reported iteration time.
	BENCHMARK_TEMPLATE(BM_copy, double)
	    ->UseManualTime()
	    ->RangeMultiplier(2)
	    ->Range(1 << 8, 1 << 29);

	BENCHMARK_TEMPLATE(BM_axpy, double)
	    ->UseManualTime()
	    ->RangeMultiplier(2)
	    ->Range(1 << 8, 1 << 29);

	BENCHMARK_TEMPLATE(BM_dot, double)
	    ->UseManualTime()
	    ->RangeMultiplier(2)
	    ->Range(1 << 8, 1 << 29);

	// Entry point: prints the CUDA device summary, lets the benchmark library
	// consume its command-line flags, and runs the selected benchmarks.
	// Returns 1 when unrecognised arguments remain, 0 otherwise.
	int main(int argc, char** argv) {
	  kernel::print_device_info();

	  ::benchmark::Initialize(&argc, argv);
	  const bool has_unknown_args = ::benchmark::ReportUnrecognizedArguments(argc, argv);
	  if (!has_unknown_args) {
	    ::benchmark::RunSpecifiedBenchmarks();
	    return 0;
	  }
	  return 1;
	}
	#include <cuda_runtime.h>
	#include <stdexcept>
	#include <type_traits>

	#include "cublas_v2.h"

	// Trait whose `value` is false for every T. Because it depends on T it is
	// only evaluated on instantiation; presumably intended for a static_assert
	// rejecting unsupported element types (not used that way in this file yet).
	template <typename T>
	struct unrecognized_type : std::integral_constant<bool, false> {};

	namespace kernel {

	void print_device_info() {
	int num_devices;

	cudaGetDeviceCount(&num_devices);
	for (int i = 0; i < num_devices; i++) {
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, i);
	double bandwidth = 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6;
	std::cout << "Device Number: " << i << std::endl;
	std::cout << "\tDevice name: " << prop.name << std::endl;
	std::cout << "\tShared memory available per block (kB): "
	<< prop.sharedMemPerBlock / 1e3 << std::endl;
	std::cout << "\tGlobal memory available (GB): " << prop.totalGlobalMem / 1e9
	<< std::endl;
	std::cout << "\tPeak Memory Bandwidth (GB/s): " << bandwidth << std::endl;
	}
	std::cout << std::endl;
	}

	void assert_cuda(cudaError_t e) {
	if (e != cudaSuccess)
	throw std::runtime_error(" Unable to allocate memoy - cublas error");
	}

	template <typename T>
	T* init_vector(cublasHandle_t handle, std::size_t n, T value) {
	T* x;
	assert_cuda(cudaMalloc(&x, n * sizeof(T)));
	assert_cuda(cudaMemset(x, value, n * sizeof(T)));
	return x;
	}

	template <typename T>
	void copy(cublasHandle_t handle, std::size_t n, T* x, T* y) {
	if constexpr (std::is_same<T, double>())
	cublasDcopy(handle, n, x, 1, y, 1);
	else if constexpr (std::is_same<T, float>())
	cublasScopy(handle, n, x, 1, y, 1);
	}

	template <typename T>
	void axpy(cublasHandle_t handle, std::size_t n, T alpha, T* x, T* y) {
	if constexpr (std::is_same<T, double>())
	cublasDaxpy(handle, n, &alpha, x, 1, y, 1);
	else if constexpr (std::is_same<T, float>())
	cublasSaxpy(handle, n, &alpha, x, 1, y, 1);
	}

	template <typename T>
	T dot(cublasHandle_t handle, std::size_t n, T* x, T* y) {
	T result = 0;
	if constexpr (std::is_same<T, double>())
	cublasDdot(handle, n, x, 1, y, 1, &result);
	else if constexpr (std::is_same<T, float>())
	cublasSdot(handle, n, x, 1, y, 1, &result);

	return result;
	}

	} // namespace kernel