Created
March 7, 2024 11:58
-
-
Save dajuno/3adbe8ebc2801dbb02ea4700b713e66f to your computer and use it in GitHub Desktop.
C++ vs Python vs numpy vs Numba JIT speed comparison
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* compile with:
g++ -O3 -std=c++11 vec_dot.cpp -I/usr/include/openblas -fopenmp -lcblas
(or clang++)
NOTE:
- in main, comment out either the OpenMP or the BLAS block! Using both leads to degraded performance.
- on my TP X13 G3 with Intel i7-1260P, adding -march=native leads to lower performance!!
- other flags like -funroll-loops -flto -finline-functions did not seem to lead to significant performance increases
*/
#include <iostream> | |
#include <vector> | |
#include <random> | |
#include <chrono> | |
#include <cblas.h> | |
#include <omp.h> | |
/**
 * Serial dot product of two vectors.
 *
 * @param a  first operand; its length determines how many terms are summed
 * @param b  second operand; must hold at least a.size() elements
 *           (this is not checked)
 * @return   sum of a[i] * b[i] for i in [0, a.size()), accumulated left to
 *           right in double precision; 0.0 when a is empty
 */
double dot(std::vector<double> const &a, std::vector<double> const &b)
{
    double sum = 0.0;
    // Use the vector's own size type for the index: an int index would be
    // compared against an unsigned size and would overflow for vectors with
    // more than INT_MAX elements.
    const std::vector<double>::size_type n = a.size();
    for (std::vector<double>::size_type i = 0; i < n; ++i) {
        sum += a[i] * b[i];
    }
    return sum;
}
double dot_omp(std::vector<double> const &a, std::vector<double> const &b) | |
{ | |
double sum = 0.0; | |
int target_thread_num = 16; | |
omp_set_num_threads(target_thread_num); | |
#pragma omp parallel for reduction(+:sum) | |
for(int i = 0; i < a.size(); ++i) { | |
sum += a[i] * b[i]; | |
} | |
return sum; | |
} | |
double dot_blas(std::vector<double> const &a, std::vector<double> const &b) | |
{ | |
return cblas_ddot(a.size(), &a[0], 1, &b[0], 1); | |
} | |
/**
 * Build a vector of n doubles drawn from a uniform real distribution
 * constructed with the bounds (0.0, 1.0).
 *
 * A std::mt19937 engine is seeded once per call from std::random_device,
 * so successive calls produce different sequences.
 *
 * @param n  number of elements to generate
 * @return   vector of n random samples
 */
std::vector<double> create_random_vector(int n) {
    std::random_device seed_source;            // provides the seed for the engine
    std::mt19937 engine(seed_source());        // Mersenne Twister seeded from seed_source
    std::uniform_real_distribution<> unit_interval(0.0, 1.0);

    std::vector<double> samples(n, 0.0);
    for (double &value : samples) {
        value = unit_interval(engine);
    }
    return samples;
}
int main() { | |
int n = 100000000; | |
std::vector<double> a = create_random_vector(n); | |
std::vector<double> b = create_random_vector(n); | |
std::cout << "Vector size: " << a.size() << std::endl; | |
std::cout << "C++" << std::endl; | |
auto t_start = std::chrono::steady_clock::now(); | |
std::cout << "\tdot: " << dot(a, b) << std::endl; | |
auto t_stop = std::chrono::steady_clock::now(); | |
auto t_elapsed = t_stop - t_start; | |
std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl; | |
// OMP | |
/* | |
std::cout << "C++ w/ OpenMP" << std::endl; | |
t_start = std::chrono::steady_clock::now(); | |
std::cout << "\tdot: " << dot_omp(a, b) << std::endl; | |
t_stop = std::chrono::steady_clock::now(); | |
t_elapsed = t_stop - t_start; | |
std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl; | |
//*/ | |
// OpenBLAS | |
//* | |
std::cout << "C++ w/ OpenBLAS" << std::endl; | |
std::cout << "\tOpenBLAS threads: " << openblas_get_num_threads() << std::endl; | |
t_start = std::chrono::steady_clock::now(); | |
std::cout << "\tdot: " << dot_blas(a, b) << std::endl; | |
t_stop = std::chrono::steady_clock::now(); | |
t_elapsed = t_stop - t_start; | |
std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl; | |
//*/ | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import numpy | |
from numba import njit, prange | |
def dot_pure(a, b):
    """Return the dot product of ``a`` and ``b`` using a plain Python loop.

    The operands are paired with ``zip``, so iteration stops at the end of
    the shorter one. The accumulator starts at ``0.0`` and the products are
    added in order.
    """
    # Named ``total`` rather than ``sum`` so the builtin is not shadowed.
    total = 0.0
    for x, y in zip(a, b):
        total += x * y
    return total
@njit
def dot_numba(a, b):
    """Return the dot product of ``a`` and ``b``; compiled with ``@njit``.

    Same loop as the pure-Python version: the operands are paired with
    ``zip`` and the products are accumulated in order, starting from ``0.0``.
    """
    # Named ``total`` rather than ``sum`` so the builtin is not shadowed.
    total = 0.0
    for x, y in zip(a, b):
        total += x * y
    return total
@njit(parallel=True)
def dot_numba_parallel(a, b):
    """Return the dot product of ``a`` and ``b`` using a ``prange`` loop.

    Compiled with ``@njit(parallel=True)``. The loop runs over
    ``len(a)`` indices, so ``b`` must be at least as long as ``a``.
    """
    # Named ``total`` rather than ``sum`` so the builtin is not shadowed.
    # The accumulation is kept as a plain ``+=`` on a scalar inside the
    # prange loop.
    total = 0.0
    for i in prange(len(a)):
        total += a[i] * b[i]
    return total
def dot_numpy(a, b):
    """Return ``numpy.dot(a, b)``."""
    product = numpy.dot(a, b)
    return product
def create_random_vector(n):
    """Return a 1-D array of ``n`` random samples from ``numpy.random.rand``.

    ``numpy.random.rand`` already returns a float64 array, so no
    ``astype(numpy.float64)`` conversion is applied; by default that call
    would allocate and fill a second full-size copy of the data.
    """
    return numpy.random.rand(n)
if __name__ == "__main__":
    # Benchmark driver: time each dot-product implementation on the same two
    # random vectors and print the result and the elapsed wall-clock time.
    n = 100000000
    a = create_random_vector(n)
    b = create_random_vector(n)
    print("Vector size: ", a.size)

    # (label, implementation, warm_up). Implementations flagged with
    # warm_up are called once before timing to exclude compile time.
    benchmarks = [
        ("Python", dot_pure, False),
        ("NumPy", dot_numpy, False),
        ("Python+numba", dot_numba, True),
        ("Python+numba parallel", dot_numba_parallel, True),
    ]

    for label, func, warm_up in benchmarks:
        print(label)
        if warm_up:
            # call once to exclude compile time in timings
            func(a, b)
        # Time only the call itself; printing happens after the clock is
        # stopped so output formatting does not count towards the result.
        t_start = time.perf_counter_ns()
        result = func(a, b)
        t_stop = time.perf_counter_ns()
        print("\tdot: ", result)
        print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Some unrolled loops and removed `[]` operator calls: