Last active
January 17, 2025 22:29
-
-
Save matthieubulte/2bdf5fec301895f1a9d80753e2a50be9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// gcc -O3 -g -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -lopenblas -march=native main.c -o main && ./main | |
#include <math.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <time.h> | |
#include <arm_neon.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <cblas.h> | |
#define N 1024 | |
// This seemed to be the best size, and then I optimized with this assumption since | |
// it made it possible to use 16 registers for the C submatrix buffer | |
#define BS 8 | |
static float A[N * N] __attribute__((aligned(128))); | |
static float B[N * N] __attribute__((aligned(128))); | |
static float C[N * N] __attribute__((aligned(128))); | |
static float C_blas[N * N] __attribute__((aligned(128))); | |
// To optimize register usage, use 16 (out of 32) simd registers | |
// for the C submatrix buffer. 16 register of 4 floats each = 8x8 matrix | |
static float32x4_t C_buff[BS][2]; | |
// The submatrix buffers for A and B are just smaller 8x8 matrices | |
static float A_buff[BS * BS] __attribute__((aligned(128))); | |
static float B_buff[BS * BS] __attribute__((aligned(128))); | |
// Load a block of A into the A buffer | |
inline void load_a_buff(int row, int col) | |
{ | |
for (int i = 0; i < BS; i++) | |
{ | |
vst1q_f32(&A_buff[i * BS], vld1q_f32(&A[(row + i) * N + (col)])); | |
vst1q_f32(&A_buff[i * BS + 4], vld1q_f32(&A[(row + i) * N + (col + 4)])); | |
} | |
} | |
// Load a block of B into the B buffer | |
inline void load_b_buff(int row, int col) | |
{ | |
for (int i = 0; i < BS; i++) | |
{ | |
vst1q_f32(&B_buff[i * BS], vld1q_f32(&B[(col + i) * N + row])); | |
vst1q_f32(&B_buff[i * BS + 4], vld1q_f32(&B[(col + i) * N + row + 4])); | |
} | |
} | |
// Load a block of C into simd registers | |
inline void load_c_buff(int i, int j) | |
{ | |
for (int ii = 0; ii < BS; ii++) | |
{ | |
C_buff[ii][0] = vld1q_f32(&C[(i + ii) * N + j]); | |
C_buff[ii][1] = vld1q_f32(&C[(i + ii) * N + j + 4]); | |
} | |
} | |
// Store the simd registers back into the C matrix | |
inline void store_c_buff(int i, int j) | |
{ | |
for (int ii = 0; ii < BS; ii++) | |
{ | |
vst1q_f32(&C[(i + ii) * N + j], C_buff[ii][0]); | |
vst1q_f32(&C[(i + ii) * N + j + 4], C_buff[ii][1]); | |
} | |
} | |
inline void block_mul() | |
{ | |
for (int k = 0; k < BS; k++) | |
{ | |
// We unroll two levels of the loop to maximize the use of registers | |
float32x4_t b0 = vld1q_f32(&B_buff[k * BS]); | |
float32x4_t b1 = vld1q_f32(&B_buff[k * BS + 4]); | |
// Sadly can't find a way to force the loop to be unrolled | |
// it got me 3GFLOPS though. The point here is to maximally use | |
// the registers | |
float32x4_t a0 = vdupq_n_f32(A_buff[0 * BS + k]); | |
float32x4_t a1 = vdupq_n_f32(A_buff[1 * BS + k]); | |
float32x4_t a2 = vdupq_n_f32(A_buff[2 * BS + k]); | |
float32x4_t a3 = vdupq_n_f32(A_buff[3 * BS + k]); | |
float32x4_t a4 = vdupq_n_f32(A_buff[4 * BS + k]); | |
float32x4_t a5 = vdupq_n_f32(A_buff[5 * BS + k]); | |
float32x4_t a6 = vdupq_n_f32(A_buff[6 * BS + k]); | |
float32x4_t a7 = vdupq_n_f32(A_buff[7 * BS + k]); | |
C_buff[0][0] = vfmaq_f32(C_buff[0][0], a0, b0); | |
C_buff[0][1] = vfmaq_f32(C_buff[0][1], a0, b1); | |
C_buff[1][0] = vfmaq_f32(C_buff[1][0], a1, b0); | |
C_buff[1][1] = vfmaq_f32(C_buff[1][1], a1, b1); | |
C_buff[2][0] = vfmaq_f32(C_buff[2][0], a2, b0); | |
C_buff[2][1] = vfmaq_f32(C_buff[2][1], a2, b1); | |
C_buff[3][0] = vfmaq_f32(C_buff[3][0], a3, b0); | |
C_buff[3][1] = vfmaq_f32(C_buff[3][1], a3, b1); | |
C_buff[4][0] = vfmaq_f32(C_buff[4][0], a4, b0); | |
C_buff[4][1] = vfmaq_f32(C_buff[4][1], a4, b1); | |
C_buff[5][0] = vfmaq_f32(C_buff[5][0], a5, b0); | |
C_buff[5][1] = vfmaq_f32(C_buff[5][1], a5, b1); | |
C_buff[6][0] = vfmaq_f32(C_buff[6][0], a6, b0); | |
C_buff[6][1] = vfmaq_f32(C_buff[6][1], a6, b1); | |
C_buff[7][0] = vfmaq_f32(C_buff[7][0], a7, b0); | |
C_buff[7][1] = vfmaq_f32(C_buff[7][1], a7, b1); | |
} | |
} | |
// numpy single threaded: 90 GFLOPS/S | |
// numpy multi threaded: 200 GFLOPS/S | |
// 98.16ms 21.8765 GFLOPS/S -- baseline with the right order of loops | |
// 44.50ms 48.2624 GFLOPS/S -- + vectorization | |
// 35.47ms 60.5471 GFLOPS/S -- + block multiplication | |
// 28.38ms 75.6662 GFLOPS/S -- + choosing the right lbock size (8) | |
// 27.21ms 78.9313 GFLOPS/S -- + loop unrolling for register control | |
// 31.08ms 69.0954 GFLOPS/S -- fix mistake in loading B | |
void multiply() | |
{ | |
// Simple algorighm that uses blocks of size BS | |
// and tries to keep the same data as long as possible in registers | |
for (int i = 0; i < N; i += BS) | |
{ | |
for (int j = 0; j < N; j += BS) | |
{ | |
load_c_buff(i, j); | |
for (int k = 0; k < N; k += BS) | |
{ | |
load_a_buff(i, k); | |
load_b_buff(j, k); | |
block_mul(); | |
} | |
store_c_buff(i, j); | |
} | |
} | |
} | |
void benchmark() | |
{ | |
clock_t start, end; | |
float duration_ms; | |
float flops = 2.0 * N * N * N; | |
start = clock(); | |
multiply(); | |
end = clock(); | |
duration_ms = ((float)(end - start) * 1000.0) / CLOCKS_PER_SEC; | |
printf("\t %.2fms \t|\t %.1f GFLOPS/S \n", duration_ms, flops / duration_ms / 1e6); | |
} | |
int compare_matrices(const float *A, const float *B, float tol) | |
{ | |
for (int i = 0; i < N * N; i++) | |
{ | |
if (fabsf(A[i] - B[i]) > tol) | |
{ | |
return 0; | |
} | |
} | |
return 1; | |
} | |
int main() | |
{ | |
srand(time(NULL)); | |
// Check correctness | |
for (int i = 0; i < N * N; i++) | |
{ | |
A[i] = rand() % 10; | |
B[i] = rand() % 10; | |
} | |
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, | |
N, N, N, 1.0, A, N, B, N, 0.0, C_blas, N); | |
benchmark(); | |
if (!compare_matrices(C, C_blas, 1e-5f)) | |
{ | |
printf("Results don't match - there might be an error in the implementation\n"); | |
return 0; | |
} | |
printf("\t Time (ours) \t|\t Perf (ours) \t|\n"); | |
printf("------------------------+-----------------------+\n"); | |
while (1) | |
{ | |
for (int i = 0; i < N * N; i++) | |
{ | |
A[i] = rand() % 10; | |
B[i] = rand() % 10; | |
} | |
benchmark(); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment