matthieubulte · January 17, 2025 22:29
diff --git a/matmul.c b/matmul.c
 // gcc -O3 -g -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -lopenblas -march=native main.c -o main && ./main
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
 #include <arm_neon.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <cblas.h>

 #define N 1024

 // This seemed to be the best size, and then I optimized with this assumption since
 // it made it possible to use 16 registers for the C submatrix buffer
 #define BS 8

 static float A[N * N] __attribute__((aligned(128)));
 static float B[N * N] __attribute__((aligned(128)));
 static float C[N * N] __attribute__((aligned(128)));
 static float C_blas[N * N] __attribute__((aligned(128)));

 // To optimize register usage, use 16 (out of 32) simd registers
 // for the C submatrix buffer. 16 register of 4 floats each = 8x8 matrix
 static float32x4_t C_buff[BS][2];

 // The submatrix buffers for A and B are just smaller 8x8 matrices
 static float A_buff[BS * BS] __attribute__((aligned(128)));
 static float B_buff[BS * BS] __attribute__((aligned(128)));

 // Load a block of A into the A buffer
 inline void load_a_buff(int row, int col)
 {
    for (int i = 0; i < BS; i++)
    {
        vst1q_f32(&A_buff[i * BS], vld1q_f32(&A[(row + i) * N + (col)]));
        vst1q_f32(&A_buff[i * BS + 4], vld1q_f32(&A[(row + i) * N + (col + 4)]));
    }
 }

 // Load a block of B into the B buffer
 inline void load_b_buff(int row, int col)
 {
    for (int i = 0; i < BS; i++)
    {
        vst1q_f32(&B_buff[i * BS], vld1q_f32(&B[(col + i) * N + row]));
        vst1q_f32(&B_buff[i * BS + 4], vld1q_f32(&B[(col + i) * N + row + 4]));
    }
 }

 // Load a block of C into simd registers
 inline void load_c_buff(int i, int j)
 {
    for (int ii = 0; ii < BS; ii++)
    {
        C_buff[ii][0] = vld1q_f32(&C[(i + ii) * N + j]);
        C_buff[ii][1] = vld1q_f32(&C[(i + ii) * N + j + 4]);
    }
 }

 // Store the simd registers back into the C matrix
 inline void store_c_buff(int i, int j)
 {
    for (int ii = 0; ii < BS; ii++)
    {
        vst1q_f32(&C[(i + ii) * N + j], C_buff[ii][0]);
        vst1q_f32(&C[(i + ii) * N + j + 4], C_buff[ii][1]);
    }
 }

 inline void block_mul()
 {
    for (int k = 0; k < BS; k++)
    {
        // We unroll two levels of the loop to maximize the use of registers
        float32x4_t b0 = vld1q_f32(&B_buff[k * BS]);
        float32x4_t b1 = vld1q_f32(&B_buff[k * BS + 4]);

        // Sadly can't find a way to force the loop to be unrolled
        // it got me 3GFLOPS though. The point here is to maximally use
        // the registers
        float32x4_t a0 = vdupq_n_f32(A_buff[0 * BS + k]);
        float32x4_t a1 = vdupq_n_f32(A_buff[1 * BS + k]);
        float32x4_t a2 = vdupq_n_f32(A_buff[2 * BS + k]);
        float32x4_t a3 = vdupq_n_f32(A_buff[3 * BS + k]);
        float32x4_t a4 = vdupq_n_f32(A_buff[4 * BS + k]);
        float32x4_t a5 = vdupq_n_f32(A_buff[5 * BS + k]);
        float32x4_t a6 = vdupq_n_f32(A_buff[6 * BS + k]);
        float32x4_t a7 = vdupq_n_f32(A_buff[7 * BS + k]);

        C_buff[0][0] = vfmaq_f32(C_buff[0][0], a0, b0);
        C_buff[0][1] = vfmaq_f32(C_buff[0][1], a0, b1);

        C_buff[1][0] = vfmaq_f32(C_buff[1][0], a1, b0);
        C_buff[1][1] = vfmaq_f32(C_buff[1][1], a1, b1);

        C_buff[2][0] = vfmaq_f32(C_buff[2][0], a2, b0);
        C_buff[2][1] = vfmaq_f32(C_buff[2][1], a2, b1);

        C_buff[3][0] = vfmaq_f32(C_buff[3][0], a3, b0);
        C_buff[3][1] = vfmaq_f32(C_buff[3][1], a3, b1);

        C_buff[4][0] = vfmaq_f32(C_buff[4][0], a4, b0);
        C_buff[4][1] = vfmaq_f32(C_buff[4][1], a4, b1);

        C_buff[5][0] = vfmaq_f32(C_buff[5][0], a5, b0);
        C_buff[5][1] = vfmaq_f32(C_buff[5][1], a5, b1);

        C_buff[6][0] = vfmaq_f32(C_buff[6][0], a6, b0);
        C_buff[6][1] = vfmaq_f32(C_buff[6][1], a6, b1);

        C_buff[7][0] = vfmaq_f32(C_buff[7][0], a7, b0);
        C_buff[7][1] = vfmaq_f32(C_buff[7][1], a7, b1);
    }
 }

 // numpy single threaded: 90 GFLOPS/S
 // numpy multi threaded: 200 GFLOPS/S

 // 98.16ms         21.8765 GFLOPS/S -- baseline with the right order of loops
 // 44.50ms         48.2624 GFLOPS/S -- + vectorization
 // 35.47ms         60.5471 GFLOPS/S -- + block multiplication
 // 28.38ms         75.6662 GFLOPS/S -- + choosing the right lbock size (8)
 // 27.21ms         78.9313 GFLOPS/S -- + loop unrolling for register control
 // 31.08ms         69.0954 GFLOPS/S -- fix mistake in loading B
 void multiply()
 {
    // Simple algorighm that uses blocks of size BS
    // and tries to keep the same data as long as possible in registers
    for (int i = 0; i < N; i += BS)
    {
        for (int j = 0; j < N; j += BS)
        {
            load_c_buff(i, j);

            for (int k = 0; k < N; k += BS)
            {
                load_a_buff(i, k);
                load_b_buff(j, k);

                block_mul();
            }

            store_c_buff(i, j);
        }
    }
 }

 void benchmark()
 {
    clock_t start, end;
    float duration_ms;
    float flops = 2.0 * N * N * N;

    start = clock();
    multiply();
    end = clock();
    duration_ms = ((float)(end - start) * 1000.0) / CLOCKS_PER_SEC;

    
    printf("\t %.2fms \t|\t %.1f GFLOPS/S \n", duration_ms, flops / duration_ms / 1e6);
 }

 int compare_matrices(const float *A, const float *B, float tol)
 {
    for (int i = 0; i < N * N; i++)
    {
        if (fabsf(A[i] - B[i]) > tol)
        {
            return 0;
        }
    }
    return 1;
 }

 int main()
 {
    srand(time(NULL));

    // Check correctness
    for (int i = 0; i < N * N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                N, N, N, 1.0, A, N, B, N, 0.0, C_blas, N);
    benchmark();
    if (!compare_matrices(C, C_blas, 1e-5f))
    {
        printf("Results don't match - there might be an error in the implementation\n");
        return 0;
    }

    printf("\t Time (ours) \t|\t Perf (ours) \t|\n");
    printf("------------------------+-----------------------+\n");

    while (1)
    {
        for (int i = 0; i < N * N; i++)
        {
            A[i] = rand() % 10;
            B[i] = rand() % 10;
        }

        benchmark();
    }
    return 0;
 }
	// gcc -O3 -g -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -lopenblas -march=native main.c -o main && ./main
	#include <math.h>
	#include <stdio.h>
	#include <string.h>
	#include <time.h>
	#include <arm_neon.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <cblas.h>

	#define N 1024

	// This seemed to be the best size, and then I optimized with this assumption since
	// it made it possible to use 16 registers for the C submatrix buffer
	#define BS 8

	static float A[N * N] __attribute__((aligned(128)));
	static float B[N * N] __attribute__((aligned(128)));
	static float C[N * N] __attribute__((aligned(128)));
	static float C_blas[N * N] __attribute__((aligned(128)));

	// To optimize register usage, use 16 (out of 32) simd registers
	// for the C submatrix buffer. 16 register of 4 floats each = 8x8 matrix
	static float32x4_t C_buff[BS][2];

	// The submatrix buffers for A and B are just smaller 8x8 matrices
	static float A_buff[BS * BS] __attribute__((aligned(128)));
	static float B_buff[BS * BS] __attribute__((aligned(128)));

	// Load a block of A into the A buffer
	inline void load_a_buff(int row, int col)
	{
	for (int i = 0; i < BS; i++)
	{
	vst1q_f32(&A_buff[i * BS], vld1q_f32(&A[(row + i) * N + (col)]));
	vst1q_f32(&A_buff[i * BS + 4], vld1q_f32(&A[(row + i) * N + (col + 4)]));
	}
	}

	// Load a block of B into the B buffer
	inline void load_b_buff(int row, int col)
	{
	for (int i = 0; i < BS; i++)
	{
	vst1q_f32(&B_buff[i * BS], vld1q_f32(&B[(col + i) * N + row]));
	vst1q_f32(&B_buff[i * BS + 4], vld1q_f32(&B[(col + i) * N + row + 4]));
	}
	}

	// Load a block of C into simd registers
	inline void load_c_buff(int i, int j)
	{
	for (int ii = 0; ii < BS; ii++)
	{
	C_buff[ii][0] = vld1q_f32(&C[(i + ii) * N + j]);
	C_buff[ii][1] = vld1q_f32(&C[(i + ii) * N + j + 4]);
	}
	}

	// Store the simd registers back into the C matrix
	inline void store_c_buff(int i, int j)
	{
	for (int ii = 0; ii < BS; ii++)
	{
	vst1q_f32(&C[(i + ii) * N + j], C_buff[ii][0]);
	vst1q_f32(&C[(i + ii) * N + j + 4], C_buff[ii][1]);
	}
	}

	inline void block_mul()
	{
	for (int k = 0; k < BS; k++)
	{
	// We unroll two levels of the loop to maximize the use of registers
	float32x4_t b0 = vld1q_f32(&B_buff[k * BS]);
	float32x4_t b1 = vld1q_f32(&B_buff[k * BS + 4]);

	// Sadly can't find a way to force the loop to be unrolled
	// it got me 3GFLOPS though. The point here is to maximally use
	// the registers
	float32x4_t a0 = vdupq_n_f32(A_buff[0 * BS + k]);
	float32x4_t a1 = vdupq_n_f32(A_buff[1 * BS + k]);
	float32x4_t a2 = vdupq_n_f32(A_buff[2 * BS + k]);
	float32x4_t a3 = vdupq_n_f32(A_buff[3 * BS + k]);
	float32x4_t a4 = vdupq_n_f32(A_buff[4 * BS + k]);
	float32x4_t a5 = vdupq_n_f32(A_buff[5 * BS + k]);
	float32x4_t a6 = vdupq_n_f32(A_buff[6 * BS + k]);
	float32x4_t a7 = vdupq_n_f32(A_buff[7 * BS + k]);

	C_buff[0][0] = vfmaq_f32(C_buff[0][0], a0, b0);
	C_buff[0][1] = vfmaq_f32(C_buff[0][1], a0, b1);

	C_buff[1][0] = vfmaq_f32(C_buff[1][0], a1, b0);
	C_buff[1][1] = vfmaq_f32(C_buff[1][1], a1, b1);

	C_buff[2][0] = vfmaq_f32(C_buff[2][0], a2, b0);
	C_buff[2][1] = vfmaq_f32(C_buff[2][1], a2, b1);

	C_buff[3][0] = vfmaq_f32(C_buff[3][0], a3, b0);
	C_buff[3][1] = vfmaq_f32(C_buff[3][1], a3, b1);

	C_buff[4][0] = vfmaq_f32(C_buff[4][0], a4, b0);
	C_buff[4][1] = vfmaq_f32(C_buff[4][1], a4, b1);

	C_buff[5][0] = vfmaq_f32(C_buff[5][0], a5, b0);
	C_buff[5][1] = vfmaq_f32(C_buff[5][1], a5, b1);

	C_buff[6][0] = vfmaq_f32(C_buff[6][0], a6, b0);
	C_buff[6][1] = vfmaq_f32(C_buff[6][1], a6, b1);

	C_buff[7][0] = vfmaq_f32(C_buff[7][0], a7, b0);
	C_buff[7][1] = vfmaq_f32(C_buff[7][1], a7, b1);
	}
	}

	// numpy single threaded: 90 GFLOPS/S
	// numpy multi threaded: 200 GFLOPS/S

	// 98.16ms 21.8765 GFLOPS/S -- baseline with the right order of loops
	// 44.50ms 48.2624 GFLOPS/S -- + vectorization
	// 35.47ms 60.5471 GFLOPS/S -- + block multiplication
	// 28.38ms 75.6662 GFLOPS/S -- + choosing the right lbock size (8)
	// 27.21ms 78.9313 GFLOPS/S -- + loop unrolling for register control
	// 31.08ms 69.0954 GFLOPS/S -- fix mistake in loading B
	void multiply()
	{
	// Simple algorighm that uses blocks of size BS
	// and tries to keep the same data as long as possible in registers
	for (int i = 0; i < N; i += BS)
	{
	for (int j = 0; j < N; j += BS)
	{
	load_c_buff(i, j);

	for (int k = 0; k < N; k += BS)
	{
	load_a_buff(i, k);
	load_b_buff(j, k);

	block_mul();
	}

	store_c_buff(i, j);
	}
	}
	}

	void benchmark()
	{
	clock_t start, end;
	float duration_ms;
	float flops = 2.0 * N * N * N;

	start = clock();
	multiply();
	end = clock();
	duration_ms = ((float)(end - start) * 1000.0) / CLOCKS_PER_SEC;


	printf("\t %.2fms \t\|\t %.1f GFLOPS/S \n", duration_ms, flops / duration_ms / 1e6);
	}

	int compare_matrices(const float A, const float B, float tol)
	{
	for (int i = 0; i < N * N; i++)
	{
	if (fabsf(A[i] - B[i]) > tol)
	{
	return 0;
	}
	}
	return 1;
	}

	int main()
	{
	srand(time(NULL));

	// Check correctness
	for (int i = 0; i < N * N; i++)
	{
	A[i] = rand() % 10;
	B[i] = rand() % 10;
	}
	cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
	N, N, N, 1.0, A, N, B, N, 0.0, C_blas, N);
	benchmark();
	if (!compare_matrices(C, C_blas, 1e-5f))
	{
	printf("Results don't match - there might be an error in the implementation\n");
	return 0;
	}

	printf("\t Time (ours) \t\|\t Perf (ours) \t\|\n");
	printf("------------------------+-----------------------+\n");

	while (1)
	{
	for (int i = 0; i < N * N; i++)
	{
	A[i] = rand() % 10;
	B[i] = rand() % 10;
	}

	benchmark();
	}
	return 0;
	}