sansmoraxz · October 10, 2024 05:17
diff --git a/benchmark_lmul.cpp b/benchmark_lmul.cpp
 #include <algorithm>
 #include <chrono>
 #include <cstddef>
 #include <cstring>
 #include <iomanip>
 #include <iostream>
 #include <random>
 #include <vector>

 using namespace std;

 // Constant that lmul() and lmul_asm() ADD to the sum of the two operands'
 // 32-bit patterns (despite the name it is never used as an AND mask).
 // Read as a two's-complement value it is -0x3F780000, which equals
 // -0x3F800000 + 0x00080000: 0x3F800000 is 127 << 23 (the bit pattern of 1.0f)
 // and 0x00080000 is 2^19, i.e. 2^-4 of the 23-bit mantissa range.
 // NOTE(review): the 2^-4 term presumably is the paper's correction term — confirm.
 const u_int32_t FLOAT32_EXPONENT_MASK = 0xC0880000; // -0x3F780000

 // Baseline C++ implementation from the paper.
 //
 // Computes an approximate product with integer addition only: the raw 32-bit
 // patterns of x and y are summed together with FLOAT32_EXPONENT_MASK (unsigned
 // arithmetic, so the sum wraps modulo 2^32) and the resulting pattern is
 // returned as a float.
 //
 // @param x first factor
 // @param y second factor
 // @return the value whose bit pattern is bits(x) + bits(y) + FLOAT32_EXPONENT_MASK
 _Float32 lmul(_Float32 x, _Float32 y)
 {
    static_assert(sizeof(_Float32) == sizeof(u_int32_t), "lmul expects a 32-bit float type");

    // Copy the object representations with memcpy: reading a float object
    // through a u_int32_t pointer violates the strict-aliasing rule.
    u_int32_t x_bits;
    u_int32_t y_bits;
    std::memcpy(&x_bits, &x, sizeof x_bits);
    std::memcpy(&y_bits, &y, sizeof y_bits);

    const u_int32_t result_bits = x_bits + y_bits + FLOAT32_EXPONENT_MASK;

    _Float32 s;
    std::memcpy(&s, &result_bits, sizeof s);
    return s;
 }

 // x86 inline-assembly variant of lmul, kept for comparison.
 //
 // Executes the same integer arithmetic on the operands' raw bits:
 //   eax = bits(x); ebx = bits(y); eax += ebx; eax += FLOAT32_EXPONENT_MASK
 // and stores eax as the bit pattern of the returned value.
 _Float32 lmul_asm(_Float32 x, _Float32 y)
 {
    _Float32 product;

    asm(
        "movl %[lhs], %%eax\n\t"    // eax <- bits of x
        "movl %[rhs], %%ebx\n\t"    // ebx <- bits of y
        "addl %%ebx, %%eax\n\t"     // eax += ebx
        "addl %[offset], %%eax\n\t" // eax += FLOAT32_EXPONENT_MASK
        "movl %%eax, %[out]"        // product <- eax
        : [out] "=m"(product)
        : [lhs] "m"(x), [rhs] "m"(y), [offset] "r"(FLOAT32_EXPONENT_MASK)
        : "%eax", "%ebx");

    return product;
 }

 // Draws a pseudo-random float uniformly from [min, max).
 //
 // The Mersenne Twister engine is seeded from std::random_device once per
 // thread and reused afterwards, instead of being rebuilt and reseeded on
 // every call. The distribution works in float so the result is not produced
 // by narrowing a double (which could round up to max).
 //
 // @param min lower bound of the range (inclusive)
 // @param max upper bound of the range; must satisfy min <= max
 // @return a random value in the requested range
 float generateRandomFloat(float min, float max)
 {
    static thread_local std::mt19937 engine{std::random_device{}()};
    std::uniform_real_distribution<float> dist(min, max);
    return dist(engine);
 }

 // Times repeated calls of a binary float function over paired inputs.
 //
 // Calls f(x_values[i], y_values[i]) for i = 0 .. count-1, where count is
 // `iterations` clamped to the size of the shorter input vector (and to 0 when
 // `iterations` is negative), so the loop never reads past the end of a vector.
 //
 // @param iterations requested number of calls
 // @param f          function under test
 // @param x_values   first operands (taken by const reference; not copied)
 // @param y_values   second operands (taken by const reference; not copied)
 // @return elapsed time of the call loop in seconds
 double benchmarkFunction(int iterations, _Float32 (*f)(_Float32, _Float32), const std::vector<_Float32> &x_values, const std::vector<_Float32> &y_values)
 {
    const std::size_t available = std::min(x_values.size(), y_values.size());
    const std::size_t count = iterations > 0 ? std::min(static_cast<std::size_t>(iterations), available) : 0;

    volatile _Float32 sink = 0.0f; // volatile store keeps each call from being optimized away

    // steady_clock is monotonic; high_resolution_clock may alias the adjustable system clock.
    const auto start = std::chrono::steady_clock::now();
    for (std::size_t i = 0; i < count; ++i)
    {
        sink = f(x_values[i], y_values[i]);
    }
    const auto end = std::chrono::steady_clock::now();

    return std::chrono::duration<double>(end - start).count();
 }

 // Entry point. Fills two vectors with random operands in [-2, 2), times the
 // built-in multiplication, lmul_asm and lmul over them `times` times, and then
 // prints a table comparing x*y, lmul_asm(x, y) and lmul(x, y) for ten fresh
 // random pairs in [-1, 1).
 int main()
 {
    const int iterations = 500; // Number of iterations for benchmarking
    const int times = 5;        // Number of times the three benchmarks are repeated

    cout << "Iterations per benchmark: " << iterations << endl;

    cout << "Initializing random numbers..." << endl;
    vector<_Float32> x_values(iterations);
    vector<_Float32> y_values(iterations);
    for (int i = 0; i < iterations; ++i)
    {
        x_values[i] = generateRandomFloat(-2.0, 2.0);
        y_values[i] = generateRandomFloat(-2.0, 2.0);
    }

    for (int i = 0; i < times; i++)
    {
        cout << "Iteration: " << i << " of " << times << endl;
        // Benchmark multiplication operator
        double timeMultiplicationOperator = benchmarkFunction(iterations, [](_Float32 x, _Float32 y) -> _Float32
                                                              { return x * y; }, x_values, y_values);
        cout << "Time taken by multiplication operator: " << timeMultiplicationOperator << " seconds" << endl;

        // Benchmark lmul_asm function
        double timeLmulFunction = benchmarkFunction(iterations, lmul_asm, x_values, y_values);
        cout << "Time taken by lmul_asm function: " << timeLmulFunction << " seconds" << endl;

        // Benchmark lmul function
        double timeLmulFunction2 = benchmarkFunction(iterations, lmul, x_values, y_values);
        cout << "Time taken by lmul function: " << timeLmulFunction2 << " seconds" << endl;
    }

    // Validate results with random numbers: header row, separator, then ten samples.
    cout << left << setw(20) << "x"
         << left << setw(20) << "y"
         << left << setw(20) << "x*y"
         << left << setw(20) << "lmul_asm(x, y)"
         << left << setw(20) << "lmul(x, y)" << endl;
    cout << string(100, '-') << endl;
    for (int i = 0; i < 10; ++i)
    {
        float x = generateRandomFloat(-1.0, 1.0);
        float y = generateRandomFloat(-1.0, 1.0);
        cout << left << setw(20) << x
             << left << setw(20) << y
             << left << setw(20) << x * y
             << left << setw(20) << lmul_asm(x, y)
             << left << setw(20) << lmul(x, y) << endl;
    }

    return 0;
 }
	#include <iostream>
	#include <random>
	#include <chrono>
	#include <vector>
	#include <iomanip>

	using namespace std;

	// Constant that lmul() and lmul_asm() ADD to the sum of the two operands'
	// 32-bit patterns (despite the name it is never used as an AND mask).
	// Read as a two's-complement value it is -0x3F780000, which equals
	// -0x3F800000 + 0x00080000: 0x3F800000 is 127 << 23 (the bit pattern of 1.0f)
	// and 0x00080000 is 2^19, i.e. 2^-4 of the 23-bit mantissa range.
	// NOTE(review): the 2^-4 term presumably is the paper's correction term — confirm.
	const u_int32_t FLOAT32_EXPONENT_MASK = 0xC0880000; // -0x3F780000

	// Baseline C++ implementation from the paper.
	//
	// Computes an approximate product with integer addition only: the raw 32-bit
	// patterns of x and y are summed together with FLOAT32_EXPONENT_MASK (unsigned
	// arithmetic, so the sum wraps modulo 2^32) and the resulting pattern is
	// returned as a float.
	//
	// @param x first factor
	// @param y second factor
	// @return the value whose bit pattern is bits(x) + bits(y) + FLOAT32_EXPONENT_MASK
	_Float32 lmul(_Float32 x, _Float32 y)
	{
	    static_assert(sizeof(_Float32) == sizeof(u_int32_t), "lmul expects a 32-bit float type");

	    // Operate on the operands' bit patterns, not on their addresses. memcpy is
	    // used because reading a float object through a u_int32_t pointer would
	    // violate the strict-aliasing rule.
	    u_int32_t x_bits;
	    u_int32_t y_bits;
	    std::memcpy(&x_bits, &x, sizeof x_bits);
	    std::memcpy(&y_bits, &y, sizeof y_bits);

	    const u_int32_t result_bits = x_bits + y_bits + FLOAT32_EXPONENT_MASK;

	    _Float32 s;
	    std::memcpy(&s, &result_bits, sizeof s);
	    return s;
	}

	// x86 inline-assembly variant of lmul, kept for comparison.
	//
	// Executes the same integer arithmetic on the operands' raw bits:
	//   eax = bits(x); ebx = bits(y); eax += ebx; eax += FLOAT32_EXPONENT_MASK
	// and stores eax as the bit pattern of the returned value.
	_Float32 lmul_asm(_Float32 x, _Float32 y)
	{
	    _Float32 product;

	    asm(
	        "movl %[lhs], %%eax\n\t"    // eax <- bits of x
	        "movl %[rhs], %%ebx\n\t"    // ebx <- bits of y
	        "addl %%ebx, %%eax\n\t"     // eax += ebx
	        "addl %[offset], %%eax\n\t" // eax += FLOAT32_EXPONENT_MASK
	        "movl %%eax, %[out]"        // product <- eax
	        : [out] "=m"(product)
	        : [lhs] "m"(x), [rhs] "m"(y), [offset] "r"(FLOAT32_EXPONENT_MASK)
	        : "%eax", "%ebx");

	    return product;
	}

	// Draws a pseudo-random float uniformly from [min, max).
	//
	// The Mersenne Twister engine is seeded from std::random_device once per
	// thread and reused afterwards, instead of being rebuilt and reseeded on
	// every call. The distribution works in float so the result is not produced
	// by narrowing a double (which could round up to max).
	//
	// @param min lower bound of the range (inclusive)
	// @param max upper bound of the range; must satisfy min <= max
	// @return a random value in the requested range
	float generateRandomFloat(float min, float max)
	{
	    static thread_local std::mt19937 engine{std::random_device{}()};
	    std::uniform_real_distribution<float> dist(min, max);
	    return dist(engine);
	}

	// Times repeated calls of a binary float function over paired inputs.
	//
	// Calls f(x_values[i], y_values[i]) for i = 0 .. count-1, where count is
	// `iterations` clamped to the size of the shorter input vector (and to 0 when
	// `iterations` is negative), so the loop never reads past the end of a vector.
	//
	// @param iterations requested number of calls
	// @param f          function under test
	// @param x_values   first operands (taken by const reference; not copied)
	// @param y_values   second operands (taken by const reference; not copied)
	// @return elapsed time of the call loop in seconds
	double benchmarkFunction(int iterations, _Float32 (*f)(_Float32, _Float32), const std::vector<_Float32> &x_values, const std::vector<_Float32> &y_values)
	{
	    const std::size_t available = std::min(x_values.size(), y_values.size());
	    const std::size_t count = iterations > 0 ? std::min(static_cast<std::size_t>(iterations), available) : 0;

	    volatile _Float32 sink = 0.0f; // volatile store keeps each call from being optimized away

	    // steady_clock is monotonic; high_resolution_clock may alias the adjustable system clock.
	    const auto start = std::chrono::steady_clock::now();
	    for (std::size_t i = 0; i < count; ++i)
	    {
	        sink = f(x_values[i], y_values[i]);
	    }
	    const auto end = std::chrono::steady_clock::now();

	    return std::chrono::duration<double>(end - start).count();
	}

	// Entry point. Fills two vectors with random operands in [-2, 2), times the
	// built-in multiplication, lmul_asm and lmul over them `times` times, and then
	// prints a table comparing x*y, lmul_asm(x, y) and lmul(x, y) for ten fresh
	// random pairs in [-1, 1).
	int main()
	{
	    const int iterations = 500; // Number of iterations for benchmarking
	    const int times = 5;        // Number of times the three benchmarks are repeated

	    cout << "Iterations per benchmark: " << iterations << endl;

	    cout << "Initializing random numbers..." << endl;
	    vector<_Float32> x_values(iterations);
	    vector<_Float32> y_values(iterations);
	    for (int i = 0; i < iterations; ++i)
	    {
	        x_values[i] = generateRandomFloat(-2.0, 2.0);
	        y_values[i] = generateRandomFloat(-2.0, 2.0);
	    }

	    for (int i = 0; i < times; i++)
	    {
	        cout << "Iteration: " << i << " of " << times << endl;
	        // Benchmark multiplication operator
	        double timeMultiplicationOperator = benchmarkFunction(iterations, [](_Float32 x, _Float32 y) -> _Float32
	                                                              { return x * y; }, x_values, y_values);
	        cout << "Time taken by multiplication operator: " << timeMultiplicationOperator << " seconds" << endl;

	        // Benchmark lmul_asm function
	        double timeLmulFunction = benchmarkFunction(iterations, lmul_asm, x_values, y_values);
	        cout << "Time taken by lmul_asm function: " << timeLmulFunction << " seconds" << endl;

	        // Benchmark lmul function
	        double timeLmulFunction2 = benchmarkFunction(iterations, lmul, x_values, y_values);
	        cout << "Time taken by lmul function: " << timeLmulFunction2 << " seconds" << endl;
	    }

	    // Validate results with random numbers: header row, separator, then ten samples.
	    cout << left << setw(20) << "x"
	         << left << setw(20) << "y"
	         << left << setw(20) << "x*y"
	         << left << setw(20) << "lmul_asm(x, y)"
	         << left << setw(20) << "lmul(x, y)" << endl;
	    cout << string(100, '-') << endl;
	    for (int i = 0; i < 10; ++i)
	    {
	        float x = generateRandomFloat(-1.0, 1.0);
	        float y = generateRandomFloat(-1.0, 1.0);
	        cout << left << setw(20) << x
	             << left << setw(20) << y
	             << left << setw(20) << x * y
	             << left << setw(20) << lmul_asm(x, y)
	             << left << setw(20) << lmul(x, y) << endl;
	    }

	    return 0;
	}