jamesy0ung · December 2, 2024 12:19
diff --git a/montecarloPiMultithreadedAVX2.cpp b/montecarloPiMultithreadedAVX2.cpp
 #include <string>
 #include <iostream>
 #include <thread>
 #include <random>
 #include <atomic>
 #include <vector>
 #include <iomanip>
 #include <chrono>
 #include <algorithm>
 #include <array>
 #include <immintrin.h>

 class PiCalculator {
 private:
    const long long iterations;
    const unsigned int nthreads;
    std::atomic<long long> successCount{ 0 };
    std::atomic<long long> totalCount{ 0 };
    std::atomic<bool> shouldStop{ false };
    std::chrono::steady_clock::time_point startTime;

    // Cache line padding to prevent false sharing
    struct alignas(64) PaddedCount {
        long long count;
        char padding[64 - sizeof(long long)];
    };
    std::vector<PaddedCount> threadLocalSuccess;
    std::vector<PaddedCount> threadLocalTotal;

    // Process points using AVX2
    int processBatchAVX2(std::mt19937& gen, std::uniform_real_distribution<float>& dis) {
        alignas(32) float x_vals[8];
        alignas(32) float y_vals[8];

        // Generate 8 random points
        for (int i = 0; i < 8; ++i) {
            x_vals[i] = dis(gen);
            y_vals[i] = dis(gen);
        }

        // Load values into AVX2 registers
        __m256 x = _mm256_load_ps(x_vals);
        __m256 y = _mm256_load_ps(y_vals);

        // Calculate x*x + y*y for all 8 points simultaneously
        __m256 x_squared = _mm256_mul_ps(x, x);
        __m256 y_squared = _mm256_mul_ps(y, y);
        __m256 sum = _mm256_add_ps(x_squared, y_squared);

        // Compare with 1.0 (points inside circle)
        __m256 one = _mm256_set1_ps(1.0f);
        __m256 mask = _mm256_cmp_ps(sum, one, _CMP_LE_OQ);

        // Convert mask to integer (each 32-bit lane will be all 1s or all 0s)
        int mask_int = _mm256_movemask_ps(mask);

        // Count set bits (points inside circle)
        return _mm_popcnt_u32(mask_int);
    }

    void task(int threadId, long long threadIterations) {
        // Thread-local RNG for better performance
        std::random_device rd;
        std::mt19937 gen(rd() ^ static_cast<unsigned int>(threadId));
        std::uniform_real_distribution<float> dis(0.0f, 1.0f); // Using float instead of double

        const int POINTS_PER_AVX2 = 8;  // Process 8 points per AVX2 operation
        const long long ATOMIC_BATCH = 1000000LL;

        long long localSuccess = 0;
        long long localTotal = 0;
        long long processedIterations = 0;

        while (processedIterations < threadIterations && !shouldStop) {
            // Calculate iterations for this round
            long long remainingIterations = threadIterations - processedIterations;
            long long iterationsThisRound = std::min(remainingIterations, ATOMIC_BATCH);

            // Process AVX2 batches
            long long avx2Batches = iterationsThisRound / POINTS_PER_AVX2;

            for (long long b = 0; b < avx2Batches; ++b) {
                int successInBatch = processBatchAVX2(gen, dis);
                localSuccess += successInBatch;
                localTotal += POINTS_PER_AVX2;
            }

            // Handle remaining points
            long long remainingPoints = iterationsThisRound % POINTS_PER_AVX2;
            for (long long i = 0; i < remainingPoints; ++i) {
                float x = dis(gen);
                float y = dis(gen);
                if (x * x + y * y <= 1.0f) {
                    localSuccess++;
                }
                localTotal++;
            }

            // Update thread-local counters
            threadLocalSuccess[threadId].count += localSuccess;
            threadLocalTotal[threadId].count += localTotal;

            // Update global counters
            successCount.fetch_add(localSuccess, std::memory_order_relaxed);
            totalCount.fetch_add(localTotal, std::memory_order_relaxed);

            processedIterations += localTotal;
            localSuccess = 0;
            localTotal = 0;
        }
    }

 public:
    PiCalculator(long long n) :
        iterations(n),
        nthreads(std::thread::hardware_concurrency()),
        threadLocalSuccess(nthreads),
        threadLocalTotal(nthreads) {
    }

    double calculate() {
        startTime = std::chrono::steady_clock::now();
        std::vector<std::thread> threads;
        threads.reserve(nthreads);

        // Calculate iterations per thread
        long long iterationsPerThread = iterations / nthreads;
        long long remainder = iterations % nthreads;

        // Launch threads
        for (unsigned int i = 0; i < nthreads; i++) {
            long long threadIterations = iterationsPerThread + (i < remainder ? 1 : 0);
            threads.emplace_back(&PiCalculator::task, this, i, threadIterations);
        }

        // Progress monitoring thread
        std::thread progressThread([this]() {
            while (totalCount < iterations && !shouldStop) {
                auto now = std::chrono::steady_clock::now();
                auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - startTime).count();
                if (duration > 0) {
                    double progress = static_cast<double>(totalCount) / iterations * 100.0;
                    double pointsPerSecond = static_cast<double>(totalCount) / duration;

                    std::cout << "\rProgress: " << std::fixed << std::setprecision(2)
                        << progress << "% ("
                        << std::setprecision(2) << pointsPerSecond / 1000000.0 << "M points/sec)"
                        << std::flush;
                }
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
            });

        // Join threads
        for (auto& thread : threads) {
            thread.join();
        }
        shouldStop = true;
        progressThread.join();

        // Calculate final result
        auto endTime = std::chrono::steady_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::seconds>(endTime - startTime).count();

        double pi = 4.0 * static_cast<double>(successCount) / static_cast<double>(totalCount);

        // Print results
        std::cout << "\nCalculation completed in " << duration << " seconds" << std::endl;
        std::cout << "Points processed: " << totalCount << std::endl;
        std::cout << "Points inside circle: " << successCount << std::endl;
        std::cout << "Approximated value of Pi: " << std::setprecision(10) << pi << std::endl;
        std::cout << "Points per second: " << std::fixed << std::setprecision(2)
            << static_cast<double>(totalCount) / duration / 1000000.0 << "M" << std::endl;

        return pi;
    }
 };

 int main() {
    const long long iterations = 100000000000LL;
    try {
        PiCalculator calculator(iterations);
        calculator.calculate();
    }
    catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
 }
	#include <string>
	#include <iostream>
	#include <thread>
	#include <random>
	#include <atomic>
	#include <vector>
	#include <iomanip>
	#include <chrono>
	#include <algorithm>
	#include <array>
	#include <immintrin.h>

	class PiCalculator {
	private:
	const long long iterations;
	const unsigned int nthreads;
	std::atomic<long long> successCount{ 0 };
	std::atomic<long long> totalCount{ 0 };
	std::atomic<bool> shouldStop{ false };
	std::chrono::steady_clock::time_point startTime;

	// Cache line padding to prevent false sharing
	struct alignas(64) PaddedCount {
	long long count;
	char padding[64 - sizeof(long long)];
	};
	std::vector<PaddedCount> threadLocalSuccess;
	std::vector<PaddedCount> threadLocalTotal;

	// Process points using AVX2
	int processBatchAVX2(std::mt19937& gen, std::uniform_real_distribution<float>& dis) {
	alignas(32) float x_vals[8];
	alignas(32) float y_vals[8];

	// Generate 8 random points
	for (int i = 0; i < 8; ++i) {
	x_vals[i] = dis(gen);
	y_vals[i] = dis(gen);
	}

	// Load values into AVX2 registers
	__m256 x = _mm256_load_ps(x_vals);
	__m256 y = _mm256_load_ps(y_vals);

	// Calculate xx + yy for all 8 points simultaneously
	__m256 x_squared = _mm256_mul_ps(x, x);
	__m256 y_squared = _mm256_mul_ps(y, y);
	__m256 sum = _mm256_add_ps(x_squared, y_squared);

	// Compare with 1.0 (points inside circle)
	__m256 one = _mm256_set1_ps(1.0f);
	__m256 mask = _mm256_cmp_ps(sum, one, _CMP_LE_OQ);

	// Convert mask to integer (each 32-bit lane will be all 1s or all 0s)
	int mask_int = _mm256_movemask_ps(mask);

	// Count set bits (points inside circle)
	return _mm_popcnt_u32(mask_int);
	}

	void task(int threadId, long long threadIterations) {
	// Thread-local RNG for better performance
	std::random_device rd;
	std::mt19937 gen(rd() ^ static_cast<unsigned int>(threadId));
	std::uniform_real_distribution<float> dis(0.0f, 1.0f); // Using float instead of double

	const int POINTS_PER_AVX2 = 8; // Process 8 points per AVX2 operation
	const long long ATOMIC_BATCH = 1000000LL;

	long long localSuccess = 0;
	long long localTotal = 0;
	long long processedIterations = 0;

	while (processedIterations < threadIterations && !shouldStop) {
	// Calculate iterations for this round
	long long remainingIterations = threadIterations - processedIterations;
	long long iterationsThisRound = std::min(remainingIterations, ATOMIC_BATCH);

	// Process AVX2 batches
	long long avx2Batches = iterationsThisRound / POINTS_PER_AVX2;

	for (long long b = 0; b < avx2Batches; ++b) {
	int successInBatch = processBatchAVX2(gen, dis);
	localSuccess += successInBatch;
	localTotal += POINTS_PER_AVX2;
	}

	// Handle remaining points
	long long remainingPoints = iterationsThisRound % POINTS_PER_AVX2;
	for (long long i = 0; i < remainingPoints; ++i) {
	float x = dis(gen);
	float y = dis(gen);
	if (x * x + y * y <= 1.0f) {
	localSuccess++;
	}
	localTotal++;
	}

	// Update thread-local counters
	threadLocalSuccess[threadId].count += localSuccess;
	threadLocalTotal[threadId].count += localTotal;

	// Update global counters
	successCount.fetch_add(localSuccess, std::memory_order_relaxed);
	totalCount.fetch_add(localTotal, std::memory_order_relaxed);

	processedIterations += localTotal;
	localSuccess = 0;
	localTotal = 0;
	}
	}

	public:
	PiCalculator(long long n) :
	iterations(n),
	nthreads(std::thread::hardware_concurrency()),
	threadLocalSuccess(nthreads),
	threadLocalTotal(nthreads) {
	}

	double calculate() {
	startTime = std::chrono::steady_clock::now();
	std::vector<std::thread> threads;
	threads.reserve(nthreads);

	// Calculate iterations per thread
	long long iterationsPerThread = iterations / nthreads;
	long long remainder = iterations % nthreads;

	// Launch threads
	for (unsigned int i = 0; i < nthreads; i++) {
	long long threadIterations = iterationsPerThread + (i < remainder ? 1 : 0);
	threads.emplace_back(&PiCalculator::task, this, i, threadIterations);
	}

	// Progress monitoring thread
	std::thread progressThread([this]() {
	while (totalCount < iterations && !shouldStop) {
	auto now = std::chrono::steady_clock::now();
	auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - startTime).count();
	if (duration > 0) {
	double progress = static_cast<double>(totalCount) / iterations * 100.0;
	double pointsPerSecond = static_cast<double>(totalCount) / duration;

	std::cout << "\rProgress: " << std::fixed << std::setprecision(2)
	<< progress << "% ("
	<< std::setprecision(2) << pointsPerSecond / 1000000.0 << "M points/sec)"
	<< std::flush;
	}
	std::this_thread::sleep_for(std::chrono::milliseconds(100));
	}
	});

	// Join threads
	for (auto& thread : threads) {
	thread.join();
	}
	shouldStop = true;
	progressThread.join();

	// Calculate final result
	auto endTime = std::chrono::steady_clock::now();
	auto duration = std::chrono::duration_cast<std::chrono::seconds>(endTime - startTime).count();

	double pi = 4.0 * static_cast<double>(successCount) / static_cast<double>(totalCount);

	// Print results
	std::cout << "\nCalculation completed in " << duration << " seconds" << std::endl;
	std::cout << "Points processed: " << totalCount << std::endl;
	std::cout << "Points inside circle: " << successCount << std::endl;
	std::cout << "Approximated value of Pi: " << std::setprecision(10) << pi << std::endl;
	std::cout << "Points per second: " << std::fixed << std::setprecision(2)
	<< static_cast<double>(totalCount) / duration / 1000000.0 << "M" << std::endl;

	return pi;
	}
	};

	int main() {
	const long long iterations = 100000000000LL;
	try {
	PiCalculator calculator(iterations);
	calculator.calculate();
	}
	catch (const std::exception& e) {
	std::cerr << "Error: " << e.what() << std::endl;
	return 1;
	}
	return 0;
	}