Skip to content

Instantly share code, notes, and snippets.

@jamesy0ung
Created December 2, 2024 12:19
Show Gist options
  • Save jamesy0ung/84e2c829839acd99b11683212f01ce0f to your computer and use it in GitHub Desktop.
Save jamesy0ung/84e2c829839acd99b11683212f01ce0f to your computer and use it in GitHub Desktop.
#include <string>
#include <iostream>
#include <thread>
#include <random>
#include <atomic>
#include <vector>
#include <iomanip>
#include <chrono>
#include <algorithm>
#include <array>
#include <immintrin.h>
class PiCalculator {
private:
const long long iterations;
const unsigned int nthreads;
std::atomic<long long> successCount{ 0 };
std::atomic<long long> totalCount{ 0 };
std::atomic<bool> shouldStop{ false };
std::chrono::steady_clock::time_point startTime;
// Cache line padding to prevent false sharing
struct alignas(64) PaddedCount {
long long count;
char padding[64 - sizeof(long long)];
};
std::vector<PaddedCount> threadLocalSuccess;
std::vector<PaddedCount> threadLocalTotal;
// Process points using AVX2
int processBatchAVX2(std::mt19937& gen, std::uniform_real_distribution<float>& dis) {
alignas(32) float x_vals[8];
alignas(32) float y_vals[8];
// Generate 8 random points
for (int i = 0; i < 8; ++i) {
x_vals[i] = dis(gen);
y_vals[i] = dis(gen);
}
// Load values into AVX2 registers
__m256 x = _mm256_load_ps(x_vals);
__m256 y = _mm256_load_ps(y_vals);
// Calculate x*x + y*y for all 8 points simultaneously
__m256 x_squared = _mm256_mul_ps(x, x);
__m256 y_squared = _mm256_mul_ps(y, y);
__m256 sum = _mm256_add_ps(x_squared, y_squared);
// Compare with 1.0 (points inside circle)
__m256 one = _mm256_set1_ps(1.0f);
__m256 mask = _mm256_cmp_ps(sum, one, _CMP_LE_OQ);
// Convert mask to integer (each 32-bit lane will be all 1s or all 0s)
int mask_int = _mm256_movemask_ps(mask);
// Count set bits (points inside circle)
return _mm_popcnt_u32(mask_int);
}
void task(int threadId, long long threadIterations) {
// Thread-local RNG for better performance
std::random_device rd;
std::mt19937 gen(rd() ^ static_cast<unsigned int>(threadId));
std::uniform_real_distribution<float> dis(0.0f, 1.0f); // Using float instead of double
const int POINTS_PER_AVX2 = 8; // Process 8 points per AVX2 operation
const long long ATOMIC_BATCH = 1000000LL;
long long localSuccess = 0;
long long localTotal = 0;
long long processedIterations = 0;
while (processedIterations < threadIterations && !shouldStop) {
// Calculate iterations for this round
long long remainingIterations = threadIterations - processedIterations;
long long iterationsThisRound = std::min(remainingIterations, ATOMIC_BATCH);
// Process AVX2 batches
long long avx2Batches = iterationsThisRound / POINTS_PER_AVX2;
for (long long b = 0; b < avx2Batches; ++b) {
int successInBatch = processBatchAVX2(gen, dis);
localSuccess += successInBatch;
localTotal += POINTS_PER_AVX2;
}
// Handle remaining points
long long remainingPoints = iterationsThisRound % POINTS_PER_AVX2;
for (long long i = 0; i < remainingPoints; ++i) {
float x = dis(gen);
float y = dis(gen);
if (x * x + y * y <= 1.0f) {
localSuccess++;
}
localTotal++;
}
// Update thread-local counters
threadLocalSuccess[threadId].count += localSuccess;
threadLocalTotal[threadId].count += localTotal;
// Update global counters
successCount.fetch_add(localSuccess, std::memory_order_relaxed);
totalCount.fetch_add(localTotal, std::memory_order_relaxed);
processedIterations += localTotal;
localSuccess = 0;
localTotal = 0;
}
}
public:
PiCalculator(long long n) :
iterations(n),
nthreads(std::thread::hardware_concurrency()),
threadLocalSuccess(nthreads),
threadLocalTotal(nthreads) {
}
double calculate() {
startTime = std::chrono::steady_clock::now();
std::vector<std::thread> threads;
threads.reserve(nthreads);
// Calculate iterations per thread
long long iterationsPerThread = iterations / nthreads;
long long remainder = iterations % nthreads;
// Launch threads
for (unsigned int i = 0; i < nthreads; i++) {
long long threadIterations = iterationsPerThread + (i < remainder ? 1 : 0);
threads.emplace_back(&PiCalculator::task, this, i, threadIterations);
}
// Progress monitoring thread
std::thread progressThread([this]() {
while (totalCount < iterations && !shouldStop) {
auto now = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - startTime).count();
if (duration > 0) {
double progress = static_cast<double>(totalCount) / iterations * 100.0;
double pointsPerSecond = static_cast<double>(totalCount) / duration;
std::cout << "\rProgress: " << std::fixed << std::setprecision(2)
<< progress << "% ("
<< std::setprecision(2) << pointsPerSecond / 1000000.0 << "M points/sec)"
<< std::flush;
}
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
});
// Join threads
for (auto& thread : threads) {
thread.join();
}
shouldStop = true;
progressThread.join();
// Calculate final result
auto endTime = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(endTime - startTime).count();
double pi = 4.0 * static_cast<double>(successCount) / static_cast<double>(totalCount);
// Print results
std::cout << "\nCalculation completed in " << duration << " seconds" << std::endl;
std::cout << "Points processed: " << totalCount << std::endl;
std::cout << "Points inside circle: " << successCount << std::endl;
std::cout << "Approximated value of Pi: " << std::setprecision(10) << pi << std::endl;
std::cout << "Points per second: " << std::fixed << std::setprecision(2)
<< static_cast<double>(totalCount) / duration / 1000000.0 << "M" << std::endl;
return pi;
}
};
int main() {
const long long iterations = 100000000000LL;
try {
PiCalculator calculator(iterations);
calculator.calculate();
}
catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment