Created
December 2, 2024 12:19
-
-
Save jamesy0ung/84e2c829839acd99b11683212f01ce0f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <iostream> | |
#include <thread> | |
#include <random> | |
#include <atomic> | |
#include <vector> | |
#include <iomanip> | |
#include <chrono> | |
#include <algorithm> | |
#include <array> | |
#include <immintrin.h> | |
class PiCalculator { | |
private: | |
const long long iterations; | |
const unsigned int nthreads; | |
std::atomic<long long> successCount{ 0 }; | |
std::atomic<long long> totalCount{ 0 }; | |
std::atomic<bool> shouldStop{ false }; | |
std::chrono::steady_clock::time_point startTime; | |
// Cache line padding to prevent false sharing | |
struct alignas(64) PaddedCount { | |
long long count; | |
char padding[64 - sizeof(long long)]; | |
}; | |
std::vector<PaddedCount> threadLocalSuccess; | |
std::vector<PaddedCount> threadLocalTotal; | |
// Process points using AVX2 | |
int processBatchAVX2(std::mt19937& gen, std::uniform_real_distribution<float>& dis) { | |
alignas(32) float x_vals[8]; | |
alignas(32) float y_vals[8]; | |
// Generate 8 random points | |
for (int i = 0; i < 8; ++i) { | |
x_vals[i] = dis(gen); | |
y_vals[i] = dis(gen); | |
} | |
// Load values into AVX2 registers | |
__m256 x = _mm256_load_ps(x_vals); | |
__m256 y = _mm256_load_ps(y_vals); | |
// Calculate x*x + y*y for all 8 points simultaneously | |
__m256 x_squared = _mm256_mul_ps(x, x); | |
__m256 y_squared = _mm256_mul_ps(y, y); | |
__m256 sum = _mm256_add_ps(x_squared, y_squared); | |
// Compare with 1.0 (points inside circle) | |
__m256 one = _mm256_set1_ps(1.0f); | |
__m256 mask = _mm256_cmp_ps(sum, one, _CMP_LE_OQ); | |
// Convert mask to integer (each 32-bit lane will be all 1s or all 0s) | |
int mask_int = _mm256_movemask_ps(mask); | |
// Count set bits (points inside circle) | |
return _mm_popcnt_u32(mask_int); | |
} | |
void task(int threadId, long long threadIterations) { | |
// Thread-local RNG for better performance | |
std::random_device rd; | |
std::mt19937 gen(rd() ^ static_cast<unsigned int>(threadId)); | |
std::uniform_real_distribution<float> dis(0.0f, 1.0f); // Using float instead of double | |
const int POINTS_PER_AVX2 = 8; // Process 8 points per AVX2 operation | |
const long long ATOMIC_BATCH = 1000000LL; | |
long long localSuccess = 0; | |
long long localTotal = 0; | |
long long processedIterations = 0; | |
while (processedIterations < threadIterations && !shouldStop) { | |
// Calculate iterations for this round | |
long long remainingIterations = threadIterations - processedIterations; | |
long long iterationsThisRound = std::min(remainingIterations, ATOMIC_BATCH); | |
// Process AVX2 batches | |
long long avx2Batches = iterationsThisRound / POINTS_PER_AVX2; | |
for (long long b = 0; b < avx2Batches; ++b) { | |
int successInBatch = processBatchAVX2(gen, dis); | |
localSuccess += successInBatch; | |
localTotal += POINTS_PER_AVX2; | |
} | |
// Handle remaining points | |
long long remainingPoints = iterationsThisRound % POINTS_PER_AVX2; | |
for (long long i = 0; i < remainingPoints; ++i) { | |
float x = dis(gen); | |
float y = dis(gen); | |
if (x * x + y * y <= 1.0f) { | |
localSuccess++; | |
} | |
localTotal++; | |
} | |
// Update thread-local counters | |
threadLocalSuccess[threadId].count += localSuccess; | |
threadLocalTotal[threadId].count += localTotal; | |
// Update global counters | |
successCount.fetch_add(localSuccess, std::memory_order_relaxed); | |
totalCount.fetch_add(localTotal, std::memory_order_relaxed); | |
processedIterations += localTotal; | |
localSuccess = 0; | |
localTotal = 0; | |
} | |
} | |
public: | |
PiCalculator(long long n) : | |
iterations(n), | |
nthreads(std::thread::hardware_concurrency()), | |
threadLocalSuccess(nthreads), | |
threadLocalTotal(nthreads) { | |
} | |
double calculate() { | |
startTime = std::chrono::steady_clock::now(); | |
std::vector<std::thread> threads; | |
threads.reserve(nthreads); | |
// Calculate iterations per thread | |
long long iterationsPerThread = iterations / nthreads; | |
long long remainder = iterations % nthreads; | |
// Launch threads | |
for (unsigned int i = 0; i < nthreads; i++) { | |
long long threadIterations = iterationsPerThread + (i < remainder ? 1 : 0); | |
threads.emplace_back(&PiCalculator::task, this, i, threadIterations); | |
} | |
// Progress monitoring thread | |
std::thread progressThread([this]() { | |
while (totalCount < iterations && !shouldStop) { | |
auto now = std::chrono::steady_clock::now(); | |
auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - startTime).count(); | |
if (duration > 0) { | |
double progress = static_cast<double>(totalCount) / iterations * 100.0; | |
double pointsPerSecond = static_cast<double>(totalCount) / duration; | |
std::cout << "\rProgress: " << std::fixed << std::setprecision(2) | |
<< progress << "% (" | |
<< std::setprecision(2) << pointsPerSecond / 1000000.0 << "M points/sec)" | |
<< std::flush; | |
} | |
std::this_thread::sleep_for(std::chrono::milliseconds(100)); | |
} | |
}); | |
// Join threads | |
for (auto& thread : threads) { | |
thread.join(); | |
} | |
shouldStop = true; | |
progressThread.join(); | |
// Calculate final result | |
auto endTime = std::chrono::steady_clock::now(); | |
auto duration = std::chrono::duration_cast<std::chrono::seconds>(endTime - startTime).count(); | |
double pi = 4.0 * static_cast<double>(successCount) / static_cast<double>(totalCount); | |
// Print results | |
std::cout << "\nCalculation completed in " << duration << " seconds" << std::endl; | |
std::cout << "Points processed: " << totalCount << std::endl; | |
std::cout << "Points inside circle: " << successCount << std::endl; | |
std::cout << "Approximated value of Pi: " << std::setprecision(10) << pi << std::endl; | |
std::cout << "Points per second: " << std::fixed << std::setprecision(2) | |
<< static_cast<double>(totalCount) / duration / 1000000.0 << "M" << std::endl; | |
return pi; | |
} | |
}; | |
int main() { | |
const long long iterations = 100000000000LL; | |
try { | |
PiCalculator calculator(iterations); | |
calculator.calculate(); | |
} | |
catch (const std::exception& e) { | |
std::cerr << "Error: " << e.what() << std::endl; | |
return 1; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment