Created
January 15, 2024 21:20
-
-
Save Earthcomputer/c2d20dd0b139f9f267eb34c760519313 to your computer and use it in GitHub Desktop.
Amethyst cluster finder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <chrono> | |
#include <iostream> | |
#include <sstream> | |
#include <thread> | |
#include <vector> | |
#ifdef _MSC_VER | |
#include <immintrin.h> | |
#endif | |
void doHandleError(cudaError_t errorCode, const char* file, int line) { | |
if (errorCode != cudaSuccess) { | |
std::cerr << file << ":" << line << ": CUDA error " << errorCode << " " << cudaGetErrorString(errorCode) << std::endl; | |
exit(errorCode); | |
} | |
} | |
#define handleError(x) doHandleError((x), __FILE__, __LINE__) | |
template<class F> | |
class EndScope { | |
F f; | |
public: | |
explicit EndScope(F f) : f(f) {} | |
~EndScope() { | |
f(); | |
} | |
}; | |
template<class F> EndScope<F> makeEndScope(F f) { return EndScope<F>(f); } | |
#define onEndScope(name, code) auto (name) = makeEndScope([&]{code;}) | |
__host__ __device__ uint64_t rotateLeft(uint64_t n, int amt) { | |
return (n << amt) | (n >> (64 - amt)); | |
} | |
#ifdef __CUDA_ARCH__ | |
__device__ double fastInverseSqrt(double x) { | |
double d = 0.5 * x; | |
int64_t l = __double_as_longlong(x); | |
l = 6910469410427058090LL - (l >> 1); | |
x = __longlong_as_double(l); | |
return x * (1.5 - d * x * x); | |
} | |
#else | |
double fastInverseSqrt(double x) { | |
double d = 0.5 * x; | |
int64_t l; | |
memcpy(&l, &x, sizeof(x)); | |
l = 6910469410427058090LL - (l >> 1); | |
memcpy(&x, &l, sizeof(l)); | |
return x * (1.5 - d * x * x); | |
} | |
#endif | |
class XOR128 { | |
uint64_t seedLo = 0; | |
uint64_t seedHi = 0; | |
__host__ __device__ static uint64_t mixStafford13(uint64_t seed) { | |
seed = (seed ^ seed >> 30) * (uint64_t) -4658895280553007687LL; | |
seed = (seed ^ seed >> 27) * (uint64_t) -7723592293110705685LL; | |
return seed ^ seed >> 31; | |
} | |
public: | |
explicit __host__ __device__ XOR128(uint64_t seed) { | |
setSeed(seed); | |
} | |
__host__ __device__ uint64_t next() { | |
uint64_t l = seedLo; | |
uint64_t m = seedHi; | |
uint64_t n = rotateLeft(l + m, 17) + l; | |
m ^= l; | |
seedLo = rotateLeft(l, 49) ^ m ^ m << 21; | |
seedHi = rotateLeft(m, 28); | |
return n; | |
} | |
private: | |
__host__ __device__ int32_t next(int bits) { | |
return (int32_t) (next() >> (64 - bits)); | |
} | |
public: | |
__host__ __device__ int64_t nextLong() { | |
int32_t i = next(32); | |
int32_t j = next(32); | |
int64_t l = (int64_t) i << 32; | |
return l + (int64_t) j; | |
} | |
__host__ __device__ float nextFloat() { | |
return (float) next(24) * 5.9604645E-8F; | |
} | |
__host__ __device__ double nextDouble() { | |
int32_t i = next(26); | |
int32_t j = next(27); | |
int64_t l = ((int64_t)i << 27) + (int64_t)j; | |
return (double)l * 1.110223E-16F; | |
} | |
__host__ __device__ int32_t nextInt(int32_t bound) { | |
if ((bound & bound - 1) == 0) { | |
return (int32_t)((int64_t)bound * (int64_t)next(31) >> 31); | |
} else { | |
int32_t i; | |
int32_t j; | |
do { | |
i = next(31); | |
j = i % bound; | |
} while (i - j + (bound - 1) < 0); | |
return j; | |
} | |
} | |
__host__ __device__ void setSeed(uint64_t seed) { | |
uint64_t l2 = seed ^ 7640891576956012809LL; | |
uint64_t l3 = l2 - 7046029254386353131LL; | |
seedLo = mixStafford13(l2); | |
seedHi = mixStafford13(l3); | |
if ((seedLo | seedHi) == 0) { | |
seedLo = (uint64_t) -7046029254386353131LL; | |
seedHi = 7640891576956012809LL; | |
} | |
} | |
__host__ __device__ static uint64_t getPopulationSeed(uint64_t seed, uint64_t l, uint64_t m, int32_t blockX, int32_t blockZ) { | |
return (uint64_t) (int64_t) blockX * l + (uint64_t) (int64_t) blockZ * m ^ seed; | |
} | |
__host__ __device__ static XOR128 withDecoratorSeed(uint64_t populationSeed, int32_t index, int32_t step) { | |
uint64_t l = populationSeed + (uint64_t) index + (uint64_t) (10000 * step); | |
return XOR128(l); | |
} | |
}; | |
class Random { | |
uint64_t seed; | |
__host__ __device__ int32_t next(int bits) { | |
return (int32_t) ((seed = (seed * 0x5deece66dLL + 11) & 0xffffffffffffLL) >> (48 - bits)); | |
} | |
public: | |
__host__ __device__ explicit Random(uint64_t seed) : seed(seed ^ 0x5deece66dLL) {} | |
__host__ __device__ int64_t nextLong() { | |
int32_t i = next(32); | |
int32_t j = next(32); | |
int64_t l = (int64_t) i << 32; | |
return l + (int64_t) j; | |
} | |
__host__ __device__ int32_t nextInt(int32_t bound) { | |
if ((bound & bound - 1) == 0) { | |
return (int32_t)((int64_t)bound * (int64_t)next(31) >> 31); | |
} else { | |
int32_t i; | |
int32_t j; | |
do { | |
i = next(31); | |
j = i % bound; | |
} while (i - j + (bound - 1) < 0); | |
return j; | |
} | |
} | |
__host__ __device__ double nextDouble() { | |
int32_t i = next(26); | |
int32_t j = next(27); | |
int64_t l = ((int64_t)i << 27) + (int64_t)j; | |
return (double)l * 1.110223E-16; | |
} | |
}; | |
class PerlinNoiseSampler { | |
static constexpr double lacunarity = 1.0 / 16.0; | |
static constexpr double persistence = 1; | |
double originX = 0, originY = 0, originZ = 0; | |
uint8_t permutation[256]{}; | |
public: | |
__host__ __device__ PerlinNoiseSampler() = default; | |
#pragma clang diagnostic push | |
#pragma ide diagnostic ignored "EndlessLoop" | |
__host__ __device__ explicit PerlinNoiseSampler(Random& randomIn) { | |
Random random(randomIn.nextLong() ^ 440898198LL); // "octave_-4".hashCode() = 440898198 | |
originX = random.nextDouble() * 256; | |
originY = random.nextDouble() * 256; | |
originZ = random.nextDouble() * 256; | |
for (int32_t i = 0; i < 256; i++) { | |
permutation[i] = (uint8_t) i; | |
} | |
for (int32_t i = 0; i < 256; i++) { | |
int32_t j = random.nextInt(256 - i); | |
std::swap(permutation[i], permutation[i + j]); | |
} | |
} | |
#pragma clang diagnostic pop | |
private: | |
__host__ __device__ double doSample(double x, double y, double z, double yScale, double yMax) const { | |
double d = x + originX; | |
double e = y + originY; | |
double f = z + originZ; | |
int32_t i = ifloor(d); | |
int32_t j = ifloor(e); | |
int32_t k = ifloor(f); | |
double g = d - i; | |
double h = e - j; | |
double l = f - k; | |
double n; | |
if (yScale != 0) { | |
double m = yMax > 0 && yMax < h ? yMax : h; | |
n = floor(m / yScale + 1.0E-7F) * yScale; | |
} else { | |
n = 0; | |
} | |
return doSample(i, j, k, g, h - n, l, h); | |
} | |
__host__ __device__ double doSample(int32_t sectionX, int32_t sectionY, int32_t sectionZ, double localX, double localY, double localZ, double fadeLocalY) const { | |
int32_t i = map(sectionX); | |
int32_t j = map(sectionX + 1); | |
int32_t k = map(i + sectionY); | |
int32_t l = map(i + sectionY + 1); | |
int32_t m = map(j + sectionY); | |
int32_t n = map(j + sectionY + 1); | |
double d = grad(map(k + sectionZ), localX, localY, localZ); | |
double e = grad(map(m + sectionZ), localX - 1, localY, localZ); | |
double f = grad(map(l + sectionZ), localX, localY - 1, localZ); | |
double g = grad(map(n + sectionZ), localX - 1, localY - 1, localZ); | |
double h = grad(map(k + sectionZ + 1), localX, localY, localZ - 1); | |
double o = grad(map(m + sectionZ + 1), localX - 1, localY, localZ - 1); | |
double p = grad(map(l + sectionZ + 1), localX, localY - 1, localZ - 1); | |
double q = grad(map(n + sectionZ + 1), localX - 1, localY - 1, localZ - 1); | |
double r = perlinFade(localX); | |
double s = perlinFade(fadeLocalY); | |
double t = perlinFade(localZ); | |
return lerp3(r, s, t, d, e, f, g, h, o, p, q); | |
} | |
__host__ __device__ uint8_t map(int32_t input) const { | |
return permutation[input & 0xff]; | |
} | |
__host__ __device__ static double grad(int hash, double x, double y, double z) { | |
static constexpr double GRADIENTS[16][3] = { | |
{1, 1, 0}, | |
{-1, 1, 0}, | |
{1, -1, 0}, | |
{-1, -1, 0}, | |
{1, 0, 1}, | |
{-1, 0, 1}, | |
{1, 0, -1}, | |
{-1, 0, -1}, | |
{0, 1, 1}, | |
{0, -1, 1}, | |
{0, 1, -1}, | |
{0, -1, -1}, | |
{1, 1, 0}, | |
{0, -1, 1}, | |
{-1, 1, 0}, | |
{0, -1, -1} | |
}; | |
return dot(GRADIENTS[hash & 15], x, y, z); | |
} | |
__host__ __device__ static double dot(const double gradient[3], double x, double y, double z) { | |
return gradient[0] * x + gradient[1] * y + gradient[2] * z; | |
} | |
__host__ __device__ static double perlinFade(double value) { | |
return value * value * value * (value * (value * 6.0 - 15.0) + 10.0); | |
} | |
__host__ __device__ static double lerp3( | |
double deltaX, | |
double deltaY, | |
double deltaZ, | |
double x0y0z0, | |
double x1y0z0, | |
double x0y1z0, | |
double x1y1z0, | |
double x0y0z1, | |
double x1y0z1, | |
double x0y1z1, | |
double x1y1z1 | |
) { | |
return lerp(deltaZ, lerp2(deltaX, deltaY, x0y0z0, x1y0z0, x0y1z0, x1y1z0), lerp2(deltaX, deltaY, x0y0z1, x1y0z1, x0y1z1, x1y1z1)); | |
} | |
__host__ __device__ static double lerp2(double deltaX, double deltaY, double x0y0, double x1y0, double x0y1, double x1y1) { | |
return lerp(deltaY, lerp(deltaX, x0y0, x1y0), lerp(deltaX, x0y1, x1y1)); | |
} | |
__host__ __device__ static double lerp(double delta, double start, double end) { | |
return start + delta * (end - start); | |
} | |
__host__ __device__ static double maintainPrecision(double value) { | |
return value - (double) lfloor(value / 3.3554432E7 + 0.5) * 3.3554432E7; | |
} | |
__host__ __device__ static int32_t ifloor(double value) { | |
auto i = (int32_t) value; | |
return value < (double) i ? i - 1 : i; | |
} | |
__host__ __device__ static int64_t lfloor(double value) { | |
auto l = (int64_t) value; | |
return value < (double) l ? l - 1 : l; | |
} | |
public: | |
__host__ __device__ double sample(double x, double y, double z, double yScale, double yMax) const { | |
double e = lacunarity; | |
double f = persistence; | |
double g = doSample(maintainPrecision(x * e), maintainPrecision(y * e), maintainPrecision(z * e), yScale * e, yMax * e); | |
return g * f; | |
} | |
}; | |
class DoublePerlinNoiseSampler { | |
PerlinNoiseSampler firstSampler{}; | |
PerlinNoiseSampler secondSampler{}; | |
public: | |
__host__ __device__ DoublePerlinNoiseSampler() = default; | |
__host__ __device__ explicit DoublePerlinNoiseSampler(Random& random) { | |
firstSampler = PerlinNoiseSampler(random); | |
secondSampler = PerlinNoiseSampler(random); | |
} | |
__host__ __device__ double sample(double x, double y, double z) const { | |
double d = x * 1.0181268882175227; | |
double e = y * 1.0181268882175227; | |
double f = z * 1.0181268882175227; | |
double first = firstSampler.sample(x, y, z, 0, 0); | |
double second = secondSampler.sample(d, e, f, 0, 0); | |
return (first + second) * (5.0 / 6.0); | |
} | |
}; | |
class ByteArray { | |
uint32_t* data; | |
public: | |
__host__ __device__ explicit ByteArray(uint32_t* data) : data(data) {} | |
__host__ __device__ uint8_t operator[](size_t index) { | |
return (data[index >> 2] >> ((index & 3) << 3)) & 0xff; | |
} | |
__device__ uint8_t atomicInc(size_t index) { | |
return (atomicAdd(data + (index >> 2), 1 << ((index & 3) << 3)) >> ((index & 3) << 3)) & 0xff; | |
} | |
}; | |
struct BlockPos { | |
int32_t x, y, z; | |
__host__ __device__ BlockPos operator+(const BlockPos& other) const { | |
return {x + other.x, y + other.y, z + other.z}; | |
} | |
__host__ __device__ double getSquaredDistance(const BlockPos& other) const { | |
double d = (double) x - (double) other.x; | |
double e = (double) y - (double) other.y; | |
double f = (double) z - (double) other.z; | |
return d * d + e * e + f * f; | |
} | |
}; | |
struct ChunkPos { | |
int32_t x, z; | |
}; | |
static_assert(sizeof(ChunkPos) == 8, "sizeof(ChunkPos) must be 8"); | |
struct PosIntPair { | |
BlockPos pos; | |
int32_t val; | |
}; | |
struct GeodeGeneratorStaticData { | |
uint64_t seed = 0; | |
DoublePerlinNoiseSampler doublePerlinNoiseSampler{}; | |
uint64_t nextA = 0; | |
uint64_t nextB = 0; | |
__host__ __device__ GeodeGeneratorStaticData() = default; | |
__host__ __device__ explicit GeodeGeneratorStaticData(uint64_t seed) : seed(seed) { | |
Random lcg(seed); | |
doublePerlinNoiseSampler = DoublePerlinNoiseSampler(lcg); | |
XOR128 rand(seed); | |
nextA = rand.nextLong() | 1; | |
nextB = rand.nextLong() | 1; | |
} | |
}; | |
__forceinline__ __host__ __device__ bool canGenerateGeode(const GeodeGeneratorStaticData& staticData, int chunkX, int chunkZ) { | |
int32_t blockX = chunkX << 4; | |
int32_t blockZ = chunkZ << 4; | |
uint64_t decorationSeed = XOR128::getPopulationSeed(staticData.seed, staticData.nextA, staticData.nextB, blockX, blockZ); | |
XOR128 rand = XOR128::withDecoratorSeed(decorationSeed, 2, 2); | |
return rand.nextFloat() < 1.0 / 24; | |
} | |
template<class PlaceCrystalFunc> | |
__forceinline__ __host__ __device__ bool generateGeode( | |
const GeodeGeneratorStaticData& staticData, | |
int32_t chunkX, | |
int32_t chunkZ, | |
PlaceCrystalFunc placeCrystalFunc | |
) { | |
int32_t blockX = chunkX << 4; | |
int32_t blockZ = chunkZ << 4; | |
uint64_t decorationSeed = XOR128::getPopulationSeed(staticData.seed, staticData.nextA, staticData.nextB, blockX, blockZ); | |
XOR128 rand = XOR128::withDecoratorSeed(decorationSeed, 2, 2); | |
rand.nextFloat(); // caller should use canGenerateGeode first | |
blockX += rand.nextInt(16); | |
blockZ += rand.nextInt(16); | |
int32_t blockY = rand.nextInt(30-(-64+6)+1)+(-64+6); | |
BlockPos blockPos{blockX, blockY, blockZ}; | |
int k = rand.nextInt((4-3+1))+3; | |
double d = (double)k / 6.0; | |
constexpr double e = 0.76696498884; // 1 / sqrt(1.7) | |
double f = 1.0 / sqrt(2.2 + d); | |
double h = 1.0 / sqrt(4.2 + d); | |
double l = 1.0 / sqrt(2.0 + rand.nextDouble() / 2.0 + (k > 3 ? d : 0.0)); | |
bool bl = rand.nextFloat() < 0.95; | |
PosIntPair list[4]; | |
int listSize = 0; | |
for (int n = 0; n < k; n++) { | |
int o = rand.nextInt(6 - 4 + 1) + 4; | |
int p = rand.nextInt(6 - 4 + 1) + 4; | |
int q = rand.nextInt(6 - 4 + 1) + 4; | |
BlockPos blockPos2 = blockPos + BlockPos{o, p, q}; | |
list[listSize++] = PosIntPair{blockPos2, rand.nextInt(2-1+1)+1}; | |
} | |
BlockPos list2[3]; | |
if (bl) { | |
int n = rand.nextInt(4); | |
int o = k * 2 + 1; | |
switch (n) { | |
case 0: | |
list2[0] = blockPos + BlockPos{o, 7, 0}; | |
list2[1] = blockPos + BlockPos{o, 5, 0}; | |
list2[2] = blockPos + BlockPos{o, 1, 0}; | |
break; | |
case 1: | |
list2[0] = blockPos + BlockPos{0, 7, o}; | |
list2[1] = blockPos + BlockPos{0, 5, o}; | |
list2[2] = blockPos + BlockPos{0, 1, o}; | |
break; | |
case 2: | |
list2[0] = blockPos + BlockPos{o, 7, o}; | |
list2[1] = blockPos + BlockPos{o, 5, o}; | |
list2[2] = blockPos + BlockPos{o, 1, o}; | |
break; | |
default: | |
list2[0] = blockPos + BlockPos{0, 7, 0}; | |
list2[1] = blockPos + BlockPos{0, 5, 0}; | |
list2[2] = blockPos + BlockPos{0, 1, 0}; | |
break; | |
} | |
} | |
for (int dz = -16; dz <= 16; dz++) { | |
for (int dy = -16; dy <= 16; dy++) { | |
for (int dx = -16; dx <= 16; dx++) { | |
BlockPos blockPos3 = blockPos + BlockPos{dx, dy, dz}; | |
double s = 0; | |
double t = 0; | |
for (int i = 0; i < listSize; i++) { | |
s += fastInverseSqrt(blockPos3.getSquaredDistance(list[i].pos) + (double)list[i].val); | |
} | |
double esx = s + 0.05 * (double)k; | |
double esn = s - 0.05 * (double)k; | |
if (esn >= e || (!((esx >= h)&&(esx >= f)))) { | |
continue; | |
} | |
if (bl) { | |
for (int i = 0; i < 3; i++) { | |
t += fastInverseSqrt(blockPos3.getSquaredDistance(list2[i]) + 2); | |
} | |
if (t - 0.05 * 3 >= l) { | |
continue; | |
} | |
} | |
double r = staticData.doublePerlinNoiseSampler.sample((double)blockPos3.x, (double)blockPos3.y, (double)blockPos3.z) * 0.05; | |
s += r * (double) k; | |
if ((!((s >= h)&&(s >= f)))||(s >= e)) | |
continue; | |
if (bl) { | |
t += r * 3; | |
if (t >= l) | |
continue; | |
} | |
bool bl2 = rand.nextFloat() < 0.083; | |
if (bl2) { | |
placeCrystalFunc(blockPos3); | |
rand.next(); | |
} | |
} | |
} | |
} | |
return true; | |
} | |
__constant__ GeodeGeneratorStaticData STATIC_DATA; | |
__global__ void filterCanGenerateGeode(int32_t minChunkX, int32_t minChunkZ, uint32_t squareSize, uint32_t* count, ChunkPos* outPositions) { | |
uint64_t idx = (uint64_t)blockIdx.x * (uint64_t)blockDim.x + (uint64_t)threadIdx.x; | |
if (idx > squareSize * squareSize) { | |
return; | |
} | |
int32_t cx = minChunkX + (int32_t) (idx / squareSize); | |
int32_t cz = minChunkZ + (int32_t) (idx % squareSize); | |
if (canGenerateGeode(STATIC_DATA, cx, cz)) { | |
uint32_t index = atomicAdd(count, 1); | |
outPositions[index] = ChunkPos{cx, cz}; | |
} | |
} | |
__global__ void countCrystals(uint32_t count, ChunkPos* positions, size_t* progress, int32_t minChunkX, int32_t minChunkZ, uint32_t squareSize, ByteArray crystalCounts) { | |
uint32_t idx = (uint32_t) blockIdx.x * (uint32_t) blockDim.x + (uint32_t) threadIdx.x; | |
if (idx > count) { | |
return; | |
} | |
*progress = max(*progress, (size_t) idx); | |
ChunkPos position = positions[idx]; | |
generateGeode(STATIC_DATA, position.x, position.z, [minChunkX, minChunkZ, squareSize, &crystalCounts](const BlockPos& pos) { | |
int32_t cx = pos.x >> 4; | |
int32_t cz = pos.z >> 4; | |
if (cx >= minChunkX && cz >= minChunkZ && cx < minChunkX + (int32_t)squareSize && cz < minChunkZ + (int32_t)squareSize) { | |
uint32_t index = (uint32_t) (cx - minChunkX) + squareSize * (uint32_t) (cz - minChunkZ); | |
crystalCounts.atomicInc(index); | |
} | |
}); | |
} | |
__host__ __device__ int getCountInRange(ByteArray crystalCounts, int32_t cx, int32_t cz, uint32_t squareSize) { | |
constexpr double playerPositions[8][2] = { | |
{ 1.0625, 4.125 }, | |
{ 4.125, 1.0625 }, | |
{ 14.9375, 4.125 }, | |
{ 4.125, 14.9375 }, | |
{ 1.0625, 11.875 }, | |
{ 11.875, 1.0625 }, | |
{ 14.9375, 11.875 }, | |
{ 11.875, 14.9375 }, | |
}; | |
int maxCount = 0; | |
for (int i = 0; i < 8; i++) { | |
double playerX = playerPositions[i][0]; | |
double playerZ = playerPositions[i][1]; | |
int count = 0; | |
for (int ox = -8; ox <= 8; ox++) { | |
for (int oz = -8; oz <= 8; oz++) { | |
int chunkX = ox * 16 + 8; | |
int chunkZ = ox * 16 + 8; | |
double dx = chunkX - playerX; | |
double dz = chunkZ - playerZ; | |
if (dx * dx + dz * dz < 128 * 128) { | |
int32_t actualChunkX = cx + ox; | |
int32_t actualChunkZ = cz + oz; | |
if (actualChunkX >= 0 && actualChunkZ >= 0 && (uint32_t)actualChunkX < squareSize && (uint32_t)actualChunkZ < squareSize) { | |
uint32_t index = (uint32_t) actualChunkX + squareSize * (uint32_t) actualChunkZ; | |
count += crystalCounts[index]; | |
} | |
} | |
} | |
} | |
maxCount = max(count, maxCount); | |
} | |
return maxCount; | |
} | |
constexpr int ACCEPTABLE_BELOW_MAX = 100; | |
__global__ void processRenderDistanceRanges(size_t* progress, ByteArray crystalCounts, uint32_t squareSize, int* maxCount, uint32_t* outputCount, ChunkPos* output) { | |
uint64_t idx = (uint64_t)blockIdx.x * (uint64_t)blockDim.x + (uint64_t)threadIdx.x; | |
if (idx > squareSize * squareSize) { | |
return; | |
} | |
*progress = max(*progress, (size_t) idx); | |
auto cx = (int32_t) (idx / squareSize); | |
auto cz = (int32_t) (idx % squareSize); | |
int count = getCountInRange(crystalCounts, cx, cz, squareSize); | |
int oldCount = atomicMax(maxCount, count); | |
int currentMax = max(count, oldCount); | |
if (currentMax - count <= ACCEPTABLE_BELOW_MAX) { | |
uint32_t index = atomicAdd(outputCount, 1); | |
output[index] = ChunkPos{cx, cz}; | |
return; | |
} | |
} | |
template <class F, class L> | |
void executeKernelUnderProgress(const char* displayName, size_t workSize, F kernelFunction, L kernelLauncher) { | |
int blockSize; | |
int minGridSize; | |
handleError(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernelFunction, 0, workSize)); | |
int gridSize = (int) ((workSize + (size_t) blockSize - 1) / (size_t) blockSize); | |
std::cout << "Executing " << displayName << " with gridSize=" << gridSize << " blockSize=" << blockSize << std::endl; | |
size_t* deviceProgress; | |
handleError(cudaMalloc(&deviceProgress, sizeof(*deviceProgress))); | |
size_t* hostProgress; | |
handleError(cudaHostAlloc(&hostProgress, sizeof(*hostProgress), cudaHostAllocDefault)); | |
cudaEvent_t finishEvent; | |
handleError(cudaEventCreate(&finishEvent)); | |
cudaStream_t executionStream; | |
handleError(cudaStreamCreate(&executionStream)); | |
cudaStream_t progressStream; | |
handleError(cudaStreamCreate(&progressStream)); | |
kernelLauncher(gridSize, blockSize, executionStream, deviceProgress); | |
handleError(cudaEventRecord(finishEvent, executionStream)); | |
cudaError_t queryError; | |
while ((queryError = cudaEventQuery(finishEvent)) == cudaErrorNotReady) { | |
handleError(cudaMemcpyAsync(hostProgress, deviceProgress, sizeof(*hostProgress), cudaMemcpyDeviceToHost, progressStream)); | |
handleError(cudaStreamSynchronize(progressStream)); | |
std::cout << "... Progress " << displayName << ": " << *hostProgress << " / " << workSize << " (" << ((double) *hostProgress / (double) workSize * 100) << "%)" << std::endl; | |
std::this_thread::sleep_for(std::chrono::seconds(1)); | |
} | |
handleError(queryError); | |
handleError(cudaEventDestroy(finishEvent)); | |
handleError(cudaStreamDestroy(progressStream)); | |
handleError(cudaStreamDestroy(executionStream)); | |
handleError(cudaFreeHost(hostProgress)); | |
handleError(cudaFree(deviceProgress)); | |
} | |
template <class T> | |
bool string2int(const char* str, T& result) { | |
std::stringstream ss(str); | |
return !(ss >> result).fail() && ss.eof(); | |
} | |
int main(int argc, char** argv) { | |
if (argc < 5) { | |
std::cerr << argv[0] << " <seed> <center-chunk-x> <center-chunk-z> <chunk-radius>" << std::endl; | |
return 0; | |
} | |
int64_t seed; | |
int32_t centerChunkX, centerChunkZ, chunkRadius; | |
if (!string2int(argv[1], seed) || !string2int(argv[2], centerChunkX) || !string2int(argv[3], centerChunkZ) || !string2int(argv[4], chunkRadius)) { | |
std::cerr << "Invalid integer inputs" << std::endl; | |
return 0; | |
} | |
if (chunkRadius <= 0) { | |
std::cerr << "Chunk radius must be positive" << std::endl; | |
return 0; | |
} | |
int32_t minChunkX = centerChunkX - chunkRadius; | |
int32_t minChunkZ = centerChunkZ - chunkRadius; | |
uint32_t squareSize = chunkRadius * 2; | |
size_t freeMemory; | |
size_t totalMemory; | |
handleError(cudaMemGetInfo(&freeMemory, &totalMemory)); | |
auto maxSquareSize = (size_t) (sqrt((double) freeMemory) * 0.5); | |
if (maxSquareSize == 0) { | |
std::cerr << "No free GPU memory" << std::endl; | |
return 0; | |
} | |
#ifdef _MSC_VER | |
int leadingZeros = sizeof(size_t) == 8 ? _lzcnt_u64(maxSquareSize) : _lzcnt_u32(maxSquareSize); | |
#else | |
int leadingZeros = __builtin_clz(maxSquareSize); | |
#endif | |
maxSquareSize = 1 << ((sizeof(maxSquareSize) * 8 - 1) - leadingZeros); | |
if (squareSize > maxSquareSize) { | |
std::cerr << "Not enough GPU memory for that radius. Radius must be at most " << (maxSquareSize / 2) << " chunks for this available memory" << std::endl; | |
return 0; | |
} | |
size_t workSize = squareSize * squareSize; | |
std::cout << "Searching area " << minChunkX << ", " << minChunkZ << " to " << (minChunkX + squareSize - 1) << ", " << (minChunkZ + squareSize - 1) << std::endl; | |
GeodeGeneratorStaticData staticData(seed); | |
handleError(cudaMemcpyToSymbol(STATIC_DATA, &staticData, sizeof(staticData))); | |
ChunkPos* filteredPositions; | |
cudaMalloc(&filteredPositions, workSize); // allocate 8 times less than the capacity, okay because we'll never actually reach this | |
uint32_t* countPtr; | |
cudaMallocManaged(&countPtr, sizeof(*countPtr)); | |
*countPtr = 0; | |
executeKernelUnderProgress("finding geodes", workSize, filterCanGenerateGeode, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){ | |
filterCanGenerateGeode<<<gridSize, blockSize, 0, executionStream>>>(minChunkX, minChunkZ, squareSize, countPtr, filteredPositions); | |
}); | |
uint32_t count = *countPtr; | |
handleError(cudaFree(countPtr)); | |
std::cout << "Found " << count << " geodes total in grid" << std::endl; | |
// shrink filtered positions allocation | |
// array is too large to keep on device, need to go via host | |
auto temp = new ChunkPos[count]; | |
handleError(cudaMemcpy(temp, filteredPositions, count * sizeof(*filteredPositions), cudaMemcpyDeviceToHost)); | |
handleError(cudaFree(filteredPositions)); | |
handleError(cudaMalloc(&filteredPositions, count * sizeof(*filteredPositions))); | |
handleError(cudaMemcpy(filteredPositions, temp, count * sizeof(*filteredPositions), cudaMemcpyHostToDevice)); | |
delete[] temp; | |
uint32_t* crystalCountsRaw; | |
size_t sizeOfCrystalCounts = (workSize + 3) / 4 * 4; | |
handleError(cudaMalloc(&crystalCountsRaw, sizeOfCrystalCounts)); | |
handleError(cudaMemset(crystalCountsRaw, 0, sizeOfCrystalCounts)); | |
ByteArray crystalCounts(crystalCountsRaw); | |
auto crystalCountsRawHost = new uint32_t[sizeOfCrystalCounts]; | |
ByteArray crystalCountsHost(crystalCountsRawHost); | |
executeKernelUnderProgress("crystal counting", count, countCrystals, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){ | |
countCrystals<<<gridSize, blockSize, 0, executionStream>>>(count, filteredPositions, deviceProgress, minChunkX, minChunkZ, squareSize, crystalCounts); | |
}); | |
handleError(cudaFree(filteredPositions)); | |
int* maxCount; | |
handleError(cudaMallocManaged(&maxCount, sizeof(*maxCount))); | |
uint32_t* outputCount; | |
handleError(cudaMallocManaged(&outputCount, sizeof(*outputCount))); | |
ChunkPos* output; | |
handleError(cudaMallocManaged(&output, 10000 * sizeof(*output))); | |
executeKernelUnderProgress("applying ticking ranges", workSize, processRenderDistanceRanges, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){ | |
processRenderDistanceRanges<<<gridSize, blockSize, 0, executionStream>>>(deviceProgress, crystalCounts, squareSize, maxCount, outputCount, output); | |
}); | |
handleError(cudaMemcpy(crystalCountsRawHost, crystalCountsRaw, sizeOfCrystalCounts, cudaMemcpyDeviceToHost)); | |
std::vector<std::pair<ChunkPos, int>> filteredOutput; | |
for (int i = 0; i < *outputCount; i++) { | |
int inRange = getCountInRange(crystalCountsHost, output[i].x, output[i].z, squareSize); | |
if (*maxCount - inRange <= ACCEPTABLE_BELOW_MAX) { | |
filteredOutput.emplace_back(output[i], inRange); | |
} | |
} | |
std::cout << "Max found: " << *maxCount << ", number of areas within " << ACCEPTABLE_BELOW_MAX << " of max: " << filteredOutput.size() << std::endl; | |
std::sort(filteredOutput.begin(), filteredOutput.end(), [](const std::pair<ChunkPos, int>& a, const std::pair<ChunkPos, int>& b) { return a.second >= b.second; }); | |
for (const std::pair<ChunkPos, int>& pair : filteredOutput) { | |
std::cout << "(" << (pair.first.x + minChunkX) << ", " << (pair.first.z + minChunkZ) << "): " << pair.second << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment