Skip to content

Instantly share code, notes, and snippets.

@bqqbarbhg
Last active August 28, 2024 22:48
Show Gist options
  • Save bqqbarbhg/38bbe9e7707e76ef5df8f0206696f89f to your computer and use it in GitHub Desktop.
Save bqqbarbhg/38bbe9e7707e76ef5df8f0206696f89f to your computer and use it in GitHub Desktop.
#if 1
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <new>
#include <thread>
#include <unordered_set>
#include <vector>
// Debug assertion: prints and traps into the debugger instead of aborting, so
// a live session can inspect VM state when an invariant breaks.
// noinline keeps a clean stack frame to break on (MSVC-specific intrinsics).
__declspec(noinline) void ebi_assert(bool cond)
{
if (!cond)
{
printf("ASSERTION HIT!\n");
__debugbreak();
}
}
// Unordered O(1) erase: overwrite the doomed element with the last element,
// then shrink the vector by one. Element order is not preserved.
// Precondition: index < vec.size().
template <typename T>
void swap_out(std::vector<T> &vec, size_t index) {
    T &doomed = vec[index];
    doomed = vec.back();
    vec.pop_back();
}
#define MAX_SLOTS 128   // max thread-local root slots per thread (locals[])
#define MAX_FREED 32    // per-thread free buffer size before flushing to the VM
#define MAX_MARK 32     // per-thread mark buffer size before flushing to the VM
#define MAX_THREADS 64  // fixed capacity of EbiVm::threads
using mutex_guard = std::lock_guard<std::mutex>;
struct EbiVm;
// Minimal non-owning view of a contiguous range: [data, data + count).
// Aggregate-initialized as span<T>{ ptr, n }; field order is part of the API.
template <typename T>
struct span
{
    T *data;      // first element (may be null only when count == 0)
    size_t count; // number of elements
    // Half-open iterator pair so range-for works.
    T *begin() const { return data; }
    T *end() const { return begin() + count; }
};
// Magic value ("ebit" as little-endian ASCII) used to sanity-check that a
// pointer really refers to an EbiType.
#define EBI_TYPE_MAGIC 0x74696265
// Runtime type descriptor: lists which pointer-sized slots of an allocation's
// payload hold GC-managed pointers that must be traced during marking.
struct EbiType
{
uint32_t magic;                    // must equal EBI_TYPE_MAGIC
std::vector<uint32_t> slotOffsets; // indices into the payload viewed as void*[]
};
// Header stored immediately before every GC allocation's payload
// (ebiAllocate returns alloc + 1; ebiAllocHeader reverses that).
struct EbiAllocation
{
EbiType *type;       // type descriptor; doubles as a liveness sanity check
uint32_t mark;       // last sweep mark color that reached this allocation
uint32_t size;       // payload size in bytes (header excluded)
uint32_t freeStatus; // 0 = live, 1 = queued for / done freeing
// NOTE(review): observers assert freeStatus != 2, but nothing in this file
// ever assigns 2 — confirm the intended state machine.
bool reallyFreed;    // set just before free(); asserted false on any access
};
// A deferred free request: the payload pointer plus the generation in which
// the free was requested. It may only be reclaimed once every thread has
// checkpointed past that generation (see collectStep).
struct EbiFree
{
void *pointer;
uint32_t generation;
};
// Per-thread GC state. The "Hot" half is touched by the owning thread on
// every operation; the "Cold" half is what the collector reads. The pad
// arrays presumably separate the two onto different cache lines to avoid
// false sharing — TODO confirm alignment actually guarantees that.
struct EbiThread
{
// -- Hot
EbiVm *vm;
uint32_t index;                      // slot in EbiVm::threads
uint32_t localGeneration;            // generation at the last checkpoint
uint32_t localMark;                  // sweep mark adopted at the last checkpoint
std::atomic_uint32_t vmGeneration;   // collector writes the target generation here
uint64_t allocationCount;            // lifetime allocation stats (this thread)
uint64_t allocationBytes;
void *locals[MAX_SLOTS];             // root slots pinning objects for this thread
uint32_t numLocals = 0;
EbiFree freeList[MAX_FREED];         // buffered free requests (see ebiFree)
uint32_t numFreed = 0;
void *markList[MAX_MARK];            // buffered marked pointers (see ebiMark)
uint32_t numMarked = 0;
// -- Cold
char prePad[64];
std::atomic_uint32_t seenGeneration; // acknowledgment published by ebiCheckpointSlow
void *localsSnapshot[MAX_SLOTS];     // copy of locals taken at the last checkpoint
uint32_t numLocalsSnapshot = 0;
uint64_t allocationCountSnapshot;    // stats snapshot taken at the last checkpoint
uint64_t allocationBytesSnapshot;
char postPad[64];
};
// Global VM / collector state shared by all threads.
struct EbiVm
{
std::atomic_uint32_t generation;        // bumped by collectStart each round
std::atomic_uint32_t sweepMark;         // bumped by markStart; current mark color
std::mutex freeListMutex;               // guards freeList
std::vector<EbiFree> freeList;          // free requests flushed in by mutators
std::vector<EbiFree> internalFreeList;  // collector-private: not yet old enough to free
std::vector<EbiFree> activeFreeList;    // collector-private: eligible, awaiting liveness check
std::mutex markSetMutex;                // guards markSet
std::unordered_set<void*> markSet;      // marked pointers flushed in by mutators
std::unordered_set<void*> internalMarkSet; // collector-private staging swap target
std::vector<void*> internalMarkList;    // collector-private mark worklist (to trace)
std::vector<void*> activeMarkList;      // collector-private worklist being traced
EbiThread threads[MAX_THREADS];
uint32_t numThreads = 0;
uint64_t freedAllocationCount = 0;      // collector-side reclamation stats
uint64_t freedAllocationBytes = 0;
bool markedRoots = false;               // roots already marked this sweep?
uint32_t markStartGeneration = ~0u;     // generation at which marking may begin
std::mutex allocMutex;                  // guards allocs
std::unordered_set<EbiAllocation*> allocs; // every live allocation header
};
// Recovers the allocation header stored immediately before a payload pointer,
// asserting it still looks live: non-null type with the right magic, not freed.
EbiAllocation *ebiAllocHeader(void *ptr)
{
EbiAllocation *alloc = (EbiAllocation*)((char*)ptr - sizeof(EbiAllocation));
ebi_assert(alloc->type);
ebi_assert(alloc->type->magic == EBI_TYPE_MAGIC);
ebi_assert(!alloc->reallyFreed);
return alloc;
}
// Registers a new thread with the VM and seeds its local generation/mark
// state from the current global values so it starts in sync.
// Fix: assert the fixed-capacity threads[] array is not overrun — the
// original indexed past MAX_THREADS silently.
// NOTE(review): numThreads++ is unsynchronized, yet main() calls this from
// freshly spawned threads concurrently — that is a data race; consider an
// atomic counter or registering threads before spawning them.
EbiThread *ebiNewThread(EbiVm *vm)
{
    uint32_t index = vm->numThreads++;
    ebi_assert(index < MAX_THREADS);
    EbiThread *thread = &vm->threads[index];
    thread->vm = vm;
    thread->index = index;
    thread->localMark = vm->sweepMark;
    thread->localGeneration = vm->generation;
    thread->seenGeneration.store(vm->generation);
    thread->vmGeneration.store(vm->generation);
    return thread;
}
// Publishes this thread's buffered marked pointers into the shared mark set
// (under markSetMutex) and resets the local buffer.
void flushMarks(EbiThread &thread)
{
EbiVm *vm = thread.vm;
{
mutex_guard mg { vm->markSetMutex };
vm->markSet.insert(thread.markList, thread.markList + thread.numMarked);
}
thread.numMarked = 0;
}
// Slow path of the generation handshake: snapshot this thread's roots and
// stats for the collector, adopt the current sweep mark, publish buffered
// marks, and finally acknowledge the generation via seenGeneration — the
// (seq_cst) store orders the acknowledgment after the snapshot writes, which
// collectStep's acquire fence relies on.
// Fix: removed the unused local `threadIndex` from the original.
__declspec(noinline) void ebiCheckpointSlow(EbiThread &thread)
{
    uint32_t generation = thread.vmGeneration.load(std::memory_order_relaxed);
    thread.numLocalsSnapshot = thread.numLocals;
    for (uint32_t i = 0; i < thread.numLocals; i++) {
        thread.localsSnapshot[i] = thread.locals[i];
    }
    thread.allocationBytesSnapshot = thread.allocationBytes;
    thread.allocationCountSnapshot = thread.allocationCount;
    thread.localMark = thread.vm->sweepMark;
    flushMarks(thread);
    thread.localGeneration = generation;
    thread.seenGeneration.store(generation);
}
// Fast-path checkpoint, inlined into every mutator iteration: a single
// relaxed load and compare; only falls into the slow path when the collector
// has advanced the generation. (__forceinline is MSVC-specific.)
__forceinline void ebiCheckpoint(EbiThread &thread)
{
    const uint32_t seen = thread.vmGeneration.load(std::memory_order_relaxed);
    if (seen == thread.localGeneration) return;
    ebiCheckpointSlow(thread);
}
// Allocates a GC-managed block: one malloc holding the EbiAllocation header
// followed by `size` payload bytes; returns the payload (alloc + 1), or null
// on allocation failure. Registers the header in the VM-wide allocation set
// so the sweep phase can find it.
void *ebiAllocate(EbiThread &thread, EbiType *type, size_t size)
{
EbiAllocation *alloc = (EbiAllocation*)malloc(sizeof(EbiAllocation) + size);
if (!alloc) return nullptr;
// Born with the thread's current mark so an in-progress sweep won't free it.
alloc->mark = thread.localMark;
alloc->size = (uint32_t)size; // NOTE(review): silently truncates sizes >= 4GB
alloc->type = type;
alloc->freeStatus = 0;
alloc->reallyFreed = false;
{
mutex_guard mg { thread.vm->allocMutex };
thread.vm->allocs.insert(alloc);
}
// Per-thread stats; the collector reads these for reporting.
thread.allocationCount += 1;
thread.allocationBytes += size;
return alloc + 1;
}
// Publishes this thread's buffered free requests into the VM-wide free list
// (under freeListMutex) and resets the local buffer.
void flushFree(EbiThread &thread)
{
    EbiVm *const vm = thread.vm;
    {
        mutex_guard mg { vm->freeListMutex };
        EbiFree *const first = thread.freeList;
        EbiFree *const last = first + thread.numFreed;
        vm->freeList.insert(vm->freeList.end(), first, last);
    }
    thread.numFreed = 0;
}
// Queues a pointer for deferred freeing, tagged with this thread's current
// generation: the collector only reclaims it once every thread has advanced
// past that generation (so no stale local can still reference it). The
// buffer flushes to the VM-wide list when it fills up, so numFreed never
// exceeds MAX_FREED.
// Fix: removed the unused local `alloc` (computed via pointer arithmetic but
// never read).
void ebiFree(EbiThread &thread, void *pointer)
{
    if (!pointer) return;
    uint32_t index = thread.numFreed++;
    thread.freeList[index] = { pointer, thread.localGeneration };
    if (index + 1 == MAX_FREED) {
        flushFree(thread);
    }
}
// Mutator-side mark barrier: if a sweep is active (localMark != 0) and the
// allocation has not yet reached the current mark color, advance its mark and
// buffer the pointer for the collector to trace; flush the buffer when full.
// NOTE(review): alloc->mark is read and written by multiple threads without
// synchronization — presumably tolerated by this design; confirm.
void ebiMark(EbiThread &thread, void *pointer)
{
if (thread.localMark != 0 && pointer != nullptr) {
EbiAllocation *alloc = ebiAllocHeader(pointer);
if (alloc->mark < thread.localMark) {
alloc->mark = thread.localMark;
uint32_t index = thread.numMarked++;
thread.markList[index] = pointer;
if (index + 1 == MAX_MARK) {
flushMarks(thread);
}
}
}
}
// Pointer-slot write barrier: marks both the outgoing and the incoming value
// BEFORE performing the store, so a concurrent mark phase cannot miss either
// object across the edge change.
void ebiSet(EbiThread &thread, void **slot, void *pointer)
{
void *prev = *slot;
ebiMark(thread, prev);
ebiMark(thread, pointer);
*slot = pointer;
}
// Collector-side marking: advance the allocation's mark and queue it on the
// collector-private worklist for tracing. In this file it is only called from
// the collector path (collectMark / markRoots); it touches collector-private
// state, so it must not be called from mutator threads.
void ebiMark(EbiVm &vm, void *pointer, uint32_t mark)
{
if (!pointer) return;
EbiAllocation *alloc = ebiAllocHeader(pointer);
if (alloc->mark < mark) {
alloc->mark = mark;
vm.internalMarkList.push_back(pointer);
}
}
// Final reclamation: account the freed bytes, unregister the allocation from
// the global set, flag the header as freed (so any racing ebiAllocHeader
// call asserts instead of silently using dead memory), then free() it.
void deallocate(EbiVm &vm, void *pointer)
{
EbiAllocation *alloc = (EbiAllocation*)((char*)pointer - sizeof(EbiAllocation));
// Stats written without a lock — in the visible code only the collector
// thread calls deallocate.
vm.freedAllocationBytes += alloc->size;
vm.freedAllocationCount++;
{
mutex_guard mg { vm.allocMutex };
vm.allocs.erase(alloc);
}
alloc->reallyFreed = true;
alloc->freeStatus = 1;
free(alloc);
}
// Begins a new collection round: bumps the global generation counter and
// broadcasts the new value to every registered thread. Each thread notices
// the change at its next ebiCheckpoint and runs the slow-path handshake.
void collectStart(EbiVm &vm)
{
    span<EbiThread> allThreads { vm.threads, vm.numThreads };
    const uint32_t newGeneration = vm.generation.fetch_add(1) + 1;
    // Notify all threads of the new generation
    for (EbiThread &t : allThreads) {
        t.vmGeneration.store(newGeneration);
    }
}
// One collector step: returns true once every thread has acknowledged the
// current generation and the deferred-free lists have been processed;
// returns false (caller retries) while any thread is still behind.
bool collectStep(EbiVm &vm)
{
span<EbiThread> threads { vm.threads, vm.numThreads };
uint32_t generation = vm.generation.load();
// 1. Make sure all threads have seen this generation
for (EbiThread &thread : threads) {
uint32_t gen = thread.seenGeneration.load(std::memory_order_relaxed);
if (gen < generation) return false;
}
// Pairs with the stores in ebiCheckpointSlow: the snapshots read below are
// at least as new as the seenGeneration values observed above.
std::atomic_thread_fence(std::memory_order_acquire);
// 2. Collect all held pointers
std::unordered_set<void*> heldPointers;
uint64_t totalBytes = 0;
uint64_t totalAllocs = 0;
for (EbiThread &thread : threads) {
span<void*> slots { thread.localsSnapshot, thread.numLocalsSnapshot };
for (void* slot : slots) {
if (slot) {
heldPointers.insert(slot);
}
}
// NOTE(review): reads the live counters rather than the *Snapshot fields
// taken at the checkpoint — stats only, so a stale read is presumably fine.
totalBytes += thread.allocationBytes;
totalAllocs += thread.allocationCount;
}
// 3. Collect accumulated free list
{
mutex_guard mg { vm.freeListMutex };
vm.internalFreeList.insert(vm.internalFreeList.end(), vm.freeList.begin(), vm.freeList.end());
vm.freeList.clear();
}
// 4. Collect allocations that can be possibly freed
// A request becomes eligible once its generation is strictly older than the
// current one, i.e. every thread has checkpointed past it.
for (size_t i = 0; i < vm.internalFreeList.size(); i++) {
EbiFree free = vm.internalFreeList[i];
if (free.generation < generation) {
EbiAllocation *alloc = ebiAllocHeader(free.pointer);
if (alloc->freeStatus == 0) {
vm.activeFreeList.push_back(free);
alloc->freeStatus = 1;
}
swap_out(vm.internalFreeList, i);
i--;
}
}
// 5. Try to free allocations
// Only reclaim pointers that no thread still holds in its root snapshot;
// also drop them from the pending mark set so marking never touches them.
{
mutex_guard mg { vm.markSetMutex };
for (size_t i = 0; i < vm.activeFreeList.size(); i++) {
EbiFree free = vm.activeFreeList[i];
if (heldPointers.find(free.pointer) == heldPointers.end()) {
vm.markSet.erase(free.pointer);
deallocate(vm, free.pointer);
swap_out(vm.activeFreeList, i);
i--;
}
}
}
uint64_t memoryUsed = totalBytes - vm.freedAllocationBytes;
uint64_t allocCount = totalAllocs - vm.freedAllocationCount;
// NOTE(review): `generation % 1 == 0` is always true, so this prints every
// step — likely a leftover throttling knob. Also %llu assumes 64-bit
// size_t; prefer %zu (or an explicit cast) for the two vector sizes.
if (generation % 1 == 0) {
printf("%u: %.2fk allocs, %.2fkB memory (%llu+%llu leftover)\n", generation, (double)allocCount / 1e3, (double)memoryUsed / 1e3,
vm.internalFreeList.size(), vm.activeFreeList.size());
}
return true;
}
void markRoots(EbiVm &vm);
// Begins a new mark/sweep cycle: bump the mark color and schedule marking to
// start two generations from now, so every thread has checkpointed and
// adopted the new mark before the collector starts tracing.
void markStart(EbiVm &vm)
{
vm.sweepMark.fetch_add(1);
vm.markedRoots = false;
vm.markStartGeneration = vm.generation + 2;
}
// Incremental mark phase. Returns true when the cycle completed and unmarked
// allocations were swept; returns false while marking is still in progress
// (the caller retries on a later collector step).
bool collectMark(EbiVm &vm)
{
uint32_t mark = vm.sweepMark.load();
span<EbiThread> threads { vm.threads, vm.numThreads };
// Wait until two generations after markStart before tracing (see markStart).
if (vm.generation < vm.markStartGeneration) return false;
if (!vm.markedRoots) {
vm.markedRoots = true;
// 1. Mark all thread local roots
for (EbiThread &thread : threads) {
span<void*> slots { thread.localsSnapshot, thread.numLocalsSnapshot };
for (void* slot : slots) {
if (slot) {
ebiMark(vm, slot, mark);
}
}
}
markRoots(vm);
}
// 2. Mark the heap
uint32_t markCycles = 0;
for (;;) {
// Drain marks flushed by mutators into the collector-private staging set.
{
mutex_guard mg { vm.markSetMutex };
vm.internalMarkSet.swap(vm.markSet);
}
for (void *pointer : vm.internalMarkSet) {
vm.internalMarkList.push_back(pointer);
}
vm.internalMarkSet.clear();
if (vm.internalMarkList.empty()) break;
markCycles++;
vm.activeMarkList.swap(vm.internalMarkList);
// Trace each object's typed pointer slots, queueing newly reached objects.
for (void *pointer : vm.activeMarkList) {
EbiAllocation *alloc = ebiAllocHeader(pointer);
void **slots = (void**)pointer;
for (uint32_t slot : alloc->type->slotOffsets) {
void *slotPtr = slots[slot];
if (slotPtr) {
ebiMark(vm, slotPtr, mark);
}
}
}
vm.activeMarkList.clear();
}
// Only sweep on a pass where no new mark work arrived at all; otherwise
// come back next step so mutators keep making progress in the meantime.
if (markCycles > 0) {
return false;
}
// 3. Sweep: anything still carrying an older mark color is unreachable.
std::vector<EbiAllocation*> toFree;
{
mutex_guard mg { vm.allocMutex };
for (EbiAllocation *alloc : vm.allocs) {
if (alloc->mark < mark) {
toFree.push_back(alloc);
}
}
}
for (EbiAllocation *alloc : toFree) {
deallocate(vm, alloc + 1);
}
return true;
}
// Dedicated collector loop: roughly every 1ms it starts a generation
// handshake, spins until every thread acknowledges, advances the incremental
// mark phase, and starts a fresh mark/sweep cycle on every 6th generation —
// but only once the previous cycle has finished.
void collectorThread(EbiVm &vm)
{
bool collectionDone = true;
for (;;) {
std::this_thread::sleep_for(std::chrono::milliseconds{ 1 });
collectStart(vm);
while (!collectStep(vm)) { }
if (collectMark(vm)) {
collectionDone = true;
}
if (collectionDone && vm.generation % 6 == 0) {
collectionDone = false;
markStart(vm);
}
}
}
// Primary template: no type descriptor. Returns nullptr, which the assert in
// ebiAllocHeader will catch if such an allocation is ever inspected —
// specialize per GC-managed type (see ebiTypeof<Tree> below).
template <typename T>
EbiType *ebiTypeof() { return nullptr; }
// Allocates and default-constructs `count` GC-managed objects of type T,
// returning the first element (or nullptr if allocation failed).
// Fix: propagate allocation failure — the original ran placement-new through
// a null pointer when ebiAllocate returned nullptr.
template <typename T>
T *newArray(EbiThread &thread, size_t count)
{
    EbiType *type = ebiTypeof<T>();
    T *data = (T*)ebiAllocate(thread, type, count * sizeof(T));
    if (!data) return nullptr;
    for (T &t : span<T> { data, count }) {
        new (&t) T();
    }
    return data;
}
// Allocates and default-constructs a single GC-managed object of type T.
template <typename T>
T *newObject(EbiThread &thread)
{
return newArray<T>(thread, 1);
}
#define TREE_CHILDREN 12 // fan-out of each tree node
// Stress-test payload: a node consisting solely of GC-managed child pointers
// (slots 0..TREE_CHILDREN-1 are all traceable; see ebiTypeof<Tree>).
struct Tree
{
Tree *children[TREE_CHILDREN];
};
// Type descriptor for Tree: all TREE_CHILDREN pointer slots are traceable.
// Fix: removed `static` — an explicit specialization may not carry a
// storage-class specifier; gcc/clang reject the original declaration.
template <>
EbiType *ebiTypeof<Tree>() {
    // Built once on first use via a magic-static initialized by a lambda.
    static EbiType type = [](){
        EbiType t = { EBI_TYPE_MAGIC };
        for (uint32_t i = 0; i < TREE_CHILDREN; i++) {
            t.slotOffsets.push_back(i);
        }
        return t;
    }();
    return &type;
}
// Statically allocated root node; never GC-freed (observers special-case it).
Tree g_tree;
// PRNG state: a single 64-bit word (a seed of 0 is a fixed point and stays 0).
struct xorshift64_state {
uint64_t a;
};
// One xorshift-style PRNG step. Note this is a nonstandard variant: two
// shifts (<<7, >>9) rather than the usual three-shift xorshift64.
uint64_t xorshift64(struct xorshift64_state *state)
{
    uint64_t v = state->a;
    v ^= v << 7;
    v ^= v >> 9;
    state->a = v;
    return v;
}
// Explicitly frees a subtree: detaches it from its parent slot first (via
// ebiSet, so the write barrier records the edge change), recursively frees
// all children, then queues the node itself for deferred freeing.
void freeTree(EbiThread &thread, Tree **pTree)
{
Tree *tree = *pTree;
if (!tree) return;
ebiSet(thread, (void**)pTree, nullptr);
for (Tree *&branch : tree->children) {
freeTree(thread, &branch);
}
ebiFree(thread, tree);
}
// Marks the global tree's top-level children as GC roots using the current
// sweep mark. NOTE(review): reads g_tree.children without synchronization
// while mutators may be writing them — presumably tolerated by this design.
void markRoots(EbiVm &vm)
{
    const uint32_t mark = vm.sweepMark.load();
    for (Tree *child : g_tree.children) {
        ebiMark(vm, child, mark);
    }
}
// Mutator thread: endless random walk over the shared tree performing the
// full mix of GC operations (navigate, drop root, unlink, explicit free,
// allocate+link). locals[0] always pins the node this thread currently holds.
void mutatorThread(EbiThread &thread, uint64_t seed)
{
Tree *tree = nullptr;
thread.numLocals = 1;
thread.locals[0] = nullptr;
uint32_t steps = 100000;
xorshift64_state rng = { seed };
for (uint32_t iter = 0; ; iter++) {
uint32_t inst = (uint32_t)xorshift64(&rng);
// Participate in the collector's handshake before touching the heap.
ebiCheckpoint(thread);
if (iter % steps == 0) {
std::this_thread::sleep_for(std::chrono::milliseconds{1});
}
if (!tree) {
tree = &g_tree;
}
// Low byte selects the operation; the remaining bits select a child index.
uint32_t op = inst & 0xff;
uint32_t arg = inst >> 8;
if (op < 64) {
// Descend into a child, pinning it as this thread's root.
Tree *branch = tree->children[arg % TREE_CHILDREN];
if (branch) {
ebiAllocHeader(branch); // validity check only (asserts inside)
thread.locals[0] = branch;
tree = branch;
}
} else if (op < 70) {
// Drop the current root; restart from g_tree next iteration.
thread.locals[0] = nullptr;
tree = nullptr;
} else if (op < 75) {
// Unlink a child (leaving it to the collector) and drop the root.
// NOTE(review): `old` is unused.
Tree **branch = &tree->children[arg % TREE_CHILDREN];
Tree *old = *branch;
ebiSet(thread, (void**)branch, nullptr);
tree = nullptr;
thread.locals[0] = nullptr;
} else if (op < 78) {
// Explicitly free a whole subtree.
Tree **branch = &tree->children[arg % TREE_CHILDREN];
freeTree(thread, branch);
thread.locals[0] = nullptr;
} else if (op < 240) {
// Allocate a new node and link it in place of a child (most common op).
Tree **branch = &tree->children[arg % TREE_CHILDREN];
tree = newObject<Tree>(thread);
ebiSet(thread, (void**)branch, tree);
thread.locals[0] = tree;
} else {
// Free the old subtree, then replace it with a fresh node.
Tree **branch = &tree->children[arg % TREE_CHILDREN];
tree = newObject<Tree>(thread);
freeTree(thread, branch);
ebiSet(thread, (void**)branch, tree);
thread.locals[0] = tree;
}
}
}
// Observer thread: read-only random walk. Continually asserts that any node
// it still holds a root to has not been reclaimed out from under it.
// NOTE(review): loadGeneration is written but never read.
void observerThread(EbiThread &thread, uint64_t seed)
{
Tree *tree = nullptr;
thread.numLocals = 1;
thread.locals[0] = nullptr;
uint32_t loadGeneration = 0;
uint32_t steps = 1000;
xorshift64_state rng = { seed };
for (uint32_t iter = 0; ; iter++) {
uint32_t inst = (uint32_t)xorshift64(&rng);
// The held pointer must still look like a live allocation.
if (tree && tree != &g_tree) {
EbiAllocation *allocation = ebiAllocHeader(tree);
ebi_assert(allocation->freeStatus != 2);
}
if (iter % steps == 0) {
// std::this_thread::sleep_for(std::chrono::milliseconds{1});
}
ebiCheckpoint(thread);
if (!tree) {
tree = &g_tree;
thread.locals[0] = nullptr;
}
// Low 7 bits select the operation; the remaining bits select a child index.
uint32_t op = inst & 0x7f;
uint32_t arg = inst >> 8;
if (op < 64) {
// Descend into a child and pin it as this thread's root.
Tree *branch = tree->children[arg % TREE_CHILDREN];
if (branch) {
tree = branch;
thread.locals[0] = tree;
loadGeneration = thread.localGeneration;
EbiAllocation *allocation = ebiAllocHeader(branch);
ebi_assert(allocation->freeStatus != 2);
}
} else if (op < 126) {
// Nop
} else {
// Drop the current root.
thread.locals[0] = nullptr;
tree = nullptr;
}
}
}
// Randomly walks the shared tree, pinning each visited node in a fresh
// thread-local slot so the collector treats it as live while this frame
// inspects it. The depth cap of 64 keeps locals[] (MAX_SLOTS = 128) from
// overflowing.
// Fix: removed the unreachable sleep_for(10ms) that sat after `break;` in
// the original. TODO(review): confirm the sleep was not meant to run before
// the break.
void recursiveObserve(EbiThread &thread, xorshift64_state &rng, Tree *tree, uint32_t depth)
{
    if (depth > 64) return;
    ebiCheckpoint(thread);
    uint32_t loc = thread.numLocals++;
    thread.locals[loc] = nullptr;
    for (uint32_t i = 0; i < 70; i++) {
        uint32_t value = (uint32_t)xorshift64(&rng);
        uint32_t index = value % TREE_CHILDREN;
        if (value % 10000 == 0) {
            break;
        }
        Tree *branch = tree->children[index];
        if (!branch) continue;
        // The pinned child must still look like a live allocation.
        EbiAllocation *allocation = ebiAllocHeader(branch);
        ebi_assert(allocation->freeStatus != 2);
        thread.locals[loc] = branch;
        recursiveObserve(thread, rng, branch, depth + 1);
    }
    thread.numLocals--;
}
// Thread entry: endlessly performs random recursive walks from the global
// root node.
// Fix: removed the unused local `steps` from the original.
void recursiveThread(EbiThread &thread, uint64_t seed)
{
    xorshift64_state rng = { seed };
    for (;;) {
        recursiveObserve(thread, rng, &g_tree, 0);
    }
}
// The single global VM instance shared by every test thread.
EbiVm g_vm;
// NOTE(review): unused — main() hard-codes its thread counts.
#define NUM_OBSERVERS 10
// Spins up two mutators, one observer, and four recursive observers, then
// runs the collector and blocks on it forever (the collector loop never
// returns, so the other joinable threads' destructors are never reached).
// Fix: removed the unused local `observers` vector from the original.
int main(int argc, char **argv)
{
    EbiVm *vm = &g_vm;
    // Start the generation counter at 1 — presumably so 0 can mean
    // "never seen"; TODO confirm.
    vm->generation.store(1);
    std::thread mt1 { [=] { mutatorThread(*ebiNewThread(vm), 1); } };
    std::thread mt2 { [=] { mutatorThread(*ebiNewThread(vm), 2); } };
    std::thread ot { [=] { observerThread(*ebiNewThread(vm), 100); } };
    std::vector<std::thread> rts;
    for (size_t i = 0; i < 4; i++) {
        rts.emplace_back([=] { recursiveThread(*ebiNewThread(vm), i*1000); });
    }
    std::thread ct { [=] { collectorThread(*vm); } };
    ct.join();
    return 0;
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment