Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Last active February 22, 2025 07:24
Show Gist options
  • Save jweinst1/d5c6fa2e8f0dc970d6fece60660897c5 to your computer and use it in GitHub Desktop.
Save jweinst1/d5c6fa2e8f0dc970d6fece60660897c5 to your computer and use it in GitHub Desktop.
a sorted string table in C++
#include <map>
#include <string>
#include <vector>
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <ctime>
#include <optional>
#include <chrono>
using namespace std::chrono;
static void put_rand_chars_in_string(std::string& foo, size_t n) {
while(n--) {
foo.push_back((rand() % 90) + 32);
}
}
static void print_mem_as_str(const char* prefix, const void* str, size_t len) {
if (prefix != nullptr) {
printf("%s ", prefix);
}
const char* ptr = (const char*)str;
while(len--) {
putc(*ptr++, stdout);
}
putc('\n', stdout);
}
static bool cmp_mem_lt(const void* lhs, size_t llen, const void* rhs, size_t rlen) {
size_t shorter = llen < rlen ? llen : rlen;
int result = memcmp(lhs, rhs, shorter);
return (result == 0 && llen < rlen) || result < 0;
}
static bool cmp_mem_eq(const void* lhs, size_t llen, const void* rhs, size_t rlen) {
if (llen != rlen) return false;
return memcmp(lhs, rhs, llen) == 0;
}
static size_t extendByteVector(std::vector<unsigned char>& bytes, const void* data, size_t n) {
size_t off_before = bytes.size();
const unsigned char* ptr = (const unsigned char*)data;
while (n--) {
bytes.push_back(*ptr++);
}
return off_before;
}
static void extendByteVectorWithNumbers(std::vector<unsigned char>& bytes, const std::vector<uint64_t>& numbers) {
uint64_t numSize = numbers.size();
extendByteVector(bytes, &numSize, sizeof(numSize));
for (size_t i = 0; i < numSize; ++i)
{
uint64_t cur = numbers[i];
extendByteVector(bytes, &cur, sizeof(cur));
}
}
static void toSortedTable(const std::map<std::string, std::vector<uint64_t>>& pairs,
std::vector<unsigned char>& bytes) {
bytes.clear();
size_t index_block_size = pairs.size() + 1;
uint64_t* nums = new uint64_t[index_block_size]();
nums[0] = pairs.size();
extendByteVector(bytes, nums, index_block_size * sizeof(uint64_t));
std::vector<uint64_t> temp_indexes;
for (const auto& pair : pairs) {
uint64_t key_size = pair.first.size();
uint64_t key_res = extendByteVector(bytes, &key_size, sizeof(key_size));
temp_indexes.push_back(key_res);
extendByteVector(bytes, pair.first.data(), key_size);
extendByteVectorWithNumbers(bytes, pair.second);
}
uint64_t* numWriter = (uint64_t*)bytes.data();
++numWriter; // dont overwrite number of indexes
for (int i = 0; i < temp_indexes.size(); ++i)
{
numWriter[i] = temp_indexes[i];
}
delete[] nums;
}
// todo add binary search on the indexes
std::optional<uint64_t> binarySearchSSTable(const std::vector<unsigned char>& bytes, const std::string key) {
const uint64_t* numReader = (uint64_t*)bytes.data();
size_t numIndexes = numReader[0];
size_t currentL = 0;
size_t currentR = numIndexes;
size_t curIndex = (currentL + currentR) / 2;
while (currentL <= currentR) {
//printf("curIndex is %zu L is %zu, R is %zu\n", curIndex, currentL, currentR);
uint64_t keySize = 0;
const unsigned char* byteSpot = bytes.data() + numReader[curIndex];
memcpy(&keySize, byteSpot, sizeof(uint64_t));
byteSpot += sizeof(uint64_t);
if (cmp_mem_lt(byteSpot, keySize, key.data(), key.size())) {
currentL = curIndex + 1;
} else if (cmp_mem_eq(byteSpot, keySize, key.data(), key.size())) {
return curIndex;
} else {
if (currentR == 0 || curIndex == 0) {
return std::nullopt;
}
currentR = curIndex - 1;
}
curIndex = (currentL + currentR) / 2;
}
return std::nullopt;
}
static void bigTest() {
static constexpr size_t testSize = 1000000;
std::vector<uint64_t> myVec = {1, 2, 3};
std::vector<unsigned char> myBytes;
std::vector<std::string> myKeys;
std::map<std::string, std::vector<uint64_t>> myMap;
std::string myString;
for (int i = 0; i < testSize; ++i)
{
put_rand_chars_in_string(myString, 5);
myKeys.push_back(myString);
myMap.insert(std::make_pair(myString, myVec));
myString.clear();
}
{
auto start = high_resolution_clock::now();
toSortedTable(myMap, myBytes);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
std::printf("Time taken insert %llu\n", duration.count());
}
uint64_t adder = 0;
auto start = high_resolution_clock::now();
for (int j = 0; j < testSize; ++j)
{
const auto result = binarySearchSSTable(myBytes, myKeys[j]);
if (result.has_value()) {
adder += result.value();
}
}
auto stop = high_resolution_clock::now();
printf("Total adder %llu\n", adder);
auto duration = duration_cast<microseconds>(stop - start);
std::printf("Time taken search %llu\n", duration.count());
}
int main(int argc, char const *argv[])
{
srand(time(nullptr));
//Time taken insert 168110
//Total adder 499980610763
//Time taken search 427618
bigTest();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment