Last active
February 22, 2025 07:24
-
-
Save jweinst1/d5c6fa2e8f0dc970d6fece60660897c5 to your computer and use it in GitHub Desktop.
a sorted string table in C++
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <map> | |
#include <string> | |
#include <vector> | |
#include <cstdio> | |
#include <cstdlib> | |
#include <cstdint> | |
#include <ctime> | |
#include <optional> | |
#include <chrono> | |
using namespace std::chrono; | |
static void put_rand_chars_in_string(std::string& foo, size_t n) { | |
while(n--) { | |
foo.push_back((rand() % 90) + 32); | |
} | |
} | |
static void print_mem_as_str(const char* prefix, const void* str, size_t len) { | |
if (prefix != nullptr) { | |
printf("%s ", prefix); | |
} | |
const char* ptr = (const char*)str; | |
while(len--) { | |
putc(*ptr++, stdout); | |
} | |
putc('\n', stdout); | |
} | |
static bool cmp_mem_lt(const void* lhs, size_t llen, const void* rhs, size_t rlen) { | |
size_t shorter = llen < rlen ? llen : rlen; | |
int result = memcmp(lhs, rhs, shorter); | |
return (result == 0 && llen < rlen) || result < 0; | |
} | |
static bool cmp_mem_eq(const void* lhs, size_t llen, const void* rhs, size_t rlen) { | |
if (llen != rlen) return false; | |
return memcmp(lhs, rhs, llen) == 0; | |
} | |
static size_t extendByteVector(std::vector<unsigned char>& bytes, const void* data, size_t n) { | |
size_t off_before = bytes.size(); | |
const unsigned char* ptr = (const unsigned char*)data; | |
while (n--) { | |
bytes.push_back(*ptr++); | |
} | |
return off_before; | |
} | |
static void extendByteVectorWithNumbers(std::vector<unsigned char>& bytes, const std::vector<uint64_t>& numbers) { | |
uint64_t numSize = numbers.size(); | |
extendByteVector(bytes, &numSize, sizeof(numSize)); | |
for (size_t i = 0; i < numSize; ++i) | |
{ | |
uint64_t cur = numbers[i]; | |
extendByteVector(bytes, &cur, sizeof(cur)); | |
} | |
} | |
static void toSortedTable(const std::map<std::string, std::vector<uint64_t>>& pairs, | |
std::vector<unsigned char>& bytes) { | |
bytes.clear(); | |
size_t index_block_size = pairs.size() + 1; | |
uint64_t* nums = new uint64_t[index_block_size](); | |
nums[0] = pairs.size(); | |
extendByteVector(bytes, nums, index_block_size * sizeof(uint64_t)); | |
std::vector<uint64_t> temp_indexes; | |
for (const auto& pair : pairs) { | |
uint64_t key_size = pair.first.size(); | |
uint64_t key_res = extendByteVector(bytes, &key_size, sizeof(key_size)); | |
temp_indexes.push_back(key_res); | |
extendByteVector(bytes, pair.first.data(), key_size); | |
extendByteVectorWithNumbers(bytes, pair.second); | |
} | |
uint64_t* numWriter = (uint64_t*)bytes.data(); | |
++numWriter; // dont overwrite number of indexes | |
for (int i = 0; i < temp_indexes.size(); ++i) | |
{ | |
numWriter[i] = temp_indexes[i]; | |
} | |
delete[] nums; | |
} | |
// todo add binary search on the indexes | |
std::optional<uint64_t> binarySearchSSTable(const std::vector<unsigned char>& bytes, const std::string key) { | |
const uint64_t* numReader = (uint64_t*)bytes.data(); | |
size_t numIndexes = numReader[0]; | |
size_t currentL = 0; | |
size_t currentR = numIndexes; | |
size_t curIndex = (currentL + currentR) / 2; | |
while (currentL <= currentR) { | |
//printf("curIndex is %zu L is %zu, R is %zu\n", curIndex, currentL, currentR); | |
uint64_t keySize = 0; | |
const unsigned char* byteSpot = bytes.data() + numReader[curIndex]; | |
memcpy(&keySize, byteSpot, sizeof(uint64_t)); | |
byteSpot += sizeof(uint64_t); | |
if (cmp_mem_lt(byteSpot, keySize, key.data(), key.size())) { | |
currentL = curIndex + 1; | |
} else if (cmp_mem_eq(byteSpot, keySize, key.data(), key.size())) { | |
return curIndex; | |
} else { | |
if (currentR == 0 || curIndex == 0) { | |
return std::nullopt; | |
} | |
currentR = curIndex - 1; | |
} | |
curIndex = (currentL + currentR) / 2; | |
} | |
return std::nullopt; | |
} | |
static void bigTest() { | |
static constexpr size_t testSize = 1000000; | |
std::vector<uint64_t> myVec = {1, 2, 3}; | |
std::vector<unsigned char> myBytes; | |
std::vector<std::string> myKeys; | |
std::map<std::string, std::vector<uint64_t>> myMap; | |
std::string myString; | |
for (int i = 0; i < testSize; ++i) | |
{ | |
put_rand_chars_in_string(myString, 5); | |
myKeys.push_back(myString); | |
myMap.insert(std::make_pair(myString, myVec)); | |
myString.clear(); | |
} | |
{ | |
auto start = high_resolution_clock::now(); | |
toSortedTable(myMap, myBytes); | |
auto stop = high_resolution_clock::now(); | |
auto duration = duration_cast<microseconds>(stop - start); | |
std::printf("Time taken insert %llu\n", duration.count()); | |
} | |
uint64_t adder = 0; | |
auto start = high_resolution_clock::now(); | |
for (int j = 0; j < testSize; ++j) | |
{ | |
const auto result = binarySearchSSTable(myBytes, myKeys[j]); | |
if (result.has_value()) { | |
adder += result.value(); | |
} | |
} | |
auto stop = high_resolution_clock::now(); | |
printf("Total adder %llu\n", adder); | |
auto duration = duration_cast<microseconds>(stop - start); | |
std::printf("Time taken search %llu\n", duration.count()); | |
} | |
int main(int argc, char const *argv[]) | |
{ | |
srand(time(nullptr)); | |
//Time taken insert 168110 | |
//Total adder 499980610763 | |
//Time taken search 427618 | |
bigTest(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment