Last active
March 10, 2025 02:07
-
-
Save jweinst1/6fda9ddb5769633ca1bfc2a2f3bbde60 to your computer and use it in GitHub Desktop.
hash merge tree
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cstdint>
#include <ctime>
#include <chrono>
#include <filesystem>
#include <functional>
#include <map>
#include <optional>
#include <regex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
// Bring std::chrono clocks/durations in for the benchmark timing below.
using namespace std::chrono;
// Matches the "label-number" tokens inside data file names, e.g. "level-3".
static const std::regex HASH_FILE_REGEX("[a-zA-Z]+-[0-9]+");
// Token labels used by the DataLocation file-name format "level-L-chunk-C".
static const std::string HASH_FILE_CHUNK = "chunk";
static const std::string HASH_FILE_LEVEL = "level";
// Open-addressing hash -> chunk-index cache with linear probing.
// A slot whose hash is 0 is treated as empty, so hash value 0 cannot be
// stored (see the TODO on hashWithMapFunc about reserved hash values).
struct ChunkCache {
    struct Slot {
        size_t hash = 0;   // 0 == empty slot
        size_t chunk = 0;  // chunk index recorded for this hash
    };
    size_t count = 0;      // occupied slots
    size_t size = 0;       // total slot capacity
    Slot* slots = nullptr;

    explicit ChunkCache(size_t amount): size(amount), slots(new Slot[amount]()) {
    }
    // Rule of five: the original left the destructor commented out (leak)
    // because the implicit shallow copy would have double-freed `slots`.
    // A deep copy plus a cheap move makes by-value ownership safe.
    ChunkCache(const ChunkCache& other)
        : count(other.count), size(other.size), slots(new Slot[other.size]()) {
        memcpy(slots, other.slots, other.size * sizeof(Slot)); // Slot is trivially copyable
    }
    ChunkCache(ChunkCache&& other) noexcept
        : count(other.count), size(other.size), slots(other.slots) {
        other.count = 0;
        other.size = 0;
        other.slots = nullptr;
    }
    // Unified copy/move assignment via copy-and-swap (self-assign safe).
    ChunkCache& operator=(ChunkCache other) noexcept {
        std::swap(count, other.count);
        std::swap(size, other.size);
        std::swap(slots, other.slots);
        return *this;
    }
    ~ChunkCache() {
        delete[] slots;
    }
    // Triples the capacity and re-inserts every live entry.
    void expand() {
        const size_t oldSize = size;
        Slot* oldSlots = slots;
        size = oldSize * 3;
        count = 0;
        slots = new Slot[size]();
        for (size_t i = 0; i < oldSize; ++i) {
            // Original bug: empty slots (hash == 0) were re-inserted too,
            // inflating `count` by oldSize and doing useless probe work.
            if (oldSlots[i].hash != 0) {
                insert(oldSlots[i].hash, oldSlots[i].chunk);
            }
        }
        delete[] oldSlots;
    }
    // Returns the chunk recorded for `hash`, or (size_t)-1 when absent.
    // Linear probe from hash % size, stopping at the first empty slot.
    size_t get(size_t hash) const {
        const size_t mySpot = hash % size;
        for (size_t step = 0; step < size; ++step) {
            const size_t i = (mySpot + step) % size;
            if (slots[i].hash == hash) {
                return slots[i].chunk;
            }
            if (slots[i].hash == 0) {
                return (size_t)-1;
            }
        }
        return (size_t)-1;
    }
    // Records hash -> chunk. Duplicate hashes occupy additional slots; get()
    // returns the first one inserted. Expands at 50% load so probing always
    // finds a free slot.
    void insert(size_t hash, size_t chunk) {
        if (count >= size / 2) {
            expand();
        }
        ++count;
        const size_t mySpot = hash % size;
        for (size_t step = 0; step < size; ++step) {
            const size_t i = (mySpot + step) % size;
            if (slots[i].hash == 0) {
                slots[i].hash = hash;
                slots[i].chunk = chunk;
                return;
            }
        }
    }
};
static std::optional<size_t> getFileSizeOnDisk(const std::string& fpath) { | |
struct stat statbuf; | |
if (stat(fpath.c_str(), &statbuf) == -1) { | |
return std::nullopt; | |
} | |
return statbuf.st_size; | |
} | |
// Appends n pseudo-random printable characters (ASCII 32..121) to foo,
// consuming one rand() call per character.
static void put_rand_chars_in_string(std::string& foo, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        foo.push_back(static_cast<char>(rand() % 90 + 32));
    }
}
// True when `base` ends with `suffix` (empty suffix and equal-length cases
// included).
// Original bug: it used find(), which locates the FIRST occurrence, so a
// suffix that also appears earlier in the string was misreported — e.g.
// stringEndsWith("abcab", "ab") returned false. It also underflowed
// base.size() - suffix.size() when the suffix was longer than the base.
static bool stringEndsWith(const std::string& base, const std::string& suffix) {
    if (suffix.size() > base.size()) {
        return false;
    }
    return base.compare(base.size() - suffix.size(), suffix.size(), suffix) == 0;
}
// True when `base` begins with `prefix`.
// Uses compare(), which inspects only the first prefix.size() characters;
// the original find() could scan the whole string before concluding "no".
static bool stringStartsWith(const std::string& base, const std::string& prefix) {
    return base.compare(0, prefix.size(), prefix) == 0;
}
// Identifies where a serialized table lives on disk: compaction level and
// chunk index within that level. Round-trips through the file-name format
// "level-<L>-chunk-<C>".
struct DataLocation {
    size_t level = 0;
    size_t chunk = 0;
    // Parses "level-N"/"chunk-N" tokens out of a file name or path.
    static std::optional<DataLocation> fromString(const std::string& base);
    // Writes the canonical "level-L-chunk-C" name into target.
    void toString(std::string& target);
};
// Renders this location as the canonical file name "level-<L>-chunk-<C>".
void DataLocation::toString(std::string& target) {
    char formatted[256] = {0};
    snprintf(formatted, sizeof(formatted), "level-%zu-chunk-%zu", level, chunk);
    target = formatted;
}
// parses file patterns | |
std::optional<DataLocation> DataLocation::fromString(const std::string& base) { | |
DataLocation dl; | |
bool foundLevel = false; | |
bool foundChunk = false; | |
auto wordsBegin = std::sregex_iterator(base.begin(), base.end(), HASH_FILE_REGEX); | |
auto wordsEnd = std::sregex_iterator(); | |
for (std::sregex_iterator i = wordsBegin; i != wordsEnd; ++i) { | |
const std::string match = i->str(); | |
const auto splitter = match.find('-'); | |
const auto label = match.substr(0, splitter); | |
if ( label == HASH_FILE_LEVEL) { | |
try { | |
dl.level = std::stoll(match.substr(splitter + 1)); | |
} catch (const std::exception&) { | |
return std::nullopt; | |
} | |
} else if (label == HASH_FILE_CHUNK) { | |
try { | |
dl.chunk = std::stoll(match.substr(splitter + 1)); | |
} catch (const std::exception&) { | |
return std::nullopt; | |
} | |
} | |
} | |
return dl; | |
} | |
// Debug helper: prints `len` raw bytes from `str` to stdout, optionally
// preceded by "<prefix> ", terminated with a newline.
static void print_mem_as_str(const char* prefix, const void* str, size_t len) {
    if (prefix) {
        printf("%s ", prefix);
    }
    const char* cursor = static_cast<const char*>(str);
    for (size_t i = 0; i < len; ++i) {
        putc(cursor[i], stdout);
    }
    putc('\n', stdout);
}
// Lexicographic "less than" over two raw byte ranges; a strict prefix
// compares as smaller, mirroring std::string ordering.
static bool cmp_mem_lt(const void* lhs, size_t llen, const void* rhs, size_t rlen) {
    const size_t common = (llen < rlen) ? llen : rlen;
    const int cmp = memcmp(lhs, rhs, common);
    if (cmp != 0) {
        return cmp < 0;
    }
    return llen < rlen; // equal prefixes: shorter range is smaller
}
// Byte-wise equality: ranges match only when both length and content match.
static bool cmp_mem_eq(const void* lhs, size_t llen, const void* rhs, size_t rlen) {
    return llen == rlen && memcmp(lhs, rhs, llen) == 0;
}
// Note: this will change the size of the vector, so | |
// make sure to fill it in! | |
// Grows `bytes` by n zero-initialized bytes and returns the offset where
// the new region begins.
// Note: this will change the size of the vector, so make sure to fill it in!
static size_t resizeByteVector(std::vector<unsigned char>& bytes, size_t n) {
    const size_t regionStart = bytes.size();
    bytes.resize(regionStart + n);
    return regionStart;
}
// Appends n bytes copied from `data` to `bytes`; returns the offset of the
// copied region.
// n == 0 is guarded so memcpy never receives a null source/destination
// pointer (undefined behavior even for zero-length copies).
static size_t extendByteVector(std::vector<unsigned char>& bytes, const void* data, size_t n) {
    const size_t off_before = bytes.size();
    if (n != 0) {
        bytes.resize(off_before + n);
        memcpy(bytes.data() + off_before, data, n);
    }
    return off_before;
}
// Appends n copies of byte c to `bytes`; returns the offset of the first
// appended byte.
// n == 0 is guarded so memset never receives the (possibly null) data()
// pointer of an empty vector.
static size_t extendByteVectorWithByte(std::vector<unsigned char>& bytes, unsigned char c, size_t n) {
    const size_t off_before = bytes.size();
    if (n != 0) {
        bytes.resize(off_before + n);
        memset(bytes.data() + off_before, c, n);
    }
    return off_before;
}
// Serializes `value` as [uint64 length][raw bytes] appended to `bytes`;
// returns the offset of the length prefix. (Single resize + two memcpy
// calls instead of two extendByteVector round-trips.)
static size_t extendByteVectorWithString(std::vector<unsigned char>& bytes, const std::string& value) {
    const uint64_t length = value.size();
    const size_t recordStart = bytes.size();
    bytes.resize(recordStart + sizeof(length) + value.size());
    memcpy(bytes.data() + recordStart, &length, sizeof(length));
    memcpy(bytes.data() + recordStart + sizeof(length), value.data(), value.size());
    return recordStart;
}
/** | |
* Non owning pointer to bytes | |
* */ | |
/**
 * Non owning pointer to bytes
 * */
struct DataSegment {
    unsigned char* dataPtr;  // borrowed; lifetime owned by the caller (vector or mmap)
    size_t len;              // number of valid bytes at dataPtr
    DataSegment(unsigned char* data, size_t size): dataPtr(data), len(size) {}
    unsigned char* data() const { return dataPtr;}
    size_t size() const { return len; }
};
// A serialized hash table mmap'ed read/write from disk. Lifetime is managed
// manually via closeDown(): instances are copied by value into the level
// vectors, so an unmapping destructor would double-release the mapping.
struct HashTableFile {
    unsigned char* mapped = nullptr;  // mmap'ed file contents, file_size bytes
    int fd = -1;                      // descriptor backing the mapping; -1 when closed
    size_t file_size = 0;             // size of the file (and the mapping) in bytes
    size_t level = 0;                 // compaction level parsed from the file name
    size_t chunk = 0;                 // chunk index parsed from the file name
    std::string fpath;                // path the mapping was created from
    // Synchronously flushes dirty mapped pages back to the file.
    bool syncToDisk() {
        return msync(mapped, file_size, MS_SYNC) == 0;
    }
    // Opens and mmap's an existing table file whose name carries "level-N"
    // and "chunk-N" tokens (see DataLocation::fromString). Returns nullopt
    // on stat/parse/open/mmap failure.
    static std::optional<HashTableFile> fromFile(const std::string& fpath) {
        HashTableFile hf;
        const auto checkSize = getFileSizeOnDisk(fpath);
        if (!checkSize.has_value()) {
            return std::nullopt;
        }
        const auto checkLevelAndChunk = DataLocation::fromString(fpath);
        if (!checkLevelAndChunk.has_value()) {
            return std::nullopt;
        }
        const DataLocation& dl = checkLevelAndChunk.value();
        hf.level = dl.level;
        hf.chunk = dl.chunk;
        hf.file_size = checkSize.value();
        hf.fpath = fpath;
        // NOTE(review): the 0666 mode is ignored without O_CREAT; and mmap
        // of a zero-length file fails — files are assumed non-empty here.
        hf.fd = open(fpath.c_str(), O_RDWR, 0666);
        if (hf.fd == -1) {
            return std::nullopt;
        }
        void* mapResult = mmap(nullptr, hf.file_size, PROT_READ | PROT_WRITE, MAP_SHARED, hf.fd, 0);
        if (mapResult == MAP_FAILED) {
            close(hf.fd);
            return std::nullopt;
        }
        hf.mapped = static_cast<unsigned char*>(mapResult);
        return hf;
    }
    // Scans `dpath` and groups every parseable table file into
    // levels[level]; guarantees at least an (empty) level 0 on return.
    // Propagates std::filesystem::filesystem_error if the directory is
    // unreadable.
    static void loadLevelsInDir(const std::string& dpath,
        std::vector<std::vector<HashTableFile>>& levels) {
        const std::filesystem::path targetDir{dpath};
        for (auto const& dir_entry : std::filesystem::directory_iterator{targetDir}) {
            const auto result = HashTableFile::fromFile(dir_entry.path().string());
            if (result.has_value()) {
                const auto myVal = result.value();
                // Grow the level list until this file's level exists.
                while (levels.size() < myVal.level + 1) {
                    levels.push_back(std::vector<HashTableFile>{});
                }
                levels[myVal.level].push_back(myVal);
            }
        }
        if (levels.size() == 0) {
            // have at least 0 level ready
            levels.push_back(std::vector<HashTableFile>{});
        }
    }
    unsigned char* data() const { return mapped; }
    size_t size() const { return file_size; }
    // Borrow the whole mapping as a non-owning byte view.
    DataSegment asDataSegment() const {
        return DataSegment{mapped, file_size};
    }
    // Releases the mapping and descriptor; safe to call more than once.
    void closeDown() {
        if (fd != -1 && mapped != nullptr) {
            munmap(mapped, file_size);
            close(fd);
            fd = -1;
            mapped = nullptr;
        }
    }
protected:
    HashTableFile() = default;
};
// Writes the whole buffer to `path`, truncating any existing file.
// Returns false when the file cannot be opened, the write is short, or the
// close fails (the original ignored fwrite's return value entirely).
static bool byteVectorToFile(const std::vector<unsigned char>& bytes, const std::string& path) {
    FILE* fp = std::fopen(path.c_str(), "wb+");
    if (fp == nullptr) {
        return false;
    }
    bool ok = true;
    if (!bytes.empty()) { // fwrite with a null data() pointer is avoided
        ok = std::fwrite(bytes.data(), 1, bytes.size(), fp) == bytes.size();
    }
    if (std::fclose(fp) != 0) {
        ok = false;
    }
    return ok;
}
// Hashes a key exactly the way std::unordered_map<std::string, ...> does:
// the map's default hasher IS std::hash<std::string>, so call it directly
// instead of constructing a throwaway static map to borrow hash_function()
// (the original approach paid a guarded static-init on every first call).
// TODO: the on-disk format reserves hash values 0 (empty slot) and 1
// (delete marker); a real key hashing to 0 or 1 would corrupt lookups.
static inline size_t hashWithMapFunc(const std::string& key) {
    return std::hash<std::string>{}(key);
}
static uint64_t getSlotSizeOfTable(const DataSegment& bytes) { | |
return ((uint64_t*)bytes.data())[0]; // one set for hash, other for index | |
} | |
// Bytes occupied by the key/value records: total size minus the hash-slot
// array (slotCount uint64 entries, read from the header's first word).
static uint64_t getKeyAndValSizeOfTable(const std::vector<unsigned char>& bytes) {
    uint64_t slotCount = 0;
    memcpy(&slotCount, bytes.data(), sizeof(slotCount));
    return bytes.size() - slotCount * sizeof(uint64_t);
}
// Advances a probe index with wrap-around: indices at or past max-1 wrap
// back to slot 0.
static inline size_t incrementNumberSlot(size_t cur, size_t max) {
    return (cur + 1 < max) ? cur + 1 : 0;
}
// Serializes `pairs` into the on-disk open-addressed format:
//   [uint64 slotCount][slotCount hash slots][slotCount offset slots]
//   [records: (uint64 keyLen)(key)(uint64 valLen)(value)...]
// Placement starts each entry's probe at its unordered_map bucket index,
// which assumes the implementation's bucket is hash % bucket_count so that
// hashSearchTable (probing from hash % slotCount) finds the entry again.
// NOTE(review): a key whose hash is 0 or 1 would collide with the
// empty/deleted slot sentinels (see the TODO on hashWithMapFunc).
static void toHashedTable(const std::unordered_map<std::string, std::string>& pairs,
    std::vector<unsigned char>& bytes) {
    bytes.clear();
    const size_t bucket_size = pairs.bucket_count();
    size_t hash_block_size = pairs.bucket_count() * 2; // [hash][index]
    // Reserve a zeroed header: slot count + hash array + offset array.
    extendByteVectorWithByte(bytes, 0, (hash_block_size + 1) * sizeof(uint64_t));
    uint64_t* hash_nums = (uint64_t*)bytes.data();
    hash_nums[0] = hash_block_size / 2;  // slot count == bucket_count
    size_t hash_i = 0;
    for (size_t bucket_i = 0; bucket_i < pairs.bucket_count(); ++bucket_i) {
        hash_i = bucket_i;  // probe starts at the entry's bucket index
        for (auto it = pairs.begin(bucket_i); it != pairs.end(bucket_i); ++it) {
            uint64_t spot = extendByteVectorWithString(bytes, it->first);
            uint64_t itemHash = hashWithMapFunc(it->first);
            extendByteVectorWithString(bytes, it->second);
            // Re-fetch the header pointer: the appends above may reallocate.
            uint64_t* hash_table = ((uint64_t*)bytes.data()) + 1;
            // Linear probe to the first free slot (hash 0 == empty).
            while (hash_table[hash_i] != 0) {
                hash_i = incrementNumberSlot(hash_i, bucket_size);
            }
            hash_table[hash_i] = itemHash;
            hash_table[hash_i + bucket_size] = spot;  // parallel offset slot
        }
    }
}
// Copies the value stored at hash-slot `index` of a serialized table into
// `val`. `index` must come from hashSearchTable over the same segment.
void convertIndexToValueString(const DataSegment& bytes, uint64_t index, std::string& val) {
    // consider cacheing?
    uint64_t tableSize = getSlotSizeOfTable(bytes);
    // Offset array sits right after the hash array: word [0] is the slot
    // count, [1..tableSize] the hashes, [1+tableSize..2*tableSize] the
    // record offsets.
    uint64_t keyIndex = ((const uint64_t*)(bytes.data()))[1 + index + tableSize];
    const unsigned char* dataSpot = bytes.data() + keyIndex;
    uint64_t keySize = 0;
    uint64_t valSize = 0;
    // Record layout: [uint64 keyLen][key bytes][uint64 valLen][value bytes].
    memcpy(&keySize, dataSpot, sizeof(uint64_t));
    dataSpot += sizeof(uint64_t) + keySize;  // skip length word and the key
    memcpy(&valSize, dataSpot, sizeof(uint64_t));
    dataSpot += sizeof(uint64_t);
    val.assign((const char*)dataSpot, valSize);
}
std::optional<uint64_t> hashSearchTable(const DataSegment& bytes, const std::string& key) { | |
const uint64_t* numReader = (uint64_t*)bytes.data(); | |
size_t numHashes = numReader[0]; | |
size_t numHashesMinusOne = numHashes - 1; | |
const size_t myHash = hashWithMapFunc(key); | |
size_t myHashSpot = (myHash % numHashes); // for dual index | |
++numReader; | |
size_t hash_i = myHashSpot; | |
if (numReader[hash_i] == 0) { | |
return std::nullopt; | |
} | |
// implict delete market of 1. | |
if (numReader[hash_i] == myHash) { | |
const unsigned char* byteSpot = bytes.data() + numReader[hash_i + numHashes]; | |
uint64_t keySize = 0; | |
memcpy(&keySize, byteSpot, sizeof(uint64_t)); | |
byteSpot += sizeof(uint64_t); | |
if (cmp_mem_eq(byteSpot, keySize, key.data(), key.size())) { | |
return hash_i; | |
} | |
} | |
++hash_i; | |
while (hash_i < numHashes) { | |
if (numReader[hash_i] == 0) { | |
return std::nullopt; | |
} else if (numReader[hash_i] == myHash) { | |
const unsigned char* byteSpot = bytes.data() + numReader[hash_i + numHashes]; | |
uint64_t keySize = 0; | |
memcpy(&keySize, byteSpot, sizeof(uint64_t)); | |
byteSpot += sizeof(uint64_t); | |
if (cmp_mem_eq(byteSpot, keySize, key.data(), key.size())) { | |
return hash_i; | |
} | |
} | |
++hash_i; | |
} | |
hash_i = 0; | |
while (hash_i < myHashSpot) { | |
if (numReader[hash_i] == 0) { | |
return std::nullopt; | |
} else if (numReader[hash_i] == myHash) { | |
const unsigned char* byteSpot = bytes.data() + numReader[hash_i + numHashes]; | |
uint64_t keySize = 0; | |
memcpy(&keySize, byteSpot, sizeof(uint64_t)); | |
byteSpot += sizeof(uint64_t); | |
if (cmp_mem_eq(byteSpot, keySize, key.data(), key.size())) { | |
return hash_i; | |
} | |
} | |
++hash_i; | |
} | |
return std::nullopt; | |
} | |
// Marks `key` as deleted in the serialized table by overwriting its hash
// slot with the sentinel value 1. Returns false when the key is absent.
bool hashTableDelete(DataSegment& bytes, const std::string& key) {
    const auto slot = hashSearchTable(bytes, key);
    if (!slot.has_value()) {
        return false;
    }
    // +1 skips the leading slot-count word to reach the hash array.
    ((uint64_t*)bytes.data())[slot.value() + 1] = 1; // delete marker
    return true;
}
size_t hashTableMergeInto(std::unordered_map<std::string, std::string>& merger, const DataSegment& t1) { | |
// iter indexes for now | |
uint64_t t1size = getSlotSizeOfTable(t1); | |
size_t keysMerged = 0; | |
const uint64_t* reader1 = (const uint64_t*)t1.data(); | |
for (size_t i = 1; i < t1size; ++i) | |
{ | |
if (reader1[i] < 2) | |
continue; | |
const unsigned char* byteSpot = t1.data() + reader1[i + t1size]; | |
uint64_t keySize = 0; | |
uint64_t valSize = 0; | |
memcpy(&keySize, byteSpot, sizeof(uint64_t)); | |
byteSpot += sizeof(uint64_t); | |
const std::string curKey((const char*)byteSpot, keySize); | |
byteSpot += keySize; | |
memcpy(&valSize, byteSpot, sizeof(uint64_t)); | |
byteSpot += sizeof(uint64_t); | |
const std::string curVal((const char*)byteSpot, valSize); | |
merger.insert(std::make_pair(curKey, curVal)); | |
++keysMerged; | |
} | |
return keysMerged; | |
} | |
// In-memory write buffer plus a hash -> level-0-chunk cache used to route
// reads to the right on-disk chunk after the buffer has been flushed.
struct MemTable {
    // NOTE(review): 40M slots * sizeof(Slot) (two size_t each) is roughly
    // 640 MB on LP64, allocated up front.
    ChunkCache _cache;
    std::unordered_map<std::string, std::string> _items;
    size_t currentDataSize = 0;  // approximate bytes of buffered keys + values
    MemTable(): _cache(40 * 1024 * 1024) {
        _items.max_load_factor(0.8);
    }
    // Records every currently-buffered key as living in `chunk`, so gets
    // can still find it once the memtable is flushed to that chunk file.
    void cacheAtChunk(size_t chunk) {
        for (const auto& pair : _items) {
            _cache.insert(hashWithMapFunc(pair.first), chunk);
        }
    }
};
// Tuning/placement options passed to HashMergeTable::openDB.
struct HashMergeTableConfig {
    size_t maxMemTableSize = 1024 * 1024;  // flush threshold: bytes of buffered key+value data
    std::string dpath;                     // directory holding the level/chunk files
};
// A minimal log-structured key/value store: writes buffer in a MemTable and
// flush to mmap'ed level-0 chunk files; reads consult the memtable first,
// then the hash -> chunk cache.
class HashMergeTable {
public:
    // Opens a store over cfg.dpath, loading existing level files; nullopt
    // when the directory cannot be read.
    static std::optional<HashMergeTable> openDB(const HashMergeTableConfig& cfg);
    // Syncs and unmaps every file; the table must not be used afterwards.
    static void closeDB(HashMergeTable& table);
    bool get(const std::string& key, std::string& val);
    bool del(const std::string& key);
    bool put(const std::string&key, const std::string& value);
    void debug() {
        //_memTable.debugPrintCollisions();
    }
    // msync every mapped chunk file across all levels.
    void syncAllLevelFiles() {
        for (auto& level : _levels) {
            for (auto& hfile : level) {
                hfile.syncToDisk();
            }
        }
    }
    // Serializes the memtable into a new level-0 chunk file and resets it.
    bool dumpMemTableToDisk();
private:
    void closeDownAllFiles();
    HashMergeTable() = default;
    HashMergeTableConfig _cfg;
    std::string _dirPath;  // NOTE(review): appears unused; the path comes from _cfg.dpath
    MemTable _memTable;
    std::vector<std::vector<HashTableFile>> _levels;  // _levels[level][chunk]
};
void HashMergeTable::closeDownAllFiles() { | |
for (auto& level : _levels) { | |
for (auto& hfile : level) { | |
hfile.closeDown(); | |
} | |
} | |
} | |
std::optional<HashMergeTable> HashMergeTable::openDB(const HashMergeTableConfig& cfg) { | |
HashMergeTable ht; | |
try { | |
HashTableFile::loadLevelsInDir(cfg.dpath, ht._levels); | |
} catch (std::filesystem::filesystem_error const& ex) { | |
(void)ex; | |
return std::nullopt; | |
} | |
ht._cfg = cfg; | |
return ht; | |
} | |
// Flushes all mapped files to disk, then releases their mappings and
// descriptors. The table must not be used after this call.
void HashMergeTable::closeDB(HashMergeTable& table) {
    table.syncAllLevelFiles();
    table.closeDownAllFiles();
}
bool HashMergeTable::get(const std::string& key, std::string& val) { | |
const auto found = _memTable._items.find(key); | |
if ( found != _memTable._items.end()) { | |
val = found->second; | |
return true; | |
} | |
const auto foundCache = _memTable._cache.get(hashWithMapFunc(key)); | |
if (foundCache != (size_t)-1) { | |
//for (const auto& chunk : foundCache->second) { | |
const DataSegment got = _levels[0][foundCache].asDataSegment(); | |
const auto result = hashSearchTable(got, key); | |
if (result.has_value()) { | |
convertIndexToValueString(got, result.value(), val); | |
return true; | |
} | |
//} | |
} | |
puts("Got dup!"); | |
return false; | |
} | |
bool HashMergeTable::del(const std::string& key) { | |
size_t erased_count = _memTable._items.erase("banana"); | |
if (erased_count > 0) { | |
return true; | |
} | |
for (const auto& level: _levels) { | |
for (const auto& table: level) { | |
DataSegment got = table.asDataSegment(); | |
if (hashTableDelete(got, key)) { | |
return true; | |
} | |
} | |
} | |
return false; | |
} | |
bool HashMergeTable::dumpMemTableToDisk() { | |
unsigned long nextChunk = _levels[0].size(); | |
_memTable.cacheAtChunk(nextChunk); | |
std::vector<unsigned char> bytes; | |
toHashedTable(_memTable._items, bytes); | |
DataLocation dl; | |
std::string pathString; | |
dl.level = 0; | |
dl.chunk = nextChunk; | |
dl.toString(pathString); | |
const std::string fullPath = _cfg.dpath + "/" + pathString; | |
if (!byteVectorToFile(bytes, fullPath)) { | |
return false; | |
} | |
std::optional<HashTableFile> newHash = HashTableFile::fromFile(fullPath); | |
if (!newHash.has_value()) { | |
return false; | |
} | |
_levels[0].push_back(newHash.value()); | |
_memTable._items.clear(); | |
_memTable.currentDataSize = 0; | |
return true; | |
} | |
bool HashMergeTable::put(const std::string& key, const std::string& value) { | |
_memTable._items.insert(std::make_pair(key, value)); | |
_memTable.currentDataSize += key.size() + value.size(); | |
if (_memTable.currentDataSize > _cfg.maxMemTableSize) { | |
if (!dumpMemTableToDisk()) { | |
// consider error handling | |
return false; | |
} | |
} | |
return true; | |
} | |
// tests begin here. | |
static void bigTest1() { | |
static constexpr size_t testSize = 1000000; | |
static const std::string myValue = "foobar"; | |
std::vector<unsigned char> myBytes; | |
std::vector<std::string> myKeys; | |
std::unordered_map<std::string, std::string> myMap; | |
std::string myString; | |
myMap.max_load_factor(0.5); | |
for (int i = 0; i < testSize; ++i) | |
{ | |
put_rand_chars_in_string(myString, 16); | |
myKeys.push_back(myString); | |
// add some missed lookups | |
if (i % 2 == 0) myMap.insert(std::make_pair(myString, myValue)); | |
myString.clear(); | |
} | |
{ | |
auto start = high_resolution_clock::now(); | |
toHashedTable(myMap, myBytes); | |
auto stop = high_resolution_clock::now(); | |
auto duration = duration_cast<microseconds>(stop - start); | |
std::printf("Time taken insert %llu\n", duration.count()); | |
} | |
DataSegment ds(myBytes.data(), myBytes.size()); | |
uint64_t adder = 0; | |
auto start = high_resolution_clock::now(); | |
for (int j = 0; j < testSize; ++j) | |
{ | |
const auto result = hashSearchTable(ds, myKeys[j]); | |
if (result.has_value()) { | |
adder += result.value(); | |
} | |
} | |
auto stop = high_resolution_clock::now(); | |
printf("Total adder %llu\n", adder); | |
auto duration = duration_cast<microseconds>(stop - start); | |
std::printf("Time taken search %llu\n", duration.count()); | |
} | |
static void bigTest2() { | |
//Time taken HT insert 19004083 | |
//Time taken HT search 10317895 | |
static constexpr size_t testSize = 40000000; | |
static const std::string myValue = "foobar"; | |
static const std::string myDir = "foo"; | |
std::filesystem::create_directory(myDir); | |
std::vector<std::string> myKeys; | |
HashMergeTableConfig cfg; | |
cfg.dpath = myDir; | |
cfg.maxMemTableSize = 1024 * 1024 * 2; | |
std::optional<HashMergeTable> ht = HashMergeTable::openDB(cfg); | |
if (!ht.has_value()) { | |
fprintf(stderr, "FATAL cannot open db at %s\n", cfg.dpath.c_str()); | |
abort(); | |
} | |
auto myTable = ht.value(); | |
std::string myString; | |
for (int i = 0; i < testSize; ++i) | |
{ | |
put_rand_chars_in_string(myString, 8); | |
myKeys.push_back(myString); | |
myString.clear(); | |
} | |
auto start = high_resolution_clock::now(); | |
for (const auto& k : myKeys) { | |
myTable.put(k, myValue); | |
} | |
auto stop = high_resolution_clock::now(); | |
auto duration = duration_cast<microseconds>(stop - start); | |
std::printf("Time taken HT insert %llu\n", duration.count()); | |
std::string valRet; | |
auto start2 = high_resolution_clock::now(); | |
for (const auto& k : myKeys) { | |
myTable.get(k, valRet); | |
} | |
auto stop2 = high_resolution_clock::now(); | |
auto duration2 = duration_cast<microseconds>(stop2 - start2); | |
std::printf("Time taken HT search %llu\n", duration2.count()); | |
HashMergeTable::closeDB(myTable); | |
std::filesystem::remove_all(myDir); | |
} | |
// Test helper: reports a failed expectation (expression text + line number)
// on stderr; passing conditions produce no output.
static void checkCondition(int cond, const char* express, unsigned lineno) {
    if (cond) {
        return;
    }
    fprintf(stderr, "FAIL %s line %u\n", express, lineno);
}
#define CHECKIT(cond) checkCondition(cond, #cond, __LINE__) | |
static void testDelete() { | |
std::unordered_map<std::string, std::string> myMap; | |
myMap.insert(std::make_pair("foo", "bar")); | |
myMap.insert(std::make_pair("foo1", "bar")); | |
myMap.insert(std::make_pair("foo2", "bar")); | |
std::vector<unsigned char> bytes; | |
toHashedTable(myMap, bytes); | |
DataSegment ds(bytes.data(), bytes.size()); | |
auto res = hashSearchTable(ds, "foo1"); | |
CHECKIT(res.has_value()); | |
if (!res.has_value()) { | |
fprintf(stderr, "Expected foo1 to be in map\n"); | |
} | |
CHECKIT(hashTableDelete(ds, "foo1")); | |
res = hashSearchTable(ds, "foo1"); | |
CHECKIT(!res.has_value()); | |
if (res.has_value()) { | |
fprintf(stderr, "Expected foo1 to NOT be in map, val=%llu\n", res.value()); | |
} | |
} | |
// Checks extendByteVectorWithByte appends exactly n copies of the byte
// after the existing content.
static void testByteExtendVec() {
    std::vector<unsigned char> buffer;
    buffer.push_back(3);
    extendByteVectorWithByte(buffer, 3, 10);
    CHECKIT(buffer.size() == 11);
    CHECKIT(buffer[1] == 3);
    CHECKIT(buffer[10] == 3);
}
static void testMerges() { | |
std::unordered_map<std::string, std::string> myMapNew; | |
printf("Hash of foo is %zu\n", hashWithMapFunc(std::string("foo"))); | |
std::unordered_map<std::string, std::string> myMap; | |
myMap.max_load_factor(0.5); | |
myMap.insert(std::make_pair("foo", "bar")); | |
myMap.insert(std::make_pair("foo1", "bar")); | |
myMap.insert(std::make_pair("foo2", "bar")); | |
myMap.insert(std::make_pair("foo3", "barstobars")); | |
std::unordered_map<std::string, std::string> myMap2; | |
myMap2.max_load_factor(0.5); | |
myMap2.insert(std::make_pair("foo", "bar")); | |
myMap2.insert(std::make_pair("foo92", "bar")); | |
myMap2.insert(std::make_pair("foo45", "bar")); | |
myMap2.insert(std::make_pair("foo3", "barstobars")); | |
std::vector<unsigned char> bytes1; | |
std::vector<unsigned char> bytes2; | |
toHashedTable(myMap, bytes1); | |
toHashedTable(myMap2, bytes2); | |
DataSegment ds1(bytes1.data(), bytes1.size()); | |
DataSegment ds2(bytes2.data(), bytes2.size()); | |
CHECKIT(hashTableMergeInto(myMapNew, ds1) == 4); | |
hashTableMergeInto(myMapNew, ds2); | |
printf("size of map %zu\n", myMapNew.size()); | |
CHECKIT(myMapNew.size() == 6); | |
const std::string barString("bar"); | |
CHECKIT(myMapNew.find("foo")->second == barString); | |
} | |
static void testValueString() { | |
std::unordered_map<std::string, std::string> myMap; | |
std::vector<unsigned char> bytes1; | |
myMap.max_load_factor(0.5); | |
myMap.insert(std::make_pair("foo", "bar")); | |
myMap.insert(std::make_pair("foo1", "bar")); | |
myMap.insert(std::make_pair("foo2", "bar")); | |
myMap.insert(std::make_pair("foo3", "barstobars")); | |
toHashedTable(myMap, bytes1); | |
DataSegment ds1(bytes1.data(), bytes1.size()); | |
std::optional<uint64_t> got = hashSearchTable(ds1, "foo"); | |
CHECKIT(got.has_value()); | |
std::string myVal; | |
convertIndexToValueString(ds1, got.value(), myVal); | |
CHECKIT(myVal == "bar"); | |
} | |
static void testDataLocation() { | |
DataLocation dl; | |
dl.level = 5; | |
dl.chunk = 1; | |
std::string dlString; | |
dl.toString(dlString); | |
CHECKIT(dlString == "level-5-chunk-1"); | |
const std::optional<DataLocation> result = DataLocation::fromString(dlString); | |
CHECKIT(result.has_value()); | |
CHECKIT(result.value().level == 5); | |
CHECKIT(result.value().chunk == 1); | |
} | |
int main(int argc, char const *argv[]) | |
{ | |
//Time taken HT insert 6887719 | |
//Time taken HT search 6620036 | |
srand(time(nullptr)); | |
//bigTest1(); | |
bigTest2(); | |
//testDelete(); | |
//testByteExtendVec(); | |
//testMerges(); | |
//testValueString(); | |
//testDataLocation(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment