Last active
March 11, 2025 14:22
-
-
Save jwpeterson/e6150796641f41ce4fa5578f83745654 to your computer and use it in GitHub Desktop.
Test different approaches for hashing pairs of unsigned ints
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Test different approaches for hashing pairs of unsigned ints. | |
// libMesh includes | |
#include "libmesh/hashword.h" | |
#include "libmesh/hashing.h" | |
#include "libmesh/int_range.h" // make_range() | |
#include "libmesh/getpot.h" // GetPot | |
#include "libmesh/libmesh_logging.h" // LOG_SCOPE, PerfItem | |
#include "libmesh/libmesh.h" // libMesh::invalid_uint | |
// C++ includes | |
#include <unordered_map> | |
#include <tuple> | |
#include <vector> | |
#include <set> | |
using namespace libMesh; | |
/** | |
* The MapValue here is a "dummy" sequentially-increasing integer that | |
* we can use to check that the hashing worked as expected. | |
*/ | |
typedef std::tuple<unsigned int, unsigned int> MapKey; | |
typedef unsigned int MapValue; | |
struct HashCombine | |
{ | |
inline | |
std::size_t operator()(const MapKey & key) const | |
{ | |
// Set hash = hash_combine(std::hash(key.first), std::hash(key.second)) | |
// The hash_combine() approach is used in libmesh/hashing.h, | |
// but it is just a copy of something that is in boost. I think it is | |
// more geared to generating a 32-bit hash, and does not seem to do a | |
// great job of mixing up the bits of its inputs ("close together" inputs | |
// produce "close together" outputs). | |
std::size_t returnval = std::hash<std::size_t>()(static_cast<std::size_t>(std::get<0>(key))); | |
libMesh::boostcopy::hash_combine(returnval, static_cast<std::size_t>(std::get<1>(key))); | |
return returnval; | |
} | |
}; | |
struct FNVHash64 | |
{ | |
inline | |
std::size_t operator()(const MapKey & key) const | |
{ | |
// Use hashword2() function from libmesh/hashword.h | |
return libMesh::Utility::hashword2( | |
static_cast<std::size_t>(std::get<0>(key)), | |
static_cast<std::size_t>(std::get<1>(key))); | |
} | |
}; | |
struct JenkinsHash32 | |
{ | |
inline | |
std::size_t operator()(const MapKey & key) const | |
{ | |
// Call the 32-bit version of the hashword2() function, which | |
// produces a 32-bit hash, but return the result as a 64-bit | |
// unsigned int. | |
return libMesh::Utility::hashword2( | |
static_cast<uint32_t>(std::get<0>(key)), | |
static_cast<uint32_t>(std::get<1>(key))); | |
} | |
}; | |
/** | |
* Map typedefs | |
*/ | |
typedef std::unordered_map<MapKey, MapValue, HashCombine> CombineMap; | |
typedef std::unordered_map<MapKey, MapValue, FNVHash64> FNVHash64Map; | |
typedef std::unordered_map<MapKey, MapValue, JenkinsHash32> JenkinsHash32Map; | |
// Code for determining the "badness" level of a map based on its current contents. | |
// Badness = 0 means the current bucket distribution is close to optimal | |
// Badness = 1 means that, on average, 100% more comparisons than optimal are required | |
// https://artificial-mind.net/blog/2021/10/09/unordered-map-badness | |
template <class Map> | |
double unordered_map_badness(Map const& map) | |
{ | |
auto const lambda = map.size() / double(map.bucket_count()); | |
auto cost = 0.; | |
for (auto const& [k, _] : map) | |
cost += map.bucket_size(map.bucket(k)); | |
cost /= map.size(); | |
return std::max(0., cost / (1 + lambda) - 1); | |
} | |
/** | |
* Classes that provide an operator(i,j) function that converts | |
* input indices into MapKey objects. These objects are used by | |
* the test() lambda below. | |
*/ | |
class CartesianInput | |
{ | |
public: | |
CartesianInput(unsigned int offset=0, | |
unsigned int skip=1) : | |
_offset(offset), | |
_skip(skip) {} | |
inline | |
MapKey operator()(unsigned int i, unsigned int j) const | |
{ | |
return std::make_tuple(_offset + _skip*i, | |
_offset + _skip*j); | |
} | |
// Offset that is added to each (i,j) pair so that we | |
// can control the lower bound of the range. | |
unsigned int _offset; | |
// Include every n'th integer in the output. For example, if | |
// _skip == 2, then we will get ordered pairs of even integers: | |
// (0,0), (0,2), (0,4), ... | |
// (2,0), (2,2), ... | |
// ... | |
// Defaults to 1. | |
unsigned int _skip; | |
}; | |
class RandomInput | |
{ | |
public: | |
/** | |
* Constructor. Takes an optional "seed" parameter which is used to | |
* see the system's random number generator whenever the reseed() | |
* function is called. Note: you must call reseed() each time you | |
* want to repeat the same sequence of random numbers. | |
*/ | |
RandomInput(const unsigned int seed=1) : _seed(seed) | |
{} | |
/** | |
* Random input data (inputs "i" and "j" are unused) | |
*/ | |
inline | |
MapKey operator()(unsigned int /*i*/, unsigned int /*j*/) const | |
{ | |
auto t = std::make_tuple(static_cast<unsigned int>(rand()), | |
static_cast<unsigned int>(rand())); | |
// libMesh::out << "RandomInput: " << std::get<0>(t) << ", " << std::get<1>(t) << std::endl; | |
return t; | |
} | |
/** | |
* You must call this function before you use this object to | |
* generate a "sequence" of random numbers via the operator(i,j) | |
* API. It returns a copy of this object for convenience, so that it | |
* can be called in-place while passing this object to a function | |
* that will eventually use it. | |
*/ | |
RandomInput reseed() const | |
{ | |
// Debugging | |
// std::cout << "Calling srand from reseed() with seed = " << _seed << std::endl; | |
srand(_seed); | |
return *this; | |
} | |
// We store the seed so we can re-seed at any time, to repeat | |
// the same sequence of random numbers. | |
unsigned int _seed; | |
}; | |
int main(int argc, char** argv) | |
{ | |
// Set map size on command line with -N argument. | |
// The resulting Map size will be N^2 | |
GetPot command_line (argc, argv); | |
int N = 100; | |
if (command_line.search(1, "-N")) | |
N = command_line.next(N); | |
// Command line option to turn on collision counting. | |
// By default this is off, since it is not relevant when you are | |
// just testing the map::operator[] or map::insert() speed. | |
bool count_collisions = false; | |
if (command_line.search(1, "-c")) | |
count_collisions = true; | |
// Command line option to turn on "badness" check. | |
// By default this is off, since it is not relevant when you are | |
// just testing the map::operator[] or map::insert() speed. | |
bool badness_check = false; | |
if (command_line.search(1, "-b")) | |
badness_check = true; | |
// By default, we input "cartesian" data into the map, but | |
// the user can switch to "random" input using the command line. | |
int cartesian_input = true; | |
if (command_line.search(1, "-r")) | |
{ | |
libMesh::out << "Testing with random map inputs." << std::endl; | |
cartesian_input = false; | |
} | |
else | |
libMesh::out << "Testing with Cartesian map inputs." << std::endl; | |
/** | |
* Test inserting data into the map. | |
* The testing lambda takes the map type as a generic arg, so we can | |
* use the same test code for each type of hash function. The | |
* input_generator is also a generic so that it can be chosen at | |
* runtime, but also does not require a virtual function call. | |
*/ | |
auto test_insert = [&](auto & map, auto input_generator) | |
{ | |
// Use this set to make sure there is not a hash collision | |
std::set<std::size_t> previous_hashes; | |
// Count the number of hash collisions | |
unsigned int n_collisions = 0; | |
// Store entries in the Map. Note that the order of the inputs | |
// matters in general, i.e. hash(i,j) != hash(j,i) | |
unsigned int ctr=0; | |
for (auto first : make_range(N)) | |
for (auto second : make_range(N)) | |
{ | |
// Use the provided class object to generate a MapKey from the | |
// (first, second) inputs. | |
auto t = input_generator(first, second); | |
// Debugging: print the tuple generated by the | |
// input_generator, check that the input_generator is working | |
// as expected. | |
// std::cout << "test_insert: (" << std::get<0>(t) << ", " << std::get<1>(t) << ")" <<std::endl; | |
// And insert this MapKey into the map | |
map[t] = ctr++; | |
if (count_collisions) | |
{ | |
auto h = map.hash_function()(t); | |
// Check if we have seen this hash before, and print message if so | |
auto [it, emplaced] = previous_hashes.emplace(h); | |
if (!emplaced) | |
{ | |
// libMesh::out << "Hash collision detected for (i,j) = (" | |
// << first << ", " << second << ")" << std::endl; | |
n_collisions++; | |
} | |
// Print the hash computed for this key. | |
// Note: only do this for small N to get a flavor of the hash behavior... | |
if (N <= 10) | |
libMesh::out << "Hash (" << std::get<0>(t) << ", " << std::get<1>(t) << ") = " << h << std::endl; | |
} | |
} | |
// Sanity check: throw an error if the map size does not match the final value of "ctr". | |
// These should match even if there are collisions. | |
libmesh_error_msg_if( | |
ctr != map.size(), | |
"Expected " << ctr << " entries in map, not " << map.size()); | |
if (count_collisions) | |
libMesh::out << "Number of collisions = " << n_collisions << std::endl; | |
// Print the badness level for this map with its current contents | |
if (badness_check) | |
libMesh::out << "Badness: " << unordered_map_badness(map) << std::endl; | |
}; | |
/** | |
* Test looking up data in the map | |
*/ | |
auto test_lookup = [&](const auto & map, auto input_generator) | |
{ | |
std::size_t count = 0; | |
for (auto first : make_range(N)) | |
for (auto second : make_range(N)) | |
{ | |
// Use the provided class object to generate a MapKey from the | |
// (first, second) inputs. | |
auto t = input_generator(first, second); | |
// Debugging: print the tuple generated by the | |
// input_generator, check that the input_generator is working | |
// as expected. | |
// std::cout << "test_lookup: (" << std::get<0>(t) << ", " << std::get<1>(t) << ")" <<std::endl; | |
// Search for this MapKey in the map | |
auto it = map.find(t); | |
if (it != map.end()) | |
count++; | |
} | |
// Sanity check: require that all inputs were successfully found | |
libmesh_error_msg_if( | |
count != map.size(), | |
"Expected to find " << map.size() << " entries in map, instead found " << count); | |
}; | |
/** | |
* Test looking up data that is not in the map | |
*/ | |
auto test_lookup_fail = [&](const auto & map, auto input_generator) | |
{ | |
std::size_t count = 0; | |
for (auto first : make_range(N)) | |
for (auto second : make_range(N)) | |
{ | |
// The idea here is that the input_generator should only generate | |
// data that is _not_ found in the map. | |
auto t = input_generator(first, second); | |
// Debugging: print the tuple generated by the | |
// input_generator, check that the input_generator is working | |
// as expected. | |
// std::cout << "test_lookup_fail: (" << std::get<0>(t) << ", " << std::get<1>(t) << ")" <<std::endl; | |
// Search for this MapKey in the map | |
auto it = map.find(t); | |
// Increment the count if it was _not_ found | |
if (it == map.end()) | |
count++; | |
} | |
// Sanity check: require that all inputs were successfully found | |
libmesh_error_msg_if( | |
count != map.size(), | |
"Expected to not find " << map.size() << " entries, instead did not find " << count); | |
}; | |
/** | |
* Test erasing data from the map | |
*/ | |
auto test_erase = [&](auto & map, auto input_generator) | |
{ | |
std::size_t count = 0; | |
for (auto first : make_range(N)) | |
for (auto second : make_range(N)) | |
{ | |
// Use the provided class object to generate a MapKey from the | |
// (first, second) inputs. | |
auto t = input_generator(first, second); | |
// Erase this entry from the Map | |
map.erase(t); | |
} | |
// Sanity check: the map should be empty after we removed all the inputs | |
libmesh_error_msg_if( | |
map.size() != 0, | |
"Expected map to be empty after erasure, but it has " << map.size() << " entries."); | |
}; | |
/** | |
* "Top level" testing function that takes a generic map object by | |
* value in addition to a name to use in print statements and | |
* logging. | |
*/ | |
auto test_body = [&](auto map, const std::string & hashname) | |
{ | |
libMesh::out << "\nTesting " << hashname << " with " << N*N << " entries." << std::endl; | |
// CartesianInput objects used by the tests below. The first one | |
// is used by the "insert", "lookup", and "erase" tests while the | |
// second is used by the "lookup_fail" test. These must somehow | |
// be complementary to the Input objects used for the other tests, | |
// since the point is to generate values which are not in the | |
// input. Will not be used if the user passes -r on the command | |
// line. | |
// CartesianInput ci(/*offset=*/0, /*skip=*/1); | |
// CartesianInput ci_fail(/*offset=*/N, /*skip=*/1); | |
CartesianInput ci(/*offset=*/0, /*skip=*/2); // even | |
CartesianInput ci_fail(/*offset=*/1, /*skip=*/2); // odd | |
// RandomInput objects used by the tests below. See description | |
// above for which tests each is used in. | |
RandomInput ri; | |
RandomInput ri_fail(RandomInput(/*seed=*/0xe76f044)); | |
/** | |
* Test inserting entries into the map. | |
*/ | |
{ | |
// FIXME: I could not make LOG_SCOPE or PerfItem work with a | |
// runtime string, but a START/STOP_LOG pair does work for some | |
// reason, perhaps because it copies the string? | |
// LOG_SCOPE("insert", hashname.c_str()); | |
// PerfItem perf_item("insert", hashname.c_str()); | |
START_LOG("insert", hashname.c_str()); | |
if (cartesian_input) | |
test_insert(map, ci); | |
else | |
test_insert(map, ri.reseed()); | |
STOP_LOG("insert", hashname.c_str()); | |
} | |
/** | |
* Test looking up all entries that were originally added to the map. | |
*/ | |
{ | |
START_LOG("lookup", hashname.c_str()); | |
if (cartesian_input) | |
test_lookup(map, ci); | |
else | |
test_lookup(map, ri.reseed()); | |
STOP_LOG("lookup", hashname.c_str()); | |
} | |
/** | |
* Test looking up all entries that are not in the map. | |
*/ | |
{ | |
START_LOG("lookup_fail", hashname.c_str()); | |
if (cartesian_input) | |
test_lookup_fail(map, ci_fail); | |
else | |
test_lookup_fail(map, ri_fail.reseed()); | |
STOP_LOG("lookup_fail", hashname.c_str()); | |
} | |
/** | |
* Test erasing entries from the map. This should only be done | |
* _after_ any lookup tests are done. | |
*/ | |
{ | |
START_LOG("erase", hashname.c_str()); | |
if (cartesian_input) | |
test_erase(map, ci); | |
else | |
test_erase(map, ri.reseed()); | |
STOP_LOG("erase", hashname.c_str()); | |
} | |
}; | |
const std::vector<std::string> hashnames = | |
{"HashCombine", "FNVHash64", "JenkinsHash32"}; | |
// Test std::unordered_maps based on each type of hash | |
test_body(CombineMap(), std::string(hashnames[0])); | |
test_body(FNVHash64Map(), std::string(hashnames[1])); | |
test_body(JenkinsHash32Map(), std::string(hashnames[2])); | |
std::vector<std::string> stages = {"erase", "insert", "lookup", "lookup_fail"}; | |
// Format all the printed times consistently | |
libMesh::out << std::scientific << std::setprecision(16); | |
// Prefix the timing output with either Cartesian_ or Random_ | |
std::string input_type = cartesian_input ? "Cartesian" : "Random"; | |
for (const auto & hashname : hashnames) | |
{ | |
// Extract a PerfData object associated with each stage from the PerfLog | |
std::vector<PerfData> pd; | |
for (const auto & stage : stages) | |
pd.push_back(perflog.get_perf_data(stage, hashname)); | |
// Print JSON-formatted (?) timing data from the global "perflog" | |
// object so it's easier to plot/update. | |
// | |
// Ex: python code to create a dictionary | |
// my_dict = { | |
// "name": "John Doe", | |
// "age": 30, | |
// "city": "New York", | |
// "is_active": True, | |
// "address": { | |
// "street": "123 Main St", | |
// "zipcode": "10001" | |
// }, | |
// "hobbies": ["reading", "hiking", "coding"] | |
// } | |
libMesh::out << input_type << "_" << hashname << "[" << N << "] = {" << std::endl; | |
for (auto i : index_range(stages)) | |
{ | |
libMesh::out << "\"" << stages[i] << "\": " | |
<< "\"" << pd[i].tot_time << "\"" | |
<< "," << std::endl; | |
} | |
libMesh::out << "}" << std::endl; | |
// Newline before printing next hashname | |
libMesh::out << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment