Skip to content

Instantly share code, notes, and snippets.

@jwpeterson
Last active March 11, 2025 14:22
Show Gist options
  • Save jwpeterson/e6150796641f41ce4fa5578f83745654 to your computer and use it in GitHub Desktop.
Save jwpeterson/e6150796641f41ce4fa5578f83745654 to your computer and use it in GitHub Desktop.
Test different approaches for hashing pairs of unsigned ints
// Test different approaches for hashing pairs of unsigned ints.
// libMesh includes
#include "libmesh/hashword.h"
#include "libmesh/hashing.h"
#include "libmesh/int_range.h" // make_range()
#include "libmesh/getpot.h" // GetPot
#include "libmesh/libmesh_logging.h" // LOG_SCOPE, PerfItem
#include "libmesh/libmesh.h" // libMesh::invalid_uint
// C++ includes
#include <unordered_map>
#include <tuple>
#include <vector>
#include <set>
using namespace libMesh;
/**
* The MapValue here is a "dummy" sequentially-increasing integer that
* we can use to check that the hashing worked as expected.
*/
typedef std::tuple<unsigned int, unsigned int> MapKey;
typedef unsigned int MapValue;
struct HashCombine
{
inline
std::size_t operator()(const MapKey & key) const
{
// Set hash = hash_combine(std::hash(key.first), std::hash(key.second))
// The hash_combine() approach is used in libmesh/hashing.h,
// but it is just a copy of something that is in boost. I think it is
// more geared to generating a 32-bit hash, and does not seem to do a
// great job of mixing up the bits of its inputs ("close together" inputs
// produce "close together" outputs).
std::size_t returnval = std::hash<std::size_t>()(static_cast<std::size_t>(std::get<0>(key)));
libMesh::boostcopy::hash_combine(returnval, static_cast<std::size_t>(std::get<1>(key)));
return returnval;
}
};
struct FNVHash64
{
inline
std::size_t operator()(const MapKey & key) const
{
// Use hashword2() function from libmesh/hashword.h
return libMesh::Utility::hashword2(
static_cast<std::size_t>(std::get<0>(key)),
static_cast<std::size_t>(std::get<1>(key)));
}
};
struct JenkinsHash32
{
inline
std::size_t operator()(const MapKey & key) const
{
// Call the 32-bit version of the hashword2() function, which
// produces a 32-bit hash, but return the result as a 64-bit
// unsigned int.
return libMesh::Utility::hashword2(
static_cast<uint32_t>(std::get<0>(key)),
static_cast<uint32_t>(std::get<1>(key)));
}
};
/**
* Map typedefs
*/
typedef std::unordered_map<MapKey, MapValue, HashCombine> CombineMap;
typedef std::unordered_map<MapKey, MapValue, FNVHash64> FNVHash64Map;
typedef std::unordered_map<MapKey, MapValue, JenkinsHash32> JenkinsHash32Map;
// Code for determining the "badness" level of a map based on its current contents.
// Badness = 0 means the current bucket distribution is close to optimal
// Badness = 1 means that, on average, 100% more comparisons than optimal are required
// https://artificial-mind.net/blog/2021/10/09/unordered-map-badness
template <class Map>
double unordered_map_badness(Map const& map)
{
auto const lambda = map.size() / double(map.bucket_count());
auto cost = 0.;
for (auto const& [k, _] : map)
cost += map.bucket_size(map.bucket(k));
cost /= map.size();
return std::max(0., cost / (1 + lambda) - 1);
}
/**
* Classes that provide an operator(i,j) function that converts
* input indices into MapKey objects. These objects are used by
* the test() lambda below.
*/
class CartesianInput
{
public:
CartesianInput(unsigned int offset=0,
unsigned int skip=1) :
_offset(offset),
_skip(skip) {}
inline
MapKey operator()(unsigned int i, unsigned int j) const
{
return std::make_tuple(_offset + _skip*i,
_offset + _skip*j);
}
// Offset that is added to each (i,j) pair so that we
// can control the lower bound of the range.
unsigned int _offset;
// Include every n'th integer in the output. For example, if
// _skip == 2, then we will get ordered pairs of even integers:
// (0,0), (0,2), (0,4), ...
// (2,0), (2,2), ...
// ...
// Defaults to 1.
unsigned int _skip;
};
class RandomInput
{
public:
/**
* Constructor. Takes an optional "seed" parameter which is used to
* see the system's random number generator whenever the reseed()
* function is called. Note: you must call reseed() each time you
* want to repeat the same sequence of random numbers.
*/
RandomInput(const unsigned int seed=1) : _seed(seed)
{}
/**
* Random input data (inputs "i" and "j" are unused)
*/
inline
MapKey operator()(unsigned int /*i*/, unsigned int /*j*/) const
{
auto t = std::make_tuple(static_cast<unsigned int>(rand()),
static_cast<unsigned int>(rand()));
// libMesh::out << "RandomInput: " << std::get<0>(t) << ", " << std::get<1>(t) << std::endl;
return t;
}
/**
* You must call this function before you use this object to
* generate a "sequence" of random numbers via the operator(i,j)
* API. It returns a copy of this object for convenience, so that it
* can be called in-place while passing this object to a function
* that will eventually use it.
*/
RandomInput reseed() const
{
// Debugging
// std::cout << "Calling srand from reseed() with seed = " << _seed << std::endl;
srand(_seed);
return *this;
}
// We store the seed so we can re-seed at any time, to repeat
// the same sequence of random numbers.
unsigned int _seed;
};
int main(int argc, char** argv)
{
// Set map size on command line with -N argument.
// The resulting Map size will be N^2
GetPot command_line (argc, argv);
int N = 100;
if (command_line.search(1, "-N"))
N = command_line.next(N);
// Command line option to turn on collision counting.
// By default this is off, since it is not relevant when you are
// just testing the map::operator[] or map::insert() speed.
bool count_collisions = false;
if (command_line.search(1, "-c"))
count_collisions = true;
// Command line option to turn on "badness" check.
// By default this is off, since it is not relevant when you are
// just testing the map::operator[] or map::insert() speed.
bool badness_check = false;
if (command_line.search(1, "-b"))
badness_check = true;
// By default, we input "cartesian" data into the map, but
// the user can switch to "random" input using the command line.
int cartesian_input = true;
if (command_line.search(1, "-r"))
{
libMesh::out << "Testing with random map inputs." << std::endl;
cartesian_input = false;
}
else
libMesh::out << "Testing with Cartesian map inputs." << std::endl;
/**
* Test inserting data into the map.
* The testing lambda takes the map type as a generic arg, so we can
* use the same test code for each type of hash function. The
* input_generator is also a generic so that it can be chosen at
* runtime, but also does not require a virtual function call.
*/
auto test_insert = [&](auto & map, auto input_generator)
{
// Use this set to make sure there is not a hash collision
std::set<std::size_t> previous_hashes;
// Count the number of hash collisions
unsigned int n_collisions = 0;
// Store entries in the Map. Note that the order of the inputs
// matters in general, i.e. hash(i,j) != hash(j,i)
unsigned int ctr=0;
for (auto first : make_range(N))
for (auto second : make_range(N))
{
// Use the provided class object to generate a MapKey from the
// (first, second) inputs.
auto t = input_generator(first, second);
// Debugging: print the tuple generated by the
// input_generator, check that the input_generator is working
// as expected.
// std::cout << "test_insert: (" << std::get<0>(t) << ", " << std::get<1>(t) << ")" <<std::endl;
// And insert this MapKey into the map
map[t] = ctr++;
if (count_collisions)
{
auto h = map.hash_function()(t);
// Check if we have seen this hash before, and print message if so
auto [it, emplaced] = previous_hashes.emplace(h);
if (!emplaced)
{
// libMesh::out << "Hash collision detected for (i,j) = ("
// << first << ", " << second << ")" << std::endl;
n_collisions++;
}
// Print the hash computed for this key.
// Note: only do this for small N to get a flavor of the hash behavior...
if (N <= 10)
libMesh::out << "Hash (" << std::get<0>(t) << ", " << std::get<1>(t) << ") = " << h << std::endl;
}
}
// Sanity check: throw an error if the map size does not match the final value of "ctr".
// These should match even if there are collisions.
libmesh_error_msg_if(
ctr != map.size(),
"Expected " << ctr << " entries in map, not " << map.size());
if (count_collisions)
libMesh::out << "Number of collisions = " << n_collisions << std::endl;
// Print the badness level for this map with its current contents
if (badness_check)
libMesh::out << "Badness: " << unordered_map_badness(map) << std::endl;
};
/**
* Test looking up data in the map
*/
auto test_lookup = [&](const auto & map, auto input_generator)
{
std::size_t count = 0;
for (auto first : make_range(N))
for (auto second : make_range(N))
{
// Use the provided class object to generate a MapKey from the
// (first, second) inputs.
auto t = input_generator(first, second);
// Debugging: print the tuple generated by the
// input_generator, check that the input_generator is working
// as expected.
// std::cout << "test_lookup: (" << std::get<0>(t) << ", " << std::get<1>(t) << ")" <<std::endl;
// Search for this MapKey in the map
auto it = map.find(t);
if (it != map.end())
count++;
}
// Sanity check: require that all inputs were successfully found
libmesh_error_msg_if(
count != map.size(),
"Expected to find " << map.size() << " entries in map, instead found " << count);
};
/**
* Test looking up data that is not in the map
*/
auto test_lookup_fail = [&](const auto & map, auto input_generator)
{
std::size_t count = 0;
for (auto first : make_range(N))
for (auto second : make_range(N))
{
// The idea here is that the input_generator should only generate
// data that is _not_ found in the map.
auto t = input_generator(first, second);
// Debugging: print the tuple generated by the
// input_generator, check that the input_generator is working
// as expected.
// std::cout << "test_lookup_fail: (" << std::get<0>(t) << ", " << std::get<1>(t) << ")" <<std::endl;
// Search for this MapKey in the map
auto it = map.find(t);
// Increment the count if it was _not_ found
if (it == map.end())
count++;
}
// Sanity check: require that all inputs were successfully found
libmesh_error_msg_if(
count != map.size(),
"Expected to not find " << map.size() << " entries, instead did not find " << count);
};
/**
* Test erasing data from the map
*/
auto test_erase = [&](auto & map, auto input_generator)
{
std::size_t count = 0;
for (auto first : make_range(N))
for (auto second : make_range(N))
{
// Use the provided class object to generate a MapKey from the
// (first, second) inputs.
auto t = input_generator(first, second);
// Erase this entry from the Map
map.erase(t);
}
// Sanity check: the map should be empty after we removed all the inputs
libmesh_error_msg_if(
map.size() != 0,
"Expected map to be empty after erasure, but it has " << map.size() << " entries.");
};
/**
* "Top level" testing function that takes a generic map object by
* value in addition to a name to use in print statements and
* logging.
*/
auto test_body = [&](auto map, const std::string & hashname)
{
libMesh::out << "\nTesting " << hashname << " with " << N*N << " entries." << std::endl;
// CartesianInput objects used by the tests below. The first one
// is used by the "insert", "lookup", and "erase" tests while the
// second is used by the "lookup_fail" test. These must somehow
// be complementary to the Input objects used for the other tests,
// since the point is to generate values which are not in the
// input. Will not be used if the user passes -r on the command
// line.
// CartesianInput ci(/*offset=*/0, /*skip=*/1);
// CartesianInput ci_fail(/*offset=*/N, /*skip=*/1);
CartesianInput ci(/*offset=*/0, /*skip=*/2); // even
CartesianInput ci_fail(/*offset=*/1, /*skip=*/2); // odd
// RandomInput objects used by the tests below. See description
// above for which tests each is used in.
RandomInput ri;
RandomInput ri_fail(RandomInput(/*seed=*/0xe76f044));
/**
* Test inserting entries into the map.
*/
{
// FIXME: I could not make LOG_SCOPE or PerfItem work with a
// runtime string, but a START/STOP_LOG pair does work for some
// reason, perhaps because it copies the string?
// LOG_SCOPE("insert", hashname.c_str());
// PerfItem perf_item("insert", hashname.c_str());
START_LOG("insert", hashname.c_str());
if (cartesian_input)
test_insert(map, ci);
else
test_insert(map, ri.reseed());
STOP_LOG("insert", hashname.c_str());
}
/**
* Test looking up all entries that were originally added to the map.
*/
{
START_LOG("lookup", hashname.c_str());
if (cartesian_input)
test_lookup(map, ci);
else
test_lookup(map, ri.reseed());
STOP_LOG("lookup", hashname.c_str());
}
/**
* Test looking up all entries that are not in the map.
*/
{
START_LOG("lookup_fail", hashname.c_str());
if (cartesian_input)
test_lookup_fail(map, ci_fail);
else
test_lookup_fail(map, ri_fail.reseed());
STOP_LOG("lookup_fail", hashname.c_str());
}
/**
* Test erasing entries from the map. This should only be done
* _after_ any lookup tests are done.
*/
{
START_LOG("erase", hashname.c_str());
if (cartesian_input)
test_erase(map, ci);
else
test_erase(map, ri.reseed());
STOP_LOG("erase", hashname.c_str());
}
};
const std::vector<std::string> hashnames =
{"HashCombine", "FNVHash64", "JenkinsHash32"};
// Test std::unordered_maps based on each type of hash
test_body(CombineMap(), std::string(hashnames[0]));
test_body(FNVHash64Map(), std::string(hashnames[1]));
test_body(JenkinsHash32Map(), std::string(hashnames[2]));
std::vector<std::string> stages = {"erase", "insert", "lookup", "lookup_fail"};
// Format all the printed times consistently
libMesh::out << std::scientific << std::setprecision(16);
// Prefix the timing output with either Cartesian_ or Random_
std::string input_type = cartesian_input ? "Cartesian" : "Random";
for (const auto & hashname : hashnames)
{
// Extract a PerfData object associated with each stage from the PerfLog
std::vector<PerfData> pd;
for (const auto & stage : stages)
pd.push_back(perflog.get_perf_data(stage, hashname));
// Print JSON-formatted (?) timing data from the global "perflog"
// object so it's easier to plot/update.
//
// Ex: python code to create a dictionary
// my_dict = {
// "name": "John Doe",
// "age": 30,
// "city": "New York",
// "is_active": True,
// "address": {
// "street": "123 Main St",
// "zipcode": "10001"
// },
// "hobbies": ["reading", "hiking", "coding"]
// }
libMesh::out << input_type << "_" << hashname << "[" << N << "] = {" << std::endl;
for (auto i : index_range(stages))
{
libMesh::out << "\"" << stages[i] << "\": "
<< "\"" << pd[i].tot_time << "\""
<< "," << std::endl;
}
libMesh::out << "}" << std::endl;
// Newline before printing next hashname
libMesh::out << std::endl;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment