Skip to content

Instantly share code, notes, and snippets.

@fabiolimace
Last active October 27, 2025 19:48
Show Gist options
  • Select an option

  • Save fabiolimace/17c5dcdeac9a0eecd69ee8f3abd36d32 to your computer and use it in GitHub Desktop.

Select an option

Save fabiolimace/17c5dcdeac9a0eecd69ee8f3abd36d32 to your computer and use it in GitHub Desktop.
Hash Function Uniqueness and Collision
#!/bin/bash
# Calculates hash uniqueness.
#
# Usage:
#
# hash-uniqueness.sh PROGRAM LENGTH WINDOW
#
# hash-uniqueness.sh md5sum
#
# hash-uniqueness.sh crc32sum 8 4
#
# hash-uniqueness.sh sha256sum 32 6
#
program=$1
length=${2:-32}
window=${3:-8}
# mimic md5sum
function crc32sum {
cksum "$1" | awk '{ $1=sprintf("%08x", $1); print $1, $3 }';
}
mkdir -p /run/shm/words;
for i in $(cat /usr/share/dict/american-english /usr/share/dict/british-english /usr/share/dict/brazilian /usr/share/dict/portuguese | sort | uniq); do [ ! -f "/run/shm/words/$i" ] && echo "$i" > "/run/shm/words/$i"; done;
[ -f /run/shm/words.$program ] && [ -d /run/shm/words ] && [ "$(cat /run/shm/words.$program | wc -l)" -ne "$(find /run/shm/words -type f | wc -l)" ] && rm -f /run/shm/words.$program;
[ ! -f /run/shm/words.$program ] && for i in $(find /run/shm/words -type f | sort | uniq); do $program "$i" >> /run/shm/words.$program; done;
counter=0
accumulator=0
maximum=$(find /run/shm/words -type f | wc -l)
for i in $(seq 1 $(( $length - $window )) ); do
counter=$(( counter + 1 ))
accumulator=$(( $accumulator + $(cut -c $i-$(( $i + $window - 1 )) /run/shm/words.$program | sort | uniq | wc -l) ))
done;
echo "uniqueness: $( echo "( 100.0 * ( $accumulator / $counter ) ) / $maximum " | bc -l ) %";
echo "collisions: $( echo "100.0 - ( 100.0 * ( $accumulator / $counter ) ) / $maximum " | bc -l ) %";
@fabiolimace
Copy link
Author

fabiolimace commented Dec 21, 2024

Examples

Dictionary: /usr/share/dict/british-english (~100k words)

Length = 8, Window = 6

./hash-uniqueness.sh crc32sum 8 6;
uniqueness: 99.68838773262218099599 %
collisions:  0.31161226737781900401 %
./hash-uniqueness.sh md5sum 8 6;
uniqueness: 99.67051230022996502212 %
collisions:  0.32948769977003497788 %
./hash-uniqueness.sh sha1sum 8 6;
uniqueness: 99.69660076912671265967 %
collisions:  0.30339923087328734033 %
./hash-uniqueness.sh sha256sum 8 6;
uniqueness: 99.69708388892109687518 %
collisions:  0.30291611107890312482 %
./hash-uniqueness.sh xxh128sum 8 6;
uniqueness: 99.69949948789301795273 %
collisions:  0.30050051210698204727 %

Length = 32, Window = 6

./hash-uniqueness.sh md5sum 32 6;
uniqueness: 99.68574915528362104975 %
collisions:  0.31425084471637895025 %
./hash-uniqueness.sh sha1sum 32 6;
uniqueness: 99.69496559443802762256 %
collisions:  0.30503440556197237744
./hash-uniqueness.sh sha256sum 32 6;
uniqueness: 99.69388786566593975719 %
collisions:  0.30611213433406024281 %
./hash-uniqueness.sh xxh128sum 32 6;
uniqueness: 99.69054319016635672673 %
collisions:  0.30945680983364327327 %

@fabiolimace
Copy link
Author

fabiolimace commented Dec 21, 2024

More Examples

Dictionaries in /usr/share/dict/: american-english british-english brazilian portuguese (~500k words)

Length = 8, Window = 6

time ./hash-uniqueness.sh crc32sum 8 6;
uniqueness: 98.28423716106460077470 %
collisions:  1.71576283893539922530 %

real	42m5,849s
user	3m24,297s
sys	61m30,946s

Length = 32, Window = 8

time ./hash-uniqueness.sh md5sum 32 8;
uniqueness: 99.99289907488395555370 %
collisions:  0.00710092511604444630 %

real	43m16,163s
user	2m4,602s
sys	41m37,513s
time ./hash-uniqueness.sh sha1sum 32 8;
uniqueness: 99.99362218334791119996 %
collisions:  0.00637781665208880004 %

real	43m2,839s
user	2m5,084s
sys	41m22,939s
time ./hash-uniqueness.sh sha256sum 32 8;
uniqueness: 99.99338355755480583670 %
collisions:  0.00661644244519416330 %

real	42m39,301s
user	2m8,414s
sys	40m56,361s
time ./hash-uniqueness.sh xxh128sum 32 8;
uniqueness: 99.99361495226327164350 %                                 
collisions:  0.00638504773672835650 %

real	33m40,216s
user	1m13,208s
sys	32m54,345s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment