Last active
October 27, 2025 19:48
-
-
Save fabiolimace/17c5dcdeac9a0eecd69ee8f3abd36d32 to your computer and use it in GitHub Desktop.
Hash Function Uniqueness and Collision
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # Calculates hash uniqueness. | |
| # | |
| # Usage: | |
| # | |
| # hash-uniqueness.sh PROGRAM LENGTH WINDOW | |
| # | |
| # hash-uniqueness.sh md5sum | |
| # | |
| # hash-uniqueness.sh crc32sum 8 4 | |
| # | |
| # hash-uniqueness.sh sha256sum 32 6 | |
| # | |
| program=$1 | |
| length=${2:-32} | |
| window=${3:-8} | |
| # mimic md5sum | |
| function crc32sum { | |
| cksum "$1" | awk '{ $1=sprintf("%08x", $1); print $1, $3 }'; | |
| } | |
| mkdir -p /run/shm/words; | |
| for i in $(cat /usr/share/dict/american-english /usr/share/dict/british-english /usr/share/dict/brazilian /usr/share/dict/portuguese | sort | uniq); do [ ! -f "/run/shm/words/$i" ] && echo "$i" > "/run/shm/words/$i"; done; | |
| [ -f /run/shm/words.$program ] && [ -d /run/shm/words ] && [ "$(cat /run/shm/words.$program | wc -l)" -ne "$(find /run/shm/words -type f | wc -l)" ] && rm -f /run/shm/words.$program; | |
| [ ! -f /run/shm/words.$program ] && for i in $(find /run/shm/words -type f | sort | uniq); do $program "$i" >> /run/shm/words.$program; done; | |
| counter=0 | |
| accumulator=0 | |
| maximum=$(find /run/shm/words -type f | wc -l) | |
| for i in $(seq 1 $(( $length - $window )) ); do | |
| counter=$(( counter + 1 )) | |
| accumulator=$(( $accumulator + $(cut -c $i-$(( $i + $window - 1 )) /run/shm/words.$program | sort | uniq | wc -l) )) | |
| done; | |
| echo "uniqueness: $( echo "( 100.0 * ( $accumulator / $counter ) ) / $maximum " | bc -l ) %"; | |
| echo "collisions: $( echo "100.0 - ( 100.0 * ( $accumulator / $counter ) ) / $maximum " | bc -l ) %"; | |
Author
Author
More Examples
Dictionaries in /usr/share/dict/: american-english british-english brazilian portuguese (~500k words)
Length = 8, Window = 6
time ./hash-uniqueness.sh crc32sum 8 6;
uniqueness: 98.28423716106460077470 %
collisions: 1.71576283893539922530 %
real 42m5,849s
user 3m24,297s
sys 61m30,946sLength = 32, Window = 8
time ./hash-uniqueness.sh md5sum 32 8;
uniqueness: 99.99289907488395555370 %
collisions: 0.00710092511604444630 %
real 43m16,163s
user 2m4,602s
sys 41m37,513stime ./hash-uniqueness.sh sha1sum 32 8;
uniqueness: 99.99362218334791119996 %
collisions: 0.00637781665208880004 %
real 43m2,839s
user 2m5,084s
sys 41m22,939stime ./hash-uniqueness.sh sha256sum 32 8;
uniqueness: 99.99338355755480583670 %
collisions: 0.00661644244519416330 %
real 42m39,301s
user 2m8,414s
sys 40m56,361stime ./hash-uniqueness.sh xxh128sum 32 8;
uniqueness: 99.99361495226327164350 %
collisions: 0.00638504773672835650 %
real 33m40,216s
user 1m13,208s
sys 32m54,345s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Examples
Dictionary:
/usr/share/dict/british-english(~100k words)Length = 8, Window = 6
./hash-uniqueness.sh crc32sum 8 6; uniqueness: 99.68838773262218099599 % collisions: 0.31161226737781900401 %./hash-uniqueness.sh md5sum 8 6; uniqueness: 99.67051230022996502212 % collisions: 0.32948769977003497788 %./hash-uniqueness.sh sha1sum 8 6; uniqueness: 99.69660076912671265967 % collisions: 0.30339923087328734033 %./hash-uniqueness.sh sha256sum 8 6; uniqueness: 99.69708388892109687518 % collisions: 0.30291611107890312482 %./hash-uniqueness.sh xxh128sum 8 6; uniqueness: 99.69949948789301795273 % collisions: 0.30050051210698204727 %Length = 32, Window = 6
./hash-uniqueness.sh md5sum 32 6; uniqueness: 99.68574915528362104975 % collisions: 0.31425084471637895025 %./hash-uniqueness.sh sha1sum 32 6; uniqueness: 99.69496559443802762256 % collisions: 0.30503440556197237744./hash-uniqueness.sh sha256sum 32 6; uniqueness: 99.69388786566593975719 % collisions: 0.30611213433406024281 %./hash-uniqueness.sh xxh128sum 32 6; uniqueness: 99.69054319016635672673 % collisions: 0.30945680983364327327 %