Created
June 21, 2022 12:04
-
-
Save dargmuesli/58a2c1c58350d2c1587ee00fa35a4ceb to your computer and use it in GitHub Desktop.
Compare all filenames inside a folder for similarity.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Compare all filenames inside a folder for similarity.

# Exit on errors, use last pipe error code, do not overwrite files, ensure
# variables exist
set -o errexit -o pipefail -o noclobber -o nounset

# allow matching of files by glob directly without find
shopt -s globstar nullglob

# Set color codes for use with echo
# (stored as unexpanded escape sequences, so they only take effect when they
# are printed with `echo -e` or printf's %b)
readonly LIGHT_BLUE='\e[94m'
readonly LIGHT_GREEN='\e[92m'
readonly LIGHT_RED='\e[91m'
readonly LIGHT_YELLOW='\e[93m'
readonly NC='\e[0m'
# Make sure the fstrcmp command is available. On Debian and Ubuntu it is
# installed through apt-get; on any other distribution the script aborts and
# asks for a manual installation.
if ! hash fstrcmp 2>/dev/null; then
  echo -e "${LIGHT_BLUE}fstrcmp${LIGHT_YELLOW} is not available.${NC}" >&2

  if ! hash lsb_release 2>/dev/null; then
    echo -e "${LIGHT_RED}Could not determine OS distribution!${NC}" >&2
    echo -e "${LIGHT_BLUE}lsb_release${NC} is not available." >&2
    exit 1
  fi

  # Ask for the distributor id once; a failing lsb_release is treated like an
  # unknown distribution instead of aborting through errexit.
  distributor_id="$(lsb_release -is)" || distributor_id=""

  case "$distributor_id" in
    Debian | Ubuntu)
      sudo apt-get install -y fstrcmp
      ;;
    *)
      # Single string: passing the text as two echo arguments inserted an
      # extra space in front of "automatically!".
      echo -e "${LIGHT_RED}Could not install ${LIGHT_BLUE}fstrcmp${LIGHT_RED} automatically!${NC}" >&2
      echo -e "Please install it manually." >&2
      exit 1
      ;;
  esac
fi
# Defaults for the values that the command line options can override.
# Threshold: a pair is reported when its fstrcmp result is greater than this.
DISTANCE_MINIMUM=0.9
# Directory whose files are scanned; must be supplied by the caller.
INPUT_PATH=""
# "true" enables the progress display during the comparison.
IS_PROGRESS_SHOWN=false
# Print the command line help and terminate the script.
# Globals:   LIGHT_YELLOW, NC (read)
# Arguments: $1 - exit status to terminate with (optional, defaults to 1)
# Outputs:   the help text on stdout
function usage() {
  # %b expands backslash escapes in each argument, which is what renders the
  # color codes; every argument becomes one output line.
  printf '%b\n' \
    "usage: ${0##*/} ${LIGHT_YELLOW}<options>${NC}" \
    "" \
    "${LIGHT_YELLOW}options${NC}" \
    " -d, --distance The minimum distance for comparison." \
    " -i, --input-path * The files' source path." \
    " -h, --help Display this help." \
    " -p, --progress Display progress." \
    "" \
    "*=required"

  exit "${1:-1}"
}
# Names (directory part removed) of the regular files collected by scan.
FILES=()

# Recursively collect the names of all regular files below a directory.
# Globals:   INPUT_PATH (written), FILES (appended to)
# Arguments: $1 - directory to search
# Outputs:   a status line on stdout
function scan() {
  local path

  INPUT_PATH="$1"
  echo -e "Scanning directory..."

  # IFS= stops read from trimming whitespace at the ends of a path, and
  # -d '' splits the input on the NUL bytes written by find -print0.
  while IFS= read -r -d '' path; do
    # Remove everything up to the last slash; unlike $(basename ...) this
    # needs no subprocess and keeps trailing newlines in a name.
    FILES+=("${path##*/}")
  done < <(find "$INPUT_PATH" -type f -print0)
}
# Run fstrcmp on every unordered pair of entries in FILES and print the pairs
# whose result is greater than the given minimum.
# Globals:   FILES (read), DISTANCE_MINIMUM and IS_PROGRESS_SHOWN (written)
# Arguments: $1 - minimum value a pair has to exceed to be printed
#            $2 - "true" to print a progress indicator
# Outputs:   a summary line, optional progress, and one
#            name/name/value group per reported pair, all on stdout
function compare() {
  local file_count comparison_count
  local comparison_index=0
  local i j distance

  DISTANCE_MINIMUM="$1"
  IS_PROGRESS_SHOWN="$2"

  file_count=${#FILES[@]}
  # Number of unordered pairs: n * (n - 1) / 2
  comparison_count=$((file_count * (file_count - 1) / 2))

  echo -e "Comparing with minimum distance $DISTANCE_MINIMUM ($file_count files, $comparison_count comparisons)..."

  # Arithmetic loops simply do not run for zero or one file; the former
  # eval'd brace range counted downwards ({0..-1}) for an empty list and
  # then indexed entries that do not exist.
  for ((i = 0; i < file_count - 1; i++)); do
    for ((j = i + 1; j < file_count; j++)); do
      distance=$(fstrcmp "${FILES[i]}" "${FILES[j]}")

      if [[ "$IS_PROGRESS_SHOWN" == "true" ]]; then
        comparison_index=$((comparison_index + 1))
        # \r returns to the line start so the next update overwrites this one.
        echo -ne "$((100 * comparison_index / comparison_count))% ($comparison_index/$comparison_count)\r"
      fi

      # bc prints 1 when the relation holds and 0 otherwise.
      if (($(echo "$distance > $DISTANCE_MINIMUM" | bc -l))); then
        # %s prints the names verbatim; echo -e would have expanded
        # backslash sequences that occur inside a file name.
        printf '\n%s\n%s\n%s\n' "${FILES[i]}" "${FILES[j]}" "$distance"
      fi
    done
  done
}
# Check if getopt is available
# (the script expects `getopt --test` to finish with exit status 4; the status
# is captured explicitly so that errexit does not end the script here)
getopt_test_status=0
getopt --test >/dev/null || getopt_test_status=$?
if [[ "$getopt_test_status" -ne 4 ]]; then
  echo -e "${LIGHT_RED}Cannot parse parameters!${NC}" >&2
  exit 1
fi

# Parse command line parameters
OPTIONS=d:hi:p
LONGOPTS=distance:,help,input-path:,progress

# Apply the command line options to the option variables.
# Globals:   OPTIONS, LONGOPTS (read), PARSED, DISTANCE_MINIMUM, INPUT_PATH,
#            IS_PROGRESS_SHOWN (written)
# Arguments: the script's command line arguments
# Exits:     2 when getopt rejects the arguments, 1 on a non-numeric distance
function parse_arguments() {
  # Accepts forms such as 1, 0.9, .9 and 1. with an optional leading minus.
  local -r number_pattern='^-?([0-9]+(\.[0-9]*)?|\.[0-9]+)$'

  # getopt reports the offending option itself; only the exit is needed here.
  PARSED=$(getopt --options="$OPTIONS" --longoptions="$LONGOPTS" \
    --name "$0" -- "$@") || exit 2

  # getopt emits shell-quoted words; eval restores them as positional
  # parameters of this function.
  eval set -- "$PARSED"

  while true; do
    case "$1" in
      -d | --distance)
        # The value is later inserted into a numeric comparison, so anything
        # that is not a plain number is refused right away.
        if [[ ! "$2" =~ $number_pattern ]]; then
          echo -e "${LIGHT_RED}Distance is not a number!${NC}" >&2
          exit 1
        fi
        DISTANCE_MINIMUM="$2"
        shift 2
        ;;
      -h | --help)
        echo -e "Find similar filenames."
        echo -e ""
        usage
        ;;
      -i | --input-path)
        INPUT_PATH="$2"
        shift 2
        ;;
      -p | --progress)
        IS_PROGRESS_SHOWN=true
        shift 1
        ;;
      --)
        shift
        break
        ;;
      *)
        echo -e "${LIGHT_RED}Programming error!${NC}" >&2
        exit 2
        ;;
    esac
  done
}

parse_arguments "$@"
# The input path is required: refuse to continue without it.
if [[ -z "$INPUT_PATH" ]]; then
  echo -e "${LIGHT_RED}Input path not provided!${NC}" >&2
  usage
  # Only reached if usage returns instead of exiting.
  exit 1
fi

if [[ ! -d "$INPUT_PATH" ]]; then
  echo -e "${LIGHT_RED}Input path is not a directory!${NC}" >&2
  exit 1
fi

# Collect the file names first, then compare them pairwise.
scan "$INPUT_PATH"
compare "$DISTANCE_MINIMUM" "$IS_PROGRESS_SHOWN"

echo -e "\n${LIGHT_GREEN}Done${NC}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment