Created
October 1, 2023 20:34
-
-
Save geggo98/f5515268e59c51b8ba02fe5baf34a2a2 to your computer and use it in GitHub Desktop.
Estimate the pairwise edit distance between a list of files using command-line tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#######################################
# Estimate the line-based edit distance between two files: the number of
# lines that `diff -U 0` reports as removed plus the number reported as added.
# Arguments: $1 - first file
#            $2 - second file
# Outputs:   one CSV row "file1,file2,distance" on stdout
# Returns:   0 on success; diff's own status (> 1) if the files could not be
#            compared, in which case no row is printed
#######################################
calculate_edit_distance() {
  local file1="$1"
  local file2="$2"
  local distance
  local status=0

  # Count only real change lines:
  #  - NR > 2 skips the "---"/"+++" file headers that open every non-empty
  #    unified diff (they would otherwise inflate each distance by 2);
  #  - /^[+-]/ drops "@@" hunk headers and "\ No newline at end of file".
  # awk prints a bare integer, unlike `wc -l`, which pads with spaces on
  # some platforms and would corrupt the CSV field.
  distance=$(
    set -o pipefail # surface diff's status through the pipeline
    diff -U 0 -- "$file1" "$file2" \
      | awk 'NR > 2 && /^[+-]/ { n++ } END { print n + 0 }'
  ) || status=$?

  # diff exits 0 (identical) or 1 (different) on success; anything higher
  # means it could not compare the files, so do not report a bogus 0.
  if [ "$status" -gt 1 ]; then
    return "$status"
  fi

  printf '%s,%s,%s\n' "$file1" "$file2" "$distance"
}
# Worker mode: GNU parallel calls this script again with exactly one pair of
# files. Emit the CSV row for that pair and stop before the driver code below.
case "$#" in
  2)
    calculate_edit_distance "$1" "$2"
    exit 0
    ;;
esac
# Files to compare. Every ordered pair is reported, including each file
# paired with itself.
files=("file1.txt" "file2.txt" "file3.txt") # Replace with your list of files

# Fail early with a clear message if GNU parallel is not installed.
if ! command -v parallel >/dev/null 2>&1; then
  printf '%s: GNU parallel is required but was not found in PATH\n' "$0" >&2
  exit 127
fi

# Print CSV header
echo "File1,File2,Edit Distance"

# Calculate edit distance for each pair of files using GNU parallel, which
# re-runs this script once per pair (one job per CPU core with -j+0).
# parallel hands the command to a shell, so the script path is shell-escaped
# with %q; a bare $0 would break on paths containing spaces or glob characters.
printf -v self_quoted '%q' "$0"
parallel -j+0 "$self_quoted" ::: "${files[@]}" ::: "${files[@]}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add
--bar
to parallel -j+0 ...
to get a progress bar on the standard error stream. In this case, make sure to redirect standard output to a file.