Created
September 24, 2020 10:10
-
-
Save MalteT/41d81beedae144b58ecf6b37fb1607d3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Name: simple-eq.sh
# Author: Malte Tammena
# License: MIT 2020 Malte Tammena
#
# Dependencies:
#   - 7z
#   - unrar
#   - tar
#   - md5sum
#   - (find, uniq, awk, sort)
#
# The script was mainly used to compare files submitted by students over
# moodle at the University of Leipzig. Since the script expects a certain
# folder structure, the following procedure should be used with this program.
#
# 1.) Download any number of samples from the submissions done for a certain
#     task in a single zip-file.
# 2.) Unpack that zip-file into a single folder. This should leave a structure
#     similar to this:
#       - .
#         |- Some Name 20349232/
#            |- blub.zip or blub.tar.gz or blub.7z etc
#         |- ...
# 3.) Execute the script inside the top folder. The script will now:
#       - Extract all archives inside those second level folders
#       - Delete all files that we don't care about
#       - Hash all files and save the checksums to all_hashes.md5
#       - Go through all hashes and save duplicates to dub_hashes.txt
#       - Print groups of files with matching hashes to stdout
#

# Whitespace-separated list of checksums to ignore, mainly used for ignoring
# the Minirechner Manual.
readonly IGNORE_HASHES='67bf89a9f4fbe630c817f570d61b75bc'

#######################################
# Extract every known archive found directly inside each sub-directory of the
# current directory. Files of an unexpected type are reported on stderr.
# Arguments: none
# Returns:   0 on success, 1 if a directory cannot be entered or an
#            extraction tool fails
#######################################
extract_archives() {
  local user_dir file
  for user_dir in *; do
    [[ -d "$user_dir" ]] || continue
    # The subshell confines the directory change: a failing cd can neither
    # extract into the wrong place nor leave the script outside the top folder.
    (
      cd -- "./$user_dir" || exit 1
      for file in *; do
        # An empty directory leaves the glob unexpanded; nothing to do then.
        [[ -e "$file" ]] || continue
        # The "./" prefix keeps names starting with "-" from being read as
        # options by the extraction tools.
        case "$file" in
          *.zip | *.7z)
            7z -y e "./$file" > /dev/null
            ;;
          *.rar)
            unrar -o+ e "./$file" > /dev/null
            ;;
          *.tar.gz)
            tar xf "./$file" --overwrite > /dev/null
            ;;
          *.pdf | *.sch | *.dat | *.jpg | *.jpeg | *.png) # Just some files
            :
            ;;
          *) # Some unexpected file!
            printf '%s\n' "Unknown file type on $file" >&2
            ;;
        esac || exit 1
      done
    ) || return 1
  done
}

#######################################
# Delete files and directories that are irrelevant for the comparison.
# Arguments: none
#######################################
remove_garbage() {
  # Cleanup is best effort, so diagnostics are deliberately discarded.
  # -prune stops find from descending into a directory that is being removed.
  find . -type d -name __MACOSX -prune -exec rm -rf -- {} + 2> /dev/null
  find . ! -type d \
    \( -iname '*.dpl' -o -iname '*.dat' -o -iname '._*' -o -iname .DS_Store \) \
    -exec rm -f -- {} + 2> /dev/null
}

#######################################
# Hash all files below the current directory into all_hashes.md5.
# Globals:   IGNORE_HASHES (read, used when no argument is given)
# Arguments: $1 - optional whitespace-separated list of checksums to leave out
# Outputs:   writes all_hashes.md5 (md5sum format)
#######################################
hash_files() {
  local ignore=${1-$IGNORE_HASHES}
  # The two result files are excluded so that re-running the script does not
  # hash its own (possibly half-written) output.
  # Checksums are compared against the first field only; a file *name* that
  # happens to contain an ignored checksum is therefore still listed.
  find . -type f ! -path ./all_hashes.md5 ! -path ./dub_hashes.txt \
    -exec md5sum {} + \
    | awk -v ignore="$ignore" '
        BEGIN {
          count = split(ignore, list, " ")
          for (i = 1; i <= count; i++) skip[list[i]] = 1
        }
        !($1 in skip)
      ' > all_hashes.md5
}

#######################################
# Collect every checksum that occurs more than once in all_hashes.md5,
# ordered by ascending number of occurrences.
# Arguments: none
# Outputs:   writes dub_hashes.txt (one checksum per line)
#######################################
find_duplicate_hashes() {
  awk '{ print $1 }' all_hashes.md5 \
    | sort \
    | uniq -c \
    | sort -n \
    | awk '$1 > 1 { print $2 }' > dub_hashes.txt
}

#######################################
# Print, for each checksum in dub_hashes.txt, the files sharing it, followed
# by a separator line.
# Arguments: none
# Outputs:   groups of file paths on stdout
#######################################
print_duplicate_groups() {
  local sum
  while IFS= read -r sum; do
    [[ -n "$sum" ]] || continue
    # Strip only the checksum and its separator so that paths containing
    # spaces (e.g. "Some Name 20349232/...") are printed completely.
    awk -v sum="$sum" '$1 == sum { sub(/^[^ ]+ [ *]/, ""); print }' all_hashes.md5
    printf '%s\n' '===================='
  done < dub_hashes.txt
}

#######################################
# Run the whole comparison inside the current directory.
#######################################
main() {
  extract_archives || exit 1
  remove_garbage
  hash_files
  find_duplicate_hashes
  print_duplicate_groups
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment