-
-
Save jspiro/9b1661d134b7949f62ee682c9bccd426 to your computer and use it in GitHub Desktop.
De-duplicate using APFS clonefile(2) and jdupes in zsh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env zsh | |
# | |
# # About | |
# Since APFS supports de-duplication at block level, it can be useful to
# manually de-duplicate your files if you migrated/upgraded to APFS instead
# of doing a fresh install.
# | |
# I've written this simple script with the aim to: | |
# - Be simple, easy to read and understand (for users to check) | |
# - Use native cp -c for de-duplication | |
# - Use non-hashing file comparison to prevent collisions | |
# - To use jdupes for speed | |
# - Preserve file metadata | |
# | |
# # Known bugs | |
# | |
# - Does not preserve target directory timestamps | |
# | |
# # Background info | |
# https://developer.apple.com/documentation/foundation/file_system/about_apple_file_system | |
# https://developer.apple.com/support/downloads/Apple-File-System-Reference.pdf | |
# https://eclecticlight.co/2019/01/05/aliases-hard-links-symlinks-and-copies-in-mojaves-apfs/ | |
# https://eclecticlight.co/2017/11/02/taking-stock-using-apfs-in-high-sierra-10-13-1/ | |
# | |
# # Alternatives (https://apple.stackexchange.com/questions/316435/replace-existing-duplicate-files-on-apfs-with-clones) | |
# Python, uses hashes (collision risk): https://github.com/ranvel/clonefile-dedup | |
# Python, uses hashes (collision risk, does not preserve metadata?): https://bitbucket.org/dchevell/apfs-deduplicate/src/master/ | |
# Does not preserve metadata: https://github.com/deckarep/apfs-compactor | |
# Paid: http://diskdedupe.com/ | |
# Paid: https://macpaw.com/gemini | |
### Init: identify files and programs
# File that receives the duplicate list written by jdupes
DUPEFILE=jdupes-output
# Scratch name that temporarily holds a replaced file so its metadata can be
# copied back onto the clone
TEMPFILE=tmp-preserved-for-metadata
# Critical programs to use. Each path can be overridden from the environment
# (e.g. PJDUPES=/opt/homebrew/bin/jdupes); the defaults are the MacPorts
# locations.
PCP=${PCP:-/bin/cp} # Should be Mac native cp supporting clonefile(2)!
PMV=${PMV:-/bin/mv}
PGCP=${PGCP:-/opt/local/bin/gcp} # Not to be confused with alias for git cherry-pick
PJDUPES=${PJDUPES:-/opt/local/bin/jdupes}
# Refuse to continue when a required program is missing or not executable.
# Diagnostics go to stderr and the script exits non-zero so that callers can
# detect the failure.
if [[ ! -x "${PCP}" ]]; then
  echo "Error: path to cp wrong" >&2
  exit 1
fi
if [[ ! -x "${PMV}" ]]; then
  echo "Error: path to mv wrong" >&2
  exit 1
fi
if [[ ! -x "${PGCP}" ]]; then
  echo "Error: path to gnu-cp wrong" >&2
  exit 1
fi
if [[ ! -x "${PJDUPES}" ]]; then
  echo "Error: path to jdupes wrong" >&2
  exit 1
fi
### Optional: check how much data can be saved
# List every duplicate except the first file of each set (--omitfirst): these
# are the files that would be replaced by clones. The list is shown on screen
# and stored in DUPEFILE.
"${PJDUPES}" --recurse --omitfirst ./ | tee "${DUPEFILE}"
# Sum the size reported by `du -k` for each listed file. Empty lines (the
# separators between sets) are skipped. `IFS= read -r` keeps backslashes and
# leading/trailing whitespace in file names intact. `i + 0` makes awk print 0
# instead of an empty string when no duplicates were found.
while IFS= read -r thisfile; do
  if [[ -n "${thisfile}" ]]; then
    du -k "${thisfile}"
  fi
done < "${DUPEFILE}" | awk '{i+=$1} END {print (i + 0) " kb"}'
### Find duplicates
# Find duplicates, use NUL character to separate to allow for newlines in
# filenames (rare but possible).
"${PJDUPES}" --printnull --recurse ./ | tee "${DUPEFILE}"
# Count the sets of duplicates. Records in DUPEFILE are NUL-terminated file
# names, and an empty record separates two sets. A set is counted at its first
# non-empty record, so the result does not depend on whether the last set is
# followed by a separator. The count is done in the shell because \x00 inside
# a grep ERE is not portable.
NPAIRS=0
dupes_in_set=0
while IFS= read -r -d $'\0' dupes_record; do
  if [[ -z "${dupes_record}" ]]; then
    # Empty record: the current set has ended
    dupes_in_set=0
  elif [[ "${dupes_in_set}" -eq 0 ]]; then
    # First file name of a new set
    dupes_in_set=1
    NPAIRS=$((NPAIRS + 1))
  fi
done < "${DUPEFILE}"
echo "Found ${NPAIRS} sets of duplicates"
### Start de-duplication
# Loop over files separated by NUL characters, use first file of each set as
# the source for all other files in this set, e.g.
#
#   file1\x00
#   file2\x00
#   file3\x00\x00
#
# will cause file2 and file3 to be replaced by clones of file1.
#
# - An empty record ends the current set, so SOURCEFILE is cleared.
# - If SOURCEFILE is empty, the current file becomes the source of a new set
#   (this also covers the very first record).
# - Otherwise the current file is a duplicate and is replaced:
#   -- Move the target file to the temporary location
#   -- Clone the source file to the target path
#   -- Copy the attributes of the preserved original onto the clone
#   -- Remove the preserved original
#
# `IFS= read -r` keeps backslashes and leading/trailing whitespace in file
# names intact. Program variables are quoted so that an empty value fails as
# "command not found" instead of executing the file name.
SOURCEFILE=""
while IFS= read -r -d $'\0' FILE; do
  if [[ -z "${FILE}" ]]; then
    SOURCEFILE=""
  elif [[ -z "${SOURCEFILE}" ]]; then
    SOURCEFILE="${FILE}"
  else
    # Preserve original file for metadata. Check the exit status of mv: a
    # stale TEMPFILE from an earlier run must not be mistaken for success.
    if ! "${PMV}" -f "${FILE}" "${TEMPFILE}"; then
      echo "Error: move failed, aborting." >&2
      break
    fi
    # Use cp -c to use APFS clonefile(2)
    # Use cp -a to preserve metadata, recurse, and not follow symlinks
    if ! "${PCP}" -ca "${SOURCEFILE}" "${FILE}" || [[ ! -e "${FILE}" ]]; then
      echo "Error: copy failed, aborting." >&2
      # Put the original back so the failed clone does not leave the target
      # path missing or incomplete.
      "${PMV}" -f "${TEMPFILE}" "${FILE}"
      break
    fi
    # Use gnu copy to copy over all attributes
    # https://unix.stackexchange.com/a/93842
    # https://unix.stackexchange.com/questions/402862/cp-losing-files-metadata#402869
    # Poorer alternative: https://unix.stackexchange.com/questions/91080/maintain-or-restore-file-permissions-when-replacing-file
    if ! "${PGCP}" --preserve=all --attributes-only "${TEMPFILE}" "${FILE}"; then
      echo "Warning: could not restore attributes of ${FILE}" >&2
    fi
    # The clone is in place; drop the preserved original so no stray copy is
    # left behind in the working directory.
    rm -f "${TEMPFILE}"
  fi
done < "${DUPEFILE}"
## Using fdupes - bash (not tested)
# Get matches | |
# https://unix.stackexchange.com/questions/34366/is-there-a-way-of-deleting-duplicates-more-refined-than-fdupes-rdn | |
# DUPEFILE=fdupes-20200101a | |
# fdupes --sameline --recurse ./ | tee ${DUPEFILE} | |
# cat ${DUPEFILE} | while read SOURCEFILE DESTFILES; do | |
# # Split lines by spaces | |
# # Source https://stackoverflow.com/a/30212526 | |
# read -ra DESTFILESARR <<<${DESTFILES} | |
# for DEST in "${DESTFILESARR[@]}"; do | |
# mv "${DEST}" tmp | |
# echo cp -ca "${SOURCEFILE}" "${DEST}"; | |
# echo gcp --preserve=all --attributes-only tmp "${DEST}" | |
# done | |
# done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment