Last active
May 12, 2019 07:49
-
-
Save jinie/b51f75fa1ece7c02ca3f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import logging | |
import hashlib | |
def prune(dic):
    """Return a new dict holding only the entries of *dic* with several items.

    Args:
        dic: Mapping from any key to a sized collection (in this script,
            lists of file names).

    Returns:
        A new dict containing the entries whose value satisfies
        ``len(value) > 1``. *dic* itself is not modified.
    """
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    return {key: value for key, value in dic.items() if len(value) > 1}
def scan(path):
    """Group the files below *path* by size, keeping sizes shared by several files.

    Walks the directory tree with ``os.walk`` and records every file under
    its size in bytes as reported by ``os.stat``.

    Args:
        path: Root directory of the tree to walk.

    Returns:
        A dict mapping a size in bytes to the list of file paths having that
        size, restricted (via ``prune``) to sizes with more than one file.
    """
    ret = {}
    for root, _dirs, files in os.walk(path):
        for name in files:
            fname = os.path.join(root, name)
            try:
                size = os.stat(fname).st_size
            except OSError as err:
                # os.stat follows symlinks, so a dangling link (or a file
                # removed while walking) raises; skip it rather than abort
                # the whole scan.
                logging.warning("Skipping {0}: {1}".format(fname, err))
                continue
            # setdefault is a single hashed lookup; the former
            # "size not in ret.keys()" was a linear list scan on Python 2.
            ret.setdefault(size, []).append(fname)
    return prune(ret)
def checksum(fileDict, read_full=False, blocksize=1024):
    """Group files by SHA-1 digest, keeping digests shared by several files.

    Args:
        fileDict: Mapping whose values are lists of file paths. The keys are
            ignored; every listed file is hashed.
        read_full: When false, only the first *blocksize* bytes of each file
            are hashed, so equal digests only mark candidates. When true,
            the whole file is hashed.
        blocksize: Number of bytes requested per read.

    Returns:
        A dict mapping a hex digest to the list of file paths that produced
        it, restricted (via ``prune``) to digests with more than one file.
    """
    ret = {}
    for group in fileDict.values():
        for fname in group:
            m = hashlib.sha1()
            try:
                with open(fname, "rb") as f:
                    while True:
                        block = f.read(blocksize)
                        if not block:
                            break
                        m.update(block)
                        if not read_full:
                            # Quick mode: the first block is enough.
                            break
            except (IOError, OSError) as err:
                # IOError is listed separately because on Python 2 it is not
                # a subclass of OSError. An unreadable file is skipped so it
                # does not abort the whole run.
                logging.warning("Skipping {0}: {1}".format(fname, err))
                continue
            digest = m.hexdigest()
            logging.debug("{0} => {1}".format(fname, digest))
            ret.setdefault(digest, []).append(fname)
    return prune(ret)
def tally_files(fileDict):
    """Return the total number of entries across all groups in *fileDict*.

    Args:
        fileDict: Mapping whose values are sized collections (in this
            script, lists of file paths). The keys are ignored.

    Returns:
        The sum of ``len(value)`` over every value; 0 for an empty mapping.
    """
    # values() works on both Python 2 and 3; iteritems() is Python 2 only.
    return sum(len(group) for group in fileDict.values())
def tally_wasted_space(fileDict):
    """Return the number of bytes taken up by redundant copies in *fileDict*.

    For every group, the size of its first file (from ``os.stat``) is
    multiplied by the number of additional files in the group. Only the
    first file is measured, so the result is exact only when all files in a
    group have the same size.

    Args:
        fileDict: Mapping whose values are lists of file paths. The keys
            are ignored.

    Returns:
        Total wasted bytes as an integer; 0 for an empty mapping.

    Raises:
        OSError: If the first file of a group cannot be stat'ed.
    """
    ret = 0
    for group in fileDict.values():
        if not group:
            # Nothing to measure; avoids IndexError on group[0].
            continue
        # One copy is needed, the other len(group) - 1 are waste. The
        # parentheses matter: "size * len(group) - 1" subtracts a single
        # byte instead of a whole copy.
        ret += os.stat(group[0]).st_size * (len(group) - 1)
    return ret
if __name__ == '__main__':
    # Script entry point: narrow the candidates in three passes (same size,
    # same first block, same full-content hash), then print each group of
    # duplicates as one comma-separated line.
    import sys

    logging.basicConfig(level=logging.INFO)

    # Directory to scan: first command-line argument if given, otherwise the
    # previously hard-coded default.
    root = sys.argv[1] if len(sys.argv) > 1 else "/volume1"

    logging.info("Scanning")
    candidates = scan(root)
    logging.info("Quick scanning {0} files".format(tally_files(candidates)))
    candidates = checksum(candidates)
    logging.info("Slow scanning {0} files".format(tally_files(candidates)))
    duplicates = checksum(candidates, True)
    logging.info("Found {0} duplicate files, total wasted space {1}".format(tally_files(duplicates), tally_wasted_space(duplicates)))
    # values() works on both Python 2 and 3. A distinct loop name avoids
    # shadowing the result dict.
    for group in duplicates.values():
        print(",".join(group))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment