Two Python scripts to merge or delete duplicate files from a folder.
# delete_duplicate_file.py
# Python 3.8.6
"""
Given a folder, walk through all files within the folder and its subfolders
and delete all files that are duplicates, so that only one copy of every file is kept.
The MD5 checksum of each file determines the duplicates.
"""
import os
import hashlib
from collections import defaultdict

src_folder = "../.."  # directory to search


def generate_md5(fname, chunk_size=1024):
    """
    Take a file name and return the MD5 checksum of the file.
    """
    md5_hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first block of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash each time
        while chunk:
            md5_hash.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return md5_hash.hexdigest()


if __name__ == "__main__":
    # The dict maps each checksum to the list of files that share it
    md5_dict = defaultdict(list)
    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]
    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        print("Analyzing {}".format(path))
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable is updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files with the same checksum end up in the same list
                md5_dict[generate_md5(file_path)].append(file_path)
    # Keep the first file of every group and delete the rest
    for duplicates in md5_dict.values():
        for extra_copy in duplicates[1:]:
            os.remove(extra_copy)
    print("Done")
# merge_directory_without_duplicate.py
# Python 3.8.6
"""
Given a folder, walk through all files within the folder and its subfolders
and copy all files, keeping only one copy of each duplicate, into a new directory.
The MD5 checksum of each file determines the duplicates.
"""
import os
import hashlib
import shutil
from collections import defaultdict

src_folder = "../.."       # directory to search
folder_to_save = "../../"  # directory to copy into (outside src_folder)


def generate_md5(fname, chunk_size=1024):
    """
    Take a file name and return the MD5 checksum of the file.
    """
    md5_hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first block of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash each time
        while chunk:
            md5_hash.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return md5_hash.hexdigest()


if __name__ == "__main__":
    # The dict maps each checksum to the list of files that share it
    md5_dict = defaultdict(list)
    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]
    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        print("Analyzing {}".format(path))
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable is updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files with the same checksum end up in the same list
                md5_dict[generate_md5(file_path)].append(file_path)
    # Make sure the destination directory exists
    os.makedirs(folder_to_save, exist_ok=True)
    # Copy one file per checksum, prefixing its name with the parent
    # folder name to reduce clashes in the flat destination directory
    for val in md5_dict.values():
        tail = os.path.basename(val[0])
        subdirname = os.path.basename(os.path.dirname(val[0]))
        shutil.copy(val[0], os.path.join(folder_to_save, subdirname + "_" + tail))
    print("Done")