@pietrocolombo
Forked from vinovator/checkDuplicates.py
Last active November 24, 2024 07:50
Python scripts to merge or delete duplicate files from a folder
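Both scripts rely on the fact that files with identical contents produce identical MD5 digests, so a digest can serve as the key for grouping duplicates. A minimal illustration (the byte strings below are made-up sample data, not part of the gist):

# md5_demo.py (illustrative only)
import hashlib

a = hashlib.md5(b"same bytes").hexdigest()
b = hashlib.md5(b"same bytes").hexdigest()
c = hashlib.md5(b"different bytes").hexdigest()

print(a == b)  # True: identical content -> identical digest
print(a == c)  # False: different content -> different digest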
# delete_duplicate_file.py
# Python 3.8.6
"""
Given a folder, walk through all files within the folder and subfolders
and delete all file that are duplicates so you have only one copy of every file
The md5 checcksum for each file will determine the duplicates
"""
import os
import hashlib
from collections import defaultdict

src_folder = "../.."  # directory to search
def generate_md5(fname, chunk_size=1024):
    """
    Take a file name and return the MD5 checksum of the file.
    """
    md5_hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first block of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash
        while chunk:
            md5_hash.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return md5_hash.hexdigest()
if __name__ == "__main__":
    # The dict maps each checksum to a list of file paths
    md5_dict = defaultdict(list)

    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]

    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        print("Analyzing {}".format(path))
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable is updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files with the same checksum are appended to the same list
                md5_dict[generate_md5(file_path)].append(file_path)

    # Keep the first file of each group and delete the rest
    for key, val in md5_dict.items():
        for duplicate in val[1:]:
            os.remove(duplicate)

    print("Done")
# merge_directory_without_duplicate.py
# Python 3.8.6
"""
Given a folder, walk through all files within the folder and subfolders
and copy all files and only one copy of the duplicate file in a new directory
The md5 checksum for each file will determine the duplicates
"""
import os
import hashlib
from collections import defaultdict
import shutil

src_folder = "../.."  # directory to search
folder_to_save = "/../../"  # directory to copy into (outside src_folder); keep the trailing "/" after the path
def generate_md5(fname, chunk_size=1024):
    """
    Take a file name and return the MD5 checksum of the file.
    """
    md5_hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first block of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash
        while chunk:
            md5_hash.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return md5_hash.hexdigest()
if __name__ == "__main__":
    # The dict maps each checksum to a list of file paths
    md5_dict = defaultdict(list)

    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]

    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        print("Analyzing {}".format(path))
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable is updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files with the same checksum are appended to the same list
                md5_dict[generate_md5(file_path)].append(file_path)

    # Copy one file per checksum, prefixing the name with its parent folder
    for key, val in md5_dict.items():
        head, tail = os.path.split(val[0])
        subdirname = os.path.basename(os.path.dirname(val[0]))
        new_path = shutil.copy(val[0], folder_to_save + subdirname + "_" + tail)

    print("Done")