@pietrocolombo
Forked from vinovator/checkDuplicates.py
Last active November 24, 2024 07:50
Python scripts to merge or delete duplicate files from a folder
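Both scripts rely on the fact that files with identical contents produce identical MD5 digests, so a digest can serve as the key for grouping duplicates. A minimal illustration (the byte strings below are made-up sample data, not part of the gist):

# md5_demo.py (illustrative only)
import hashlib

a = hashlib.md5(b"same bytes").hexdigest()
b = hashlib.md5(b"same bytes").hexdigest()
c = hashlib.md5(b"different bytes").hexdigest()

print(a == b)  # True: identical content -> identical digest
print(a == c)  # False: different content -> different digest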
# delete_duplicate_file.py
# Python 3.8.6
"""
Given a folder, walk through all files within the folder and subfolders
and delete all file that are duplicates so you have only one copy of every file
The md5 checcksum for each file will determine the duplicates
"""
import os
import hashlib
from collections import defaultdict

src_folder = "../.."  # directory to search
def generate_md5(fname, chunk_size=1024):
    """
    Take a file name and return the MD5 checksum of the file.
    """
    md5_hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first block of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash
        while chunk:
            md5_hash.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return md5_hash.hexdigest()
if __name__ == "__main__":
    # The dict maps each checksum to a list of file paths
    md5_dict = defaultdict(list)

    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]

    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        print("Analyzing {}".format(path))
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable is updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files with the same checksum are appended to the same list
                md5_dict[generate_md5(file_path)].append(file_path)

    # Keep the first file of each group and delete the rest
    for key, val in md5_dict.items():
        for duplicate in val[1:]:
            os.remove(duplicate)

    print("Done")
# merge_directory_without_duplicate.py
# Python 3.8.6
"""
Given a folder, walk through all files within the folder and subfolders
and copy all files and only one copy of the duplicate file in a new directory
The md5 checksum for each file will determine the duplicates
"""
import os
import hashlib
from collections import defaultdict
import shutil

src_folder = "../.."  # directory to search
folder_to_save = "/../../"  # directory to copy into (outside src_folder); keep the trailing "/" after the path
def generate_md5(fname, chunk_size=1024):
    """
    Take a file name and return the MD5 checksum of the file.
    """
    md5_hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first block of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash
        while chunk:
            md5_hash.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return md5_hash.hexdigest()
if __name__ == "__main__":
    # The dict maps each checksum to a list of file paths
    md5_dict = defaultdict(list)

    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]

    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        print("Analyzing {}".format(path))
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable is updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files with the same checksum are appended to the same list
                md5_dict[generate_md5(file_path)].append(file_path)

    # Copy one file per checksum, prefixing the name with its parent folder
    for key, val in md5_dict.items():
        head, tail = os.path.split(val[0])
        subdirname = os.path.basename(os.path.dirname(val[0]))
        new_path = shutil.copy(val[0], folder_to_save + subdirname + "_" + tail)

    print("Done")