Last active
May 3, 2025 17:54
-
-
Save duganchen/1e917c11fce44267b4c4 to your computer and use it in GitHub Desktop.
dupe_files.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""
Python script that takes a directory and displays the files in that directory
tree that have the same size, grouped by size.

Useful for hunting down duplicate files.
"""
from argparse import ArgumentParser, ArgumentTypeError | |
from os import walk | |
from os.path import getsize | |
from pathlib import Path | |
from typing import DefaultDict, Generator, List | |
def main() -> None:
    """Parse the command line and print groups of same-sized files.

    Walks the directory given as the ``starting_point`` argument, buckets
    every file yielded by ``iter_files`` by its size in bytes, and prints
    each size shared by more than one file (smallest size first) followed by
    the tab-indented paths of the files in that group.
    """
    # The first positional parameter of ArgumentParser is ``prog``; pass the
    # summary as ``description`` so it is shown as help text instead of
    # replacing the program name in the usage line.
    parser = ArgumentParser(
        description="Find files with the same sizes in a directory tree.",
    )
    parser.add_argument(
        "starting_point",
        type=directory,
        help="The directory to start looking in.",
    )
    args = parser.parse_args()

    # getsize() returns a byte count, so the keys are ints.
    file_sizes = DefaultDict[int, List[Path]](list)
    for path in iter_files(args.starting_point):
        file_sizes[getsize(path)].append(path)

    for size in sorted(file_sizes):
        paths = file_sizes[size]
        # A size seen only once cannot belong to a duplicate.
        if len(paths) > 1:
            print(sizeof_fmt(size))
            for path in paths:
                print(f"\t{path}")
def iter_files(starting_point: str):
    """Yield a ``Path`` for every regular, non-symlink file under a directory.

    The tree rooted at ``starting_point`` is traversed with ``os.walk``;
    entries that are not files, or that are symbolic links, are skipped.
    """
    for root, _subdirs, names in walk(starting_point):
        parent = Path(root)
        for name in names:
            candidate = parent / name
            # Skip anything that is not a plain file, and skip symlinks even
            # when they point at a file.
            if not candidate.is_file() or candidate.is_symlink():
                continue
            yield candidate
def directory(path: str) -> Path:
    """Argparse ``type`` callback that converts ``path`` to a ``Path``.

    Returns the ``Path`` when it refers to an existing directory and raises
    ``ArgumentTypeError`` otherwise.
    """
    candidate = Path(path)
    if candidate.is_dir():
        return candidate
    raise ArgumentTypeError("not a directory")
def sizeof_fmt(num: float) -> str:
    """Format a byte count as a short string with one decimal place.

    The value is divided by 1024 until it drops below 1024, stepping through
    the units bytes, KB, MB and GB; anything still larger is reported in TB.
    """
    # http://stackoverflow.com/a/1094933/240515
    units = ("bytes", "KB", "MB", "GB")
    value = num
    index = 0
    while index < len(units):
        if value < 1024.0:
            return f"{value:3.1f}{units[index]}"
        value /= 1024.0
        index += 1
    # Ran out of smaller units: whatever is left is expressed in terabytes.
    return f"{value:3.1f}TB"
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment