toddbirchard · August 26, 2023 01:26
diff --git a/config.py b/config.py
 """Script configuration."""
 from os import path


 # Absolute path of the directory that holds this config module; used to anchor
 # project-relative paths so they do not depend on the working directory.
 BASE_DIR = path.abspath(path.dirname(__file__))


 # CSV exported from `deadlinkchecker`. Built with `path.join` so the path
 # separator is correct on every platform.
 CSV_EXPORTED_BROKEN_LINKS = path.join(BASE_DIR, "data", "brokenlinks.csv")

 # Local directories containing images.
 # NOTE: `path.abspath` normalizes the value, which drops the trailing slash.
 IMAGE_DIRECTORY_1 = path.abspath('/Users/username/Dropbox/Sites/_Personal/hackersslackers/')
 IMAGE_DIRECTORY_2 = path.abspath('/Users/username/projects/hackers/content/images/')

 # Local directory to save images to.
 IMAGE_OUTPUT_DIRECTORY = path.abspath("/Users/username/Desktop/python-collected-images/")
diff --git a/find_dead_link_images.py b/find_dead_link_images.py
 """Collects locally stored images listed in report generated from `https://www.deadlinkchecker.com/`."""
 from os import path, mkdir, walk, listdir
 import logging
 import pandas as pd
 from pandas import DataFrame
 from shutil import copyfile

 from config import (
    CSV_EXPORTED_BROKEN_LINKS,
    IMAGE_DIRECTORY_1,
    IMAGE_DIRECTORY_2,
    IMAGE_OUTPUT_DIRECTORY,
 )

 # Emit records at INFO level and above, prefixed with level name and timestamp.
 logging.basicConfig(
    format='%(levelname)s %(asctime)s: %(message)s',
    level=logging.INFO,
 )


 def read_csv_of_images() -> DataFrame:
    """Read the broken-link report and reduce it to broken image links.

    Rows without an `anchor` value are dropped, as are rows whose anchor does
    not contain "img". The image filename is the last path segment of `URL`
    (trailing slashes ignored). The reduced table is written back to
    `CSV_EXPORTED_BROKEN_LINKS`, replacing the raw export.

    Because the raw export is overwritten, a later run finds the reduced
    table instead; that case is detected and the table is returned as-is
    rather than failing on the missing `anchor` column.

    :returns: DataFrame with the columns `filename` and `Linked`.
    """
    links_df = pd.read_csv(CSV_EXPORTED_BROKEN_LINKS)
    output_columns = ["filename", "Linked"]

    # Already reduced by a previous run: nothing left to filter or rewrite.
    if "anchor" not in links_df.columns and "filename" in links_df.columns:
        return links_df[output_columns]

    links_df = links_df.dropna(subset=["anchor"])
    # `.copy()` detaches the filtered frame so that adding a column below does
    # not trigger pandas' SettingWithCopyWarning.
    links_df = links_df[links_df["anchor"].str.contains("img")].copy()
    # Vectorized string ops propagate a missing URL as NaN instead of raising.
    links_df["filename"] = (
        links_df["URL"].str.rstrip("/").str.rsplit("/", n=1).str[-1]
    )
    links_df = links_df[output_columns]
    links_df.to_csv(CSV_EXPORTED_BROKEN_LINKS)
    return links_df


 def search_paths_for_images(links_df: DataFrame) -> int:
    """Copy every locally stored image named in `links_df` to the output folder.

    Both image directories are walked recursively; any file whose name equals
    a value in the `filename` column is copied into `IMAGE_OUTPUT_DIRECTORY`,
    which is created first if it does not exist. Files sharing a name
    overwrite one another in the output folder.

    :param links_df: Table with a `filename` column of image names to collect.
    :returns: Number of entries in the output folder after copying.
    """
    if not path.exists(IMAGE_OUTPUT_DIRECTORY):
        mkdir(IMAGE_OUTPUT_DIRECTORY)

    # Build the lookup once so each file costs a single set membership test.
    wanted_names = set(links_df["filename"])

    # The loop variable must not be named `path`: that would shadow `os.path`.
    for image_directory in (IMAGE_DIRECTORY_1, IMAGE_DIRECTORY_2):
        for dirpath, _subdirs, files in walk(image_directory):
            for name in files:
                if name not in wanted_names:
                    continue
                source_file = path.join(dirpath, name)
                logging.info("Found file: %s", source_file)
                # Join with a separator so the copy lands inside the folder.
                copyfile(source_file, path.join(IMAGE_OUTPUT_DIRECTORY, name))
    return len(listdir(IMAGE_OUTPUT_DIRECTORY))


 def init_script():
    """Run the whole job: parse the link report, then gather matching images.

    :returns: Summary message containing the count reported by the image search.
    """
    image_count = search_paths_for_images(read_csv_of_images())
    return f"Found {image_count} files"

                                                           
 if __name__ == "__main__":
    # Log the summary; the message returned by `init_script` was otherwise discarded.
    logging.info(init_script())
	"""Script configuration."""
	from os import path


	# Absolute path of the directory that holds this config module; used to anchor
	# project-relative paths so they do not depend on the working directory.
	BASE_DIR = path.abspath(path.dirname(__file__))


	# CSV exported from `deadlinkchecker`. Built with `path.join` so the path
	# separator is correct on every platform.
	CSV_EXPORTED_BROKEN_LINKS = path.join(BASE_DIR, "data", "brokenlinks.csv")

	# Local directories containing images.
	# NOTE: `path.abspath` normalizes the value, which drops the trailing slash.
	IMAGE_DIRECTORY_1 = path.abspath('/Users/username/Dropbox/Sites/_Personal/hackersslackers/')
	IMAGE_DIRECTORY_2 = path.abspath('/Users/username/projects/hackers/content/images/')

	# Local directory to save images to.
	IMAGE_OUTPUT_DIRECTORY = path.abspath("/Users/username/Desktop/python-collected-images/")
	"""Collects locally stored images listed in report generated from `https://www.deadlinkchecker.com/`."""
	from os import path, mkdir, walk, listdir
	import logging
	import pandas as pd
	from pandas import DataFrame
	from shutil import copyfile

	from config import (
	CSV_EXPORTED_BROKEN_LINKS,
	IMAGE_DIRECTORY_1,
	IMAGE_DIRECTORY_2,
	IMAGE_OUTPUT_DIRECTORY,
	)

	# Emit records at INFO level and above, prefixed with level name and timestamp.
	logging.basicConfig(
	    format='%(levelname)s %(asctime)s: %(message)s',
	    level=logging.INFO,
	)


	def read_csv_of_images() -> DataFrame:
	    """Read the broken-link report and reduce it to broken image links.

	    Rows without an `anchor` value are dropped, as are rows whose anchor does
	    not contain "img". The image filename is the last path segment of `URL`
	    (trailing slashes ignored). The reduced table is written back to
	    `CSV_EXPORTED_BROKEN_LINKS`, replacing the raw export.

	    Because the raw export is overwritten, a later run finds the reduced
	    table instead; that case is detected and the table is returned as-is
	    rather than failing on the missing `anchor` column.

	    :returns: DataFrame with the columns `filename` and `Linked`.
	    """
	    links_df = pd.read_csv(CSV_EXPORTED_BROKEN_LINKS)
	    output_columns = ["filename", "Linked"]

	    # Already reduced by a previous run: nothing left to filter or rewrite.
	    if "anchor" not in links_df.columns and "filename" in links_df.columns:
	        return links_df[output_columns]

	    links_df = links_df.dropna(subset=["anchor"])
	    # `.copy()` detaches the filtered frame so that adding a column below does
	    # not trigger pandas' SettingWithCopyWarning.
	    links_df = links_df[links_df["anchor"].str.contains("img")].copy()
	    # Vectorized string ops propagate a missing URL as NaN instead of raising.
	    links_df["filename"] = (
	        links_df["URL"].str.rstrip("/").str.rsplit("/", n=1).str[-1]
	    )
	    links_df = links_df[output_columns]
	    links_df.to_csv(CSV_EXPORTED_BROKEN_LINKS)
	    return links_df


	def search_paths_for_images(links_df: DataFrame) -> int:
	    """Copy every locally stored image named in `links_df` to the output folder.

	    Both image directories are walked recursively; any file whose name equals
	    a value in the `filename` column is copied into `IMAGE_OUTPUT_DIRECTORY`,
	    which is created first if it does not exist. Files sharing a name
	    overwrite one another in the output folder.

	    :param links_df: Table with a `filename` column of image names to collect.
	    :returns: Number of entries in the output folder after copying.
	    """
	    if not path.exists(IMAGE_OUTPUT_DIRECTORY):
	        mkdir(IMAGE_OUTPUT_DIRECTORY)

	    # Build the lookup once so each file costs a single set membership test.
	    wanted_names = set(links_df["filename"])

	    # The loop variable must not be named `path`: that would shadow `os.path`.
	    for image_directory in (IMAGE_DIRECTORY_1, IMAGE_DIRECTORY_2):
	        for dirpath, _subdirs, files in walk(image_directory):
	            for name in files:
	                if name not in wanted_names:
	                    continue
	                source_file = path.join(dirpath, name)
	                logging.info("Found file: %s", source_file)
	                # Join with a separator so the copy lands inside the folder.
	                copyfile(source_file, path.join(IMAGE_OUTPUT_DIRECTORY, name))
	    return len(listdir(IMAGE_OUTPUT_DIRECTORY))


	def init_script():
	    """Run the whole job: parse the link report, then gather matching images.

	    :returns: Summary message containing the count reported by the image search.
	    """
	    links_df = read_csv_of_images()
	    files = search_paths_for_images(links_df)
	    return f"Found {files} files"


	if __name__ == "__main__":
	    # Log the summary; the message returned by `init_script` was otherwise discarded.
	    logging.info(init_script())