Last active
August 26, 2023 01:26
-
-
Save toddbirchard/5847b34da2a2aa2c75fd57b4e076a892 to your computer and use it in GitHub Desktop.
Collects locally stored images listed in report generated from `https://www.deadlinkchecker.com/`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script configuration.""" | |
from os import path | |
# Absolute path of the directory that holds this configuration module.
BASE_DIR = path.dirname(path.abspath(__file__))

# CSV report exported from `deadlinkchecker`.
CSV_EXPORTED_BROKEN_LINKS = f"{BASE_DIR}/data/brokenlinks.csv"

# Local directories containing images.
# NOTE: path.abspath() normalizes its argument, so the stored values carry no
# trailing slash.
IMAGE_DIRECTORY_1 = path.abspath(
    "/Users/username/Dropbox/Sites/_Personal/hackersslackers/"
)
IMAGE_DIRECTORY_2 = path.abspath(
    "/Users/username/projects/hackers/content/images/"
)

# Local directory that images are saved to.
IMAGE_OUTPUT_DIRECTORY = path.abspath(
    "/Users/username/Desktop/python-collected-images/"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Collects locally stored images listed in report generated from `https://www.deadlinkchecker.com/`.""" | |
from os import path, mkdir, walk, listdir | |
import logging | |
import pandas as pd | |
from pandas import DataFrame | |
from shutil import copyfile | |
from config import ( | |
CSV_EXPORTED_BROKEN_LINKS, | |
IMAGE_DIRECTORY_1, | |
IMAGE_DIRECTORY_2, | |
IMAGE_OUTPUT_DIRECTORY, | |
) | |
# Configure the root logger once at import time: INFO and above are emitted,
# each record prefixed with its level name and timestamp.
logging.basicConfig(
    format="%(levelname)s %(asctime)s: %(message)s",
    level=logging.INFO,
)
def read_csv_of_images() -> DataFrame:
    """Read the link-checker CSV export and reduce it to broken image links.

    Rows without an ``anchor`` value are dropped and only rows whose anchor
    contains ``"img"`` are kept. A ``filename`` column is derived from the last
    path segment of each ``URL`` (trailing slashes are stripped first). The
    reduced table is then written back to ``CSV_EXPORTED_BROKEN_LINKS``.

    Because the export is overwritten with the reduced table, a file that
    already has ``filename`` but no longer has ``anchor`` is treated as
    previously processed and returned as-is rather than raising ``KeyError``.

    :returns: DataFrame with the columns ``filename`` and ``Linked``.
    """
    links_df = pd.read_csv(CSV_EXPORTED_BROKEN_LINKS)

    # A previous run has already replaced the raw export with the reduced table.
    columns = links_df.columns
    if "filename" in columns and "anchor" not in columns:
        return links_df[["filename", "Linked"]]

    links_df = links_df.dropna(subset=["anchor"])
    is_image = links_df["anchor"].str.contains("img")
    # Copy the filtered rows so the new column is added to an independent
    # frame instead of a slice of the original one.
    links_df = links_df[is_image].copy()
    links_df["filename"] = (
        links_df["URL"].str.rstrip("/").str.split("/").str[-1]
    )
    links_df = links_df[["filename", "Linked"]]
    links_df.to_csv(CSV_EXPORTED_BROKEN_LINKS)
    return links_df
def search_paths_for_images(links_df: DataFrame) -> int:
    """Find the listed images locally and copy them to the output folder.

    Walks ``IMAGE_DIRECTORY_1`` and ``IMAGE_DIRECTORY_2`` recursively. Every
    file whose name appears in ``links_df["filename"]`` is copied into
    ``IMAGE_OUTPUT_DIRECTORY``, which is created first if it does not exist.
    A later match with the same file name overwrites an earlier copy.

    :param links_df: DataFrame with a ``filename`` column of image file names.
    :returns: Number of entries in ``IMAGE_OUTPUT_DIRECTORY`` after copying.
    """
    if not path.exists(IMAGE_OUTPUT_DIRECTORY):
        mkdir(IMAGE_OUTPUT_DIRECTORY)

    # Build the lookup once; set membership avoids rescanning the whole
    # column for every file found on disk.
    wanted_names = set(links_df["filename"])

    # The loop variable must not be called `path`: that would shadow the
    # `os.path` module used for join() below.
    for image_directory in (IMAGE_DIRECTORY_1, IMAGE_DIRECTORY_2):
        for dirpath, _subdirs, files in walk(image_directory):
            for name in files:
                if name not in wanted_names:
                    continue
                source_file = path.join(dirpath, name)
                logging.info(f"Found file: {source_file}")
                # join() supplies the separator between directory and name.
                copyfile(source_file, path.join(IMAGE_OUTPUT_DIRECTORY, name))

    return len(listdir(IMAGE_OUTPUT_DIRECTORY))
def init_script():
    """Run the full pipeline and report how many files were collected.

    Loads the broken-image table with ``read_csv_of_images()``, passes it to
    ``search_paths_for_images()``, and returns a one-line summary built from
    the count that function returns.
    """
    files = search_paths_for_images(read_csv_of_images())
    return f"Found {files} files"
if __name__ == "__main__":
    # init_script() only returns its summary string; log it so the outcome of
    # the run is actually visible instead of being discarded.
    logging.info(init_script())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment