Last active
November 22, 2024 16:11
-
-
Save jin-zhe/2efc348f58002f54e1ed90ab5323e56a to your computer and use it in GitHub Desktop.
Python script to split a directory of pdfs into smaller pdfs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Intended usage scenario:
You have a directory of pdfs, each comprising sequential image scans of
human-annotated documents (e.g. written questionnaires/forms/exams) where every
document shares the same number of pages. Each pdf may contain a different
number of such scanned documents. You want to split all these pdfs up into
smaller pdfs at fixed page index intervals such that each smaller pdf
corresponds to a single scanned document. In addition, you want to
place them under a specific output directory while ensuring no filename
collisions.
"""
__author__ = "Jin Zhe" | |
import argparse
import os
import sys
from datetime import datetime, timedelta

import PyPDF2
from tqdm import tqdm
# Generates unique filename for each pdf split | |
def generate_unique_filename():
    """Return a timestamp-based pdf filename that is unique within this process.

    The name is the current local time formatted as ``%Y%m%d%H%M%S%f``
    followed by ``.pdf``. The wall clock alone does not guarantee distinct
    values for back-to-back calls (clock resolution can be coarser than a
    microsecond), so the last issued timestamp is remembered on the function
    object and every new timestamp is forced to be strictly later than it.

    Returns:
        str: a filename such as ``20241122161100123456.pdf``.
    """
    filename_fmt = "{}.pdf"
    stamp = datetime.now()
    last_issued = getattr(generate_unique_filename, "_last_issued", None)
    # If the clock has not advanced (or went backwards) since the previous
    # call, step one microsecond past the previous value instead of reusing it.
    if last_issued is not None and stamp <= last_issued:
        stamp = last_issued + timedelta(microseconds=1)
    generate_unique_filename._last_issued = stamp
    return filename_fmt.format(stamp.strftime("%Y%m%d%H%M%S%f"))
def main():
    """Split every pdf in the input directory into fixed-size pdfs.

    Command-line arguments:
        --input_dir / -in    directory whose pdfs are split (not recursive).
        --output_dir / -out  directory receiving the splits; created if missing.
        --page_stride / -s   number of pages per split (positive int, default 1).
        --drop_last / -d     discard a trailing split shorter than the stride.

    Raises:
        SystemExit: raised by argparse when the arguments are invalid.
        IOError: if a generated output path already exists.
    """
    parser = argparse.ArgumentParser(description="Splits every pdf from the input directory into the output directory.")
    parser.add_argument('--input_dir', '-in', type=str, required=True, help='The input directory containing one or more pdfs.')
    parser.add_argument('--output_dir', '-out', type=str, required=True, help='The output directory to store the split pdfs.')
    # Not marked required so that the documented default of 1 actually applies.
    parser.add_argument('--page_stride', '-s', type=int, default=1, help='The number of pages in each split pdf.')
    parser.add_argument('--drop_last', '-d', action='store_true', default=False, help='Ignore last split if its page length is less than page stride')
    args = parser.parse_args()

    input_dir = os.path.abspath(args.input_dir)
    output_dir = os.path.abspath(args.output_dir)
    page_stride = args.page_stride
    drop_last = args.drop_last

    # A zero stride makes range() raise and a negative one yields no splits,
    # so reject both up front with a proper usage error.
    if page_stride < 1:
        parser.error('--page_stride must be a positive integer, got {}'.format(page_stride))

    # Sorted so the processing order (and therefore the timestamp order of the
    # outputs) is deterministic; extension match is case-insensitive.
    pdf_filenames = sorted(f for f in os.listdir(input_dir) if f.lower().endswith('.pdf'))
    os.makedirs(output_dir, exist_ok=True)

    # For each pdf in input directory to split
    for pdf_path in (os.path.join(input_dir, x) for x in pdf_filenames):
        # The reader pulls pages from the open handle, so the input file must
        # stay open until every split of this pdf has been written.
        with open(pdf_path, 'rb') as input_f:
            # NOTE(review): PdfFileReader/PdfFileWriter/getNumPages/getPage/addPage
            # are the legacy PyPDF2 names - confirm the installed PyPDF2 version
            # still provides them.
            pdf_in = PyPDF2.PdfFileReader(input_f)
            num_pages = pdf_in.getNumPages()
            page_indices = range(num_pages)
            splits = [page_indices[i:i + page_stride] for i in range(0, num_pages, page_stride)]

            # If last split is not full size and we indicated to drop the last
            # split. `splits` is empty for a zero-page pdf, hence the guard.
            if drop_last and splits and len(splits[-1]) < page_stride:
                del splits[-1]

            print('Splitting {} into {} pdfs'.format(pdf_path, len(splits)))

            # For each split in current pdf
            for split in tqdm(splits):
                # Add each page to new pdf
                pdf_out = PyPDF2.PdfFileWriter()
                for page_index in split:
                    pdf_out.addPage(pdf_in.getPage(page_index))

                # Determine filename for new pdf. Exclusive-create mode makes
                # the "must not already exist" check atomic with the open.
                output_path = os.path.join(output_dir, generate_unique_filename())
                try:
                    output_f = open(output_path, 'xb')
                except FileExistsError as err:
                    raise IOError("File {} already exists!".format(output_path)) from err

                # Write new pdf to file
                with output_f:
                    pdf_out.write(output_f)
            print()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment