Created
November 22, 2024 15:58
-
-
Save jin-zhe/3272bf05eb7cdfca5a2cd857bced8650 to your computer and use it in GitHub Desktop.
Simple convenience script to split a PDF using PyPDF2 package in Python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Simple script to split a PDF using the PyPDF2 package in Python.
Oftentimes we need to split an academic paper into the main paper and the
supplementary material before submission.
To do that, the script may simply be run as:
`python split_pdf.py -in CVPR.pdf -s 15 -o`
This produces 2 files: 'CVPR.01-14.pdf' and 'CVPR.15-20.pdf', where the starting
page numbers for each split file are 1 and 15 respectively.
This script supports any arbitrary number of splits using the `-s` argument,
which can be a comma-separated list of ascending page numbers. Each page number
represents the starting page of a new split. E.g. `-s 7,9,15`
'''
__author__ = "Jin Zhe" | |
import argparse | |
from pathlib import Path | |
import PyPDF2 | |
def extract_pages(pdf, output_path, page_start, page_end, overwrite_file):
    """Write pages ``page_start`` .. ``page_end - 1`` of ``pdf`` to a new PDF.

    Args:
        pdf: PyPDF2 reader object for the source document.
        output_path: ``pathlib.Path`` of the PDF file to create.
        page_start: 1-based number of the first page to copy (inclusive).
        page_end: 1-based page number at which to stop (exclusive).
        overwrite_file: When false, refuse to replace an existing file.

    Raises:
        IOError: If ``output_path`` already exists and ``overwrite_file`` is
            false.
    """
    # Note: page_end is an exclusive end
    print(f'Extracting pages {page_start}-{page_end-1}...')
    if not overwrite_file and output_path.is_file():
        raise IOError(f'File {output_path} already exists! Pages not extracted.')

    # Newer PyPDF2 releases expose PdfWriter/add_page and reject the legacy
    # camelCase names; older releases only have the legacy names. Prefer the
    # new spelling and fall back so both generations keep working.
    writer_cls = getattr(PyPDF2, 'PdfWriter', None) or PyPDF2.PdfFileWriter
    pdf_out = writer_cls()
    add_page = getattr(pdf_out, 'add_page', None) or pdf_out.addPage

    # Gather every page *before* touching the output file, so a bad page
    # index cannot leave a truncated or empty file behind.
    for page_index in range(page_start - 1, page_end - 1):  # page number -> index
        add_page(pdf.pages[page_index])

    # The context manager closes the file even if writing fails.
    with output_path.open('wb') as output_f:
        pdf_out.write(output_f)
def get_output_filepath(input_filepath, start_page, end_page, num_digits):
    """Build the path of one split file, placed next to the input file.

    The file name has the form ``<stem>.<start>-<end>.pdf`` where both page
    numbers are left-padded with zeros to at least ``num_digits`` characters.
    """
    pad_spec = f'0>{num_digits}'  # right-align, fill with '0'
    first_label = format(start_page, pad_spec)
    last_label = format(end_page, pad_spec)
    split_name = f'{input_filepath.stem}.{first_label}-{last_label}.pdf'
    return input_filepath.parent / split_name
def _parse_split_pages(value):
    """Convert a string such as ``'7,9,15'`` into a validated list of ints."""
    try:
        pages = [int(token.strip()) for token in value.split(',')]
    except ValueError:
        raise argparse.ArgumentTypeError(
            f'Split pages must be comma separated integers, got {value!r}')
    # Page 1 always starts the first split, so a split page of 1 (or less)
    # would only produce an empty or nonsensical first file.
    if any(page <= 1 for page in pages):
        raise argparse.ArgumentTypeError('Split pages must be greater than 1!')
    if any(current >= following for current, following in zip(pages, pages[1:])):
        raise argparse.ArgumentTypeError('Split pages must be in ascending order!')
    return pages


def get_args():
    """Parse and validate the command-line arguments.

    Returns:
        argparse.Namespace with ``pdf_filepath`` (``pathlib.Path``),
        ``split_pages`` (strictly ascending list of ints, each > 1) and
        ``overwrite`` (bool).

    Invalid or missing arguments end the program through argparse's usual
    usage error (exit status 2) instead of an assertion or a traceback.
    """
    parser = argparse.ArgumentParser(description='A simple script to split a PDF file into 2 or more parts.')
    parser.add_argument('--pdf_filepath', '-in', type=Path, required=True, help='The pdf file to split.')
    parser.add_argument('--split_pages', '-s', type=_parse_split_pages, required=True, help='Comma separated page numbers indicating starting page numbers of new splits. 1 split page number will produce 2 splits, 2 split page numbers produces 3 splits and so on.')
    parser.add_argument('--overwrite', '-o', action='store_true', help='Ignore and overwrite existing files for output.')
    return parser.parse_args()
def main():
    """Split the input PDF at every requested page and write one file per split.

    The first split always starts at page 1; each entry of ``split_pages``
    starts a new split, and the last split runs to the end of the document.

    Raises:
        ValueError: If a split page lies outside ``2 .. num_pages``.
        IOError: From ``extract_pages`` when an output file already exists and
            overwriting was not requested.
    """
    args = get_args()
    # Keep the input open for the whole run: extraction reads pages through
    # the reader, and the context manager closes the file even on error.
    with args.pdf_filepath.open('rb') as input_f:
        # Newer PyPDF2 releases name the class PdfReader and reject the legacy
        # PdfFileReader/getNumPages names; fall back for older releases.
        reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader
        pdf_in = reader_cls(input_f)
        num_pages = len(pdf_in.pages)
        num_digits = len(str(num_pages))  # zero-pad width for file names

        # A split page of 1 or of num_pages + 1 would yield an empty split,
        # so both are rejected here. A real exception is used instead of
        # assert so the check survives `python -O`.
        out_of_range = [sp for sp in args.split_pages if not 1 < sp <= num_pages]
        if out_of_range:
            raise ValueError(
                'All split page numbers should be within the page limit! '
                f'Valid range is 2 to {num_pages}; got {out_of_range}.')

        # Consecutive boundaries delimit each split: [start, next_start).
        boundaries = [1, *args.split_pages, num_pages + 1]
        for start_page, next_start in zip(boundaries, boundaries[1:]):
            output_filepath = get_output_filepath(args.pdf_filepath, start_page, next_start - 1, num_digits)
            extract_pages(pdf_in, output_filepath, start_page, next_start, args.overwrite)
# Run the splitter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment