jin-zhe · November 22, 2024 15:58
diff --git a/split_pdf.py b/split_pdf.py
 '''
 Simple script to split a PDF using PyPDF2 package in Python.
 Oftentimes we need to split an academic paper into the main paper and the
 supplementary material before submission.
 To do that, the script may be simply run as:

 `python split_pdf.py -in CVPR.pdf -s 15 -o`

 For a 20-page input, this produces 2 files: 'CVPR.01-14.pdf' and 'CVPR.15-20.pdf',
 whose starting page numbers are 1 and 15 respectively.

 This script supports any arbitrary number of splits using the `-s` argument,
 which can be a comma-separated list of ascending page numbers. Each page number
 represents the starting page of a new split. E.g. `-s 7,9,15`
 '''
 # Author credit, exposed as module-level metadata.
 __author__ = "Jin Zhe"

 import argparse
 from pathlib import Path

 import PyPDF2


 def extract_pages(pdf, output_path, page_start, page_end, overwrite_file):
 	'''
 	Write pages `page_start`..`page_end-1` of `pdf` into a new PDF at `output_path`.

 	Args:
 		pdf: Opened PyPDF2 reader; only `getPage(index)` is called on it.
 		output_path: `pathlib.Path` of the PDF file to create.
 		page_start: First page to extract (1-based, inclusive).
 		page_end: Page at which extraction stops (1-based, exclusive).
 		overwrite_file: When false, refuse to replace an existing `output_path`.

 	Raises:
 		IOError: If `output_path` already exists and `overwrite_file` is false.
 	'''
 	# Note: page_end is an exclusive end
 	print(f'Extracting pages {page_start}-{page_end-1}...')
 	if not overwrite_file and output_path.is_file():
 		raise IOError(f'File {output_path} already exists! Pages not extracted.')

 	# Gather the pages before touching the disk, so a failure while reading
 	# never leaves behind an empty (or truncated, when overwriting) output file.
 	# NOTE(review): PdfFileWriter/addPage/getPage are the legacy PyPDF2 names --
 	# confirm they are still provided by the installed PyPDF2 release.
 	pdf_out = PyPDF2.PdfFileWriter()
 	for page_index in range(page_start-1, page_end-1):	# page numbers -> 0-based page indices
 		pdf_out.addPage(pdf.getPage(page_index))

 	# The context manager closes the handle even if writing raises.
 	with output_path.open('wb') as output_f:
 		pdf_out.write(output_f)


 def get_output_filepath(input_filepath, start_page, end_page, num_digits):
 	'''
 	Build the path of a split file, placed in the same directory as the input PDF.

 	The file name is `<input stem>.<start>-<end>.pdf`, where both page numbers
 	are left-padded with zeros to `num_digits` characters.
 	'''
 	padded = [f'{page:0>{num_digits}}' for page in (start_page, end_page)]
 	filename = '{}.{}-{}.pdf'.format(input_filepath.stem, *padded)
 	return input_filepath.parent / filename


 def get_args():
 	'''
 	Parse and validate the command-line arguments.

 	Returns:
 		argparse.Namespace with `pdf_filepath` (Path), `split_pages` (list of
 		positive, strictly ascending ints) and `overwrite` (bool).

 	Invalid input is reported through `parser.error`, which prints the usage
 	message and exits with status 2.
 	'''
 	parser = argparse.ArgumentParser(description='A simple script to split a PDF file into 2 parts.')
 	# Both options are mandatory: without them the script cannot do anything useful.
 	parser.add_argument('--pdf_filepath', '-in', type=Path, required=True, help='The pdf file to split.')
 	parser.add_argument('--split_pages', '-s', type=str, required=True, help='Comma separated page numbers indicating starting page numbers of new splits. 1 split page number will produce 2 splits, 2 split page numbers produces 3 splits and so on.')
 	parser.add_argument('--overwrite', '-o', action='store_true', help='Ignore and overwrite existing files for output.')
 	args = parser.parse_args()

 	try:
 		args.split_pages = [int(s.strip()) for s in args.split_pages.split(',')]
 	except ValueError:
 		# The assignment above did not happen, so split_pages is still the raw string.
 		parser.error(f'Split pages must be comma separated integers, got {args.split_pages!r}.')

 	# Explicit checks instead of `assert`, which is stripped under `python -O`.
 	if any(sp < 1 for sp in args.split_pages):
 		parser.error('Split pages must be positive page numbers!')
 	# Pair every split page with its successor to verify strict ordering.
 	if any(prev >= nxt for prev, nxt in zip(args.split_pages, args.split_pages[1:])):
 		parser.error('Split pages must be in ascending order!')
 	return args


 def main():
 	'''
 	Split the input PDF at every requested split page.

 	Each split page starts a new output file; the pages before the first split
 	page and the pages from the last split page to the end of the document
 	form the first and last output files.

 	Raises:
 		ValueError: If a split page lies beyond the end of the document.
 	'''
 	args = get_args()
 	# `with` guarantees the input handle is released even when a split fails.
 	with args.pdf_filepath.open('rb') as input_f:
 		pdf_in = PyPDF2.PdfFileReader(input_f)
 		num_pages = pdf_in.getNumPages()
 		num_digits = len(str(num_pages))	# zero-pad width for page numbers in file names

 		# Raise instead of assert: assertions are stripped under `python -O`.
 		# NOTE(review): num_pages+1 is accepted and yields an empty final split --
 		# confirm whether that is intended.
 		if any(sp > num_pages+1 for sp in args.split_pages):
 			raise ValueError('All split page numbers should be within the page limit!')

 		# Consecutive boundaries delimit each split as [start, end) in page numbers.
 		boundaries = [1, *args.split_pages, num_pages+1]
 		for start_page, end_page in zip(boundaries, boundaries[1:]):
 			output_filepath = get_output_filepath(args.pdf_filepath, start_page, end_page-1, num_digits)
 			extract_pages(pdf_in, output_filepath, start_page, end_page, args.overwrite)


 if __name__ == '__main__':
 	# Run the splitter only when this file is executed as a script.
 	main()
	'''
	Simple script to split a PDF using PyPDF2 package in Python.
	Oftentimes we need to split an academic paper into the main paper and the
	supplementary material before submission.
	To do that, the script may be simply run as:

	`python split_pdf.py -in CVPR.pdf -s 15 -o`

	For a 20-page input, this produces 2 files: 'CVPR.01-14.pdf' and 'CVPR.15-20.pdf',
	whose starting page numbers are 1 and 15 respectively.

	This script supports any arbitrary number of splits using the `-s` argument,
	which can be a comma-separated list of ascending page numbers. Each page number
	represents the starting page of a new split. E.g. `-s 7,9,15`
	'''
	# Author credit, exposed as module-level metadata.
	__author__ = "Jin Zhe"

	import argparse
	from pathlib import Path

	import PyPDF2


	def extract_pages(pdf, output_path, page_start, page_end, overwrite_file):
		'''
		Write pages `page_start`..`page_end-1` of `pdf` into a new PDF at `output_path`.

		Args:
			pdf: Opened PyPDF2 reader; only `getPage(index)` is called on it.
			output_path: `pathlib.Path` of the PDF file to create.
			page_start: First page to extract (1-based, inclusive).
			page_end: Page at which extraction stops (1-based, exclusive).
			overwrite_file: When false, refuse to replace an existing `output_path`.

		Raises:
			IOError: If `output_path` already exists and `overwrite_file` is false.
		'''
		# Note: page_end is an exclusive end
		print(f'Extracting pages {page_start}-{page_end-1}...')
		if not overwrite_file and output_path.is_file():
			raise IOError(f'File {output_path} already exists! Pages not extracted.')

		# Gather the pages before touching the disk, so a failure while reading
		# never leaves behind an empty (or truncated, when overwriting) output file.
		# NOTE(review): PdfFileWriter/addPage/getPage are the legacy PyPDF2 names --
		# confirm they are still provided by the installed PyPDF2 release.
		pdf_out = PyPDF2.PdfFileWriter()
		for page_index in range(page_start-1, page_end-1):	# page numbers -> 0-based page indices
			pdf_out.addPage(pdf.getPage(page_index))

		# The context manager closes the handle even if writing raises.
		with output_path.open('wb') as output_f:
			pdf_out.write(output_f)


	def get_output_filepath(input_filepath, start_page, end_page, num_digits):
		'''
		Build the path of a split file, placed in the same directory as the input PDF.

		The file name is `<input stem>.<start>-<end>.pdf`, where both page numbers
		are left-padded with zeros to `num_digits` characters.
		'''
		# The body is indented under the `def` again; the flattened form was not valid Python.
		start_page = '{:0>{}}'.format(start_page, num_digits)
		end_page = '{:0>{}}'.format(end_page, num_digits)
		return input_filepath.parent / f'{input_filepath.stem}.{start_page}-{end_page}.pdf'


	def get_args():
		'''
		Parse and validate the command-line arguments.

		Returns:
			argparse.Namespace with `pdf_filepath` (Path), `split_pages` (list of
			positive, strictly ascending ints) and `overwrite` (bool).

		Invalid input is reported through `parser.error`, which prints the usage
		message and exits with status 2.
		'''
		parser = argparse.ArgumentParser(description='A simple script to split a PDF file into 2 parts.')
		# Both options are mandatory: without them the script cannot do anything useful.
		parser.add_argument('--pdf_filepath', '-in', type=Path, required=True, help='The pdf file to split.')
		parser.add_argument('--split_pages', '-s', type=str, required=True, help='Comma separated page numbers indicating starting page numbers of new splits. 1 split page number will produce 2 splits, 2 split page numbers produces 3 splits and so on.')
		parser.add_argument('--overwrite', '-o', action='store_true', help='Ignore and overwrite existing files for output.')
		args = parser.parse_args()

		try:
			args.split_pages = [int(s.strip()) for s in args.split_pages.split(',')]
		except ValueError:
			# The assignment above did not happen, so split_pages is still the raw string.
			parser.error(f'Split pages must be comma separated integers, got {args.split_pages!r}.')

		# Explicit checks instead of `assert`, which is stripped under `python -O`.
		if any(sp < 1 for sp in args.split_pages):
			parser.error('Split pages must be positive page numbers!')
		# Pair every split page with its successor to verify strict ordering.
		if any(prev >= nxt for prev, nxt in zip(args.split_pages, args.split_pages[1:])):
			parser.error('Split pages must be in ascending order!')
		return args


	def main():
		'''
		Split the input PDF at every requested split page.

		Each split page starts a new output file; the pages before the first split
		page and the pages from the last split page to the end of the document
		form the first and last output files.

		Raises:
			ValueError: If a split page lies beyond the end of the document.
		'''
		args = get_args()
		# `with` guarantees the input handle is released even when a split fails.
		with args.pdf_filepath.open('rb') as input_f:
			pdf_in = PyPDF2.PdfFileReader(input_f)
			num_pages = pdf_in.getNumPages()
			num_digits = len(str(num_pages))	# zero-pad width for page numbers in file names

			# Raise instead of assert: assertions are stripped under `python -O`.
			# NOTE(review): num_pages+1 is accepted and yields an empty final split --
			# confirm whether that is intended.
			if any(sp > num_pages+1 for sp in args.split_pages):
				raise ValueError('All split page numbers should be within the page limit!')

			# Consecutive boundaries delimit each split as [start, end) in page numbers.
			boundaries = [1, *args.split_pages, num_pages+1]
			for start_page, end_page in zip(boundaries, boundaries[1:]):
				output_filepath = get_output_filepath(args.pdf_filepath, start_page, end_page-1, num_digits)
				extract_pages(pdf_in, output_filepath, start_page, end_page, args.overwrite)


	if __name__ == '__main__':
		# Run the splitter only when this file is executed as a script.
		main()