Skip to content

Instantly share code, notes, and snippets.

@dijikul
Last active March 19, 2025 01:28
Show Gist options
  • Save dijikul/ca27f4d8fbcecc12c595ffbd9df080e3 to your computer and use it in GitHub Desktop.
Save dijikul/ca27f4d8fbcecc12c595ffbd9df080e3 to your computer and use it in GitHub Desktop.
Merge Multiple PDFs into PDFs under 500 MB each (JFK File Research)
import os
import PyPDF2
def get_pdf_size(pdf_writer):
    """Return the serialized size of *pdf_writer* in megabytes.

    The writer is rendered into an in-memory buffer (nothing is written to
    disk) and the number of bytes produced is converted using
    1 MB = 1024 * 1024 bytes.

    Args:
        pdf_writer: Any object exposing ``write(stream)`` that emits bytes
            to a binary stream, such as the ``PyPDF2.PdfWriter`` used by
            ``merge_pdfs``.

    Returns:
        The size of the serialized output as a float number of megabytes.
    """
    import io

    # The context manager releases the (potentially very large) buffer as
    # soon as it has been measured instead of waiting for garbage collection.
    with io.BytesIO() as buffer:
        pdf_writer.write(buffer)
        # Seek to the end instead of trusting the current position: the
        # result stays correct even if the writer repositions the stream.
        size_bytes = buffer.seek(0, io.SEEK_END)
    return size_bytes / (1024 * 1024)  # Convert bytes to MB
def _build_writer(pdf_paths):
    """Return a new ``PdfWriter`` holding every page of *pdf_paths*, in order."""
    writer = PyPDF2.PdfWriter()
    for path in pdf_paths:
        for page in PyPDF2.PdfReader(path).pages:
            writer.add_page(page)
    return writer


def _save_part(writer, output_file):
    """Write *writer* to *output_file* and report the file that was created."""
    with open(output_file, "wb") as output_pdf:
        writer.write(output_pdf)
    print(f"Created: {output_file}")


def merge_pdfs(input_directory, output_prefix, size_limit_mb=500):
    """Merge the PDFs of a directory into numbered, size-limited parts.

    Files whose name ends in ``.pdf`` (case-insensitive) are processed in
    sorted name order and written to ``{output_prefix}_part{N}.pdf`` with N
    starting at 1. An input file is added to the current part only while
    the merged result stays below ``size_limit_mb``; the file that would
    reach the limit starts the next part instead. Input files are never
    split, so a single file that is itself at or above the limit ends up
    alone in a part that exceeds the limit.

    Args:
        input_directory: Directory that is scanned (non-recursively) for
            PDF files.
        output_prefix: Path prefix of the generated part files.
        size_limit_mb: Size limit per part in megabytes, as measured by
            ``get_pdf_size``.

    Returns:
        None. Progress and per-file errors are printed to stdout.

    Raises:
        OSError: If the directory cannot be listed or a part cannot be
            written. Input files that cannot be read or serialized are
            reported and skipped instead.
    """
    pdf_files = sorted(
        f for f in os.listdir(input_directory) if f.lower().endswith(".pdf")
    )
    if not pdf_files:
        print("No PDF files found in the directory.")
        return

    print(f"Merging {len(pdf_files)} PDF files...")

    part_number = 1
    output_file = f"{output_prefix}_part{part_number}.pdf"
    part_paths = []  # input files already committed to the current part
    pdf_writer = PyPDF2.PdfWriter()  # committed files plus the candidate

    for pdf in pdf_files:
        pdf_path = os.path.join(input_directory, pdf)
        try:
            # Add the file tentatively and measure the result. Measuring is
            # kept inside the try because serializing is part of finding
            # out whether this input is usable.
            for page in PyPDF2.PdfReader(pdf_path).pages:
                pdf_writer.add_page(page)
            trial_size_mb = get_pdf_size(pdf_writer)
        except Exception as e:  # per-file boundary: report and keep going
            print(f"Error adding {pdf}: {e}")
            # The writer may hold some pages of the broken file; rebuild it
            # from the files known to be good.
            pdf_writer = _build_writer(part_paths)
            continue

        if part_paths and trial_size_mb >= size_limit_mb:
            # Adding this file would reach the limit: save the part without
            # it, then start the next part with this file. Writers are
            # rebuilt from the source files because pages cannot be removed
            # from a writer once added.
            _save_part(_build_writer(part_paths), output_file)
            part_number += 1
            output_file = f"{output_prefix}_part{part_number}.pdf"
            part_paths = []
            pdf_writer = _build_writer([pdf_path])

        part_paths.append(pdf_path)

    # Save the last part if it has any content
    if part_paths:
        _save_part(pdf_writer, output_file)
if __name__ == "__main__":
    # Command-line entry point. Every argument is optional and defaults to
    # the values that used to be hard-coded, so running the script without
    # arguments behaves as before.
    import argparse

    parser = argparse.ArgumentParser(
        description="Merge the PDF files of a directory into size-limited parts."
    )
    parser.add_argument(
        "input_directory",
        nargs="?",
        default="./jfk_records_2025",
        help="directory containing the PDFs to merge (default: %(default)s)",
    )
    parser.add_argument(
        "output_prefix",
        nargs="?",
        default="merged_output",
        help="prefix for the generated part files (default: %(default)s)",
    )
    parser.add_argument(
        "--size-limit-mb",
        type=float,
        default=500,
        help="size limit per output part in MB (default: %(default)s)",
    )
    args = parser.parse_args()

    merge_pdfs(
        args.input_directory,
        args.output_prefix,
        size_limit_mb=args.size_limit_mb,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment