Last active
March 19, 2025 01:28
-
-
Save dijikul/ca27f4d8fbcecc12c595ffbd9df080e3 to your computer and use it in GitHub Desktop.
Merge Multiple PDFs into PDFs under 500 MB each (JFK File Research)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import PyPDF2 | |
def get_pdf_size(pdf_writer):
    """Return the serialized size of *pdf_writer* in megabytes.

    The writer is rendered into an in-memory buffer (nothing is written to
    disk) and the stream position after the write is converted from bytes
    to MB.
    """
    import io

    bytes_per_mb = 1024 * 1024

    buffer = io.BytesIO()
    pdf_writer.write(buffer)
    size_in_bytes = buffer.tell()
    return size_in_bytes / bytes_per_mb
def _write_part(pdf_writer, output_file):
    """Write the pages held by *pdf_writer* to *output_file* and report it."""
    with open(output_file, "wb") as output_pdf:
        pdf_writer.write(output_pdf)
    print(f"Created: {output_file}")


def merge_pdfs(input_directory, output_prefix, size_limit_mb=500):
    """Merge every PDF in *input_directory* into size-limited output parts.

    Files are taken in sorted name order and appended page by page.  After
    each page the size of the current part is measured with
    ``get_pdf_size``; once it reaches ``size_limit_mb`` the part is written
    to ``{output_prefix}_part{N}.pdf`` and a new, empty part is started.
    Because the check runs after the page has been added, a finished part
    is at or above the limit rather than strictly below it.

    Args:
        input_directory: Directory scanned (non-recursively) for files whose
            names end in ``.pdf`` (case-insensitive).
        output_prefix: Path prefix of the generated files; ``_part{N}.pdf``
            is appended, with N starting at 1.
        size_limit_mb: Size in MB at which the current part is closed.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # A missing or unreadable directory (including "") is reported the same
    # way as an empty one instead of surfacing as a traceback.
    try:
        entries = os.listdir(input_directory)
    except OSError as e:
        print(f"Cannot read directory {input_directory!r}: {e}")
        return

    pdf_files = sorted(f for f in entries if f.lower().endswith('.pdf'))
    if not pdf_files:
        print("No PDF files found in the directory.")
        return

    print(f"Merging {len(pdf_files)} PDF files...")

    part_number = 1
    pdf_writer = PyPDF2.PdfWriter()
    output_file = f"{output_prefix}_part{part_number}.pdf"

    for pdf in pdf_files:
        pdf_path = os.path.join(input_directory, pdf)
        # Best effort: a failure while handling one input is reported, the
        # rest of that file is skipped, and the merge moves on to the next.
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_path)
            for page in pdf_reader.pages:
                pdf_writer.add_page(page)
                # NOTE: get_pdf_size serializes the whole current part, so
                # this per-page check gets more expensive as a part grows.
                if get_pdf_size(pdf_writer) >= size_limit_mb:
                    # Save the current part and start a new one
                    _write_part(pdf_writer, output_file)
                    part_number += 1
                    output_file = f"{output_prefix}_part{part_number}.pdf"
                    pdf_writer = PyPDF2.PdfWriter()
        except Exception as e:
            print(f"Error adding {pdf}: {e}")

    # Save the last part if it has any content
    if pdf_writer.pages:
        _write_part(pdf_writer, output_file)
if __name__ == "__main__":
    # Script entry point: adjust the two settings below, then run the file.
    input_directory = "./jfk_records_2025"  # Folder holding the source PDFs
    output_prefix = "merged_output"  # Outputs become <prefix>_partN.pdf
    merge_pdfs(
        input_directory=input_directory,
        output_prefix=output_prefix,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment