Last active
December 21, 2018 08:30
-
-
Save IceDcap/179fb06aa990123e09766acf14985b15 to your computer and use it in GitHub Desktop.
Using PyPDF2 to merge two single-page PDFs side by side into one new page, and to merge (append) PDFs one by one.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
#a program that uses pyPdf to add an outline--basically, an
#interactive table of contents--to a given PDF file.
#usage is:
# addindex.py thefile.pdf toc.json
# the file is not written in-place; a copy is written to "output1.pdf"
# the file toc.json--which can be named anything--is a simple json
# file consisting of one object or hash with the outline titles as
# keys and the 0-indexed page numbers as values. An example is given
# below.
# {
# "Chapter 1" : 0,
# "Chapter 2" : 20
# }
#at the moment, this only supports single-level outlines; the PDF spec
#supports multi-level outlines, which are feasible under the current
#framework, and would honestly just take a little bit more work.
#Relevant parts of the PDF spec to consult (and really, the thing's
#free and actually fairly clear reading, so don't be bashful) are
#12.3.2 (Destinations) and 12.3.3 (Document Outline), as well as 7.7.2
#(Document Catalog). Note that pyPdf apparently only supports named
#destinations, a complication unnecessary for outlines, which can use
#simple explicit destinations.
# Copyright (c)2012 Tyson Burghardt. Licensed under GPLv3.
import pyPdf | |
import pyPdf.pdf as PDF | |
import sys | |
import json | |
def name(s):
    """Wrap the string @s in a pdf NameObject, adding the leading
    slash that PDF names carry."""
    slashed = "/" + s
    return PDF.NameObject(slashed)
def addOutline(pdfw, outline_dict):
    """Given a PdfFileWriter @pdfw, adds a single-level outline defined
    by the outline dictionary @outline_dict.

    @outline_dict maps outline titles to 0-indexed page numbers. The
    entries are emitted in ascending page order; titles that point at
    the same page are all kept and ordered by title. An empty
    dictionary leaves @pdfw untouched, since an outline root needs at
    least one item for its First/Last entries.

    Returns None; @pdfw is modified in place (new objects are added and
    the document catalog gains an /Outlines entry).
    """
    # Sort the (title, page) pairs directly instead of inverting the
    # dict: inversion silently drops titles that share a page, which
    # would leave /Count and /Last out of step with the real items.
    entries = sorted(outline_dict.items(), key=lambda kv: (kv[1], kv[0]))
    if not entries:
        return

    # The object ids are needed before the objects exist, because the
    # items refer to each other and to the root. _addObject hands out
    # ids sequentially, so the next id is len(_objects) + 1.
    first_id = len(pdfw._objects) + 1
    idorefs = [PDF.IndirectObject(first_id + offset, 0, pdfw)
               for offset in range(len(entries) + 1)]
    root_ref = idorefs[0]
    item_refs = idorefs[1:]

    # build outline dictionary (the root of the outline tree)
    ol = PDF.DictionaryObject()
    ol.update({name("Type"): name("Outlines"),
               name("First"): item_refs[0],
               name("Last"): item_refs[-1],
               name("Count"): PDF.NumberObject(len(entries))})

    # build outline items, chained to their neighbours via Prev/Next
    olitems = []
    last_ix = len(entries) - 1
    for ix, (title, page) in enumerate(entries):
        oli = PDF.DictionaryObject()
        oli.update({name("Title"): PDF.TextStringObject(title),
                    name("Parent"): root_ref,
                    name("Dest"): makeDest(pdfw, page)})
        if ix < last_ix:
            oli.update({name("Next"): item_refs[ix + 1]})
        if ix > 0:
            oli.update({name("Prev"): item_refs[ix - 1]})
        olitems.append(oli)

    # now add the objects in exactly the order the ids were reserved in
    pdfw._addObject(ol)
    for oli in olitems:
        pdfw._addObject(oli)

    # lastly, point the catalog at the new outline root
    pdfw._root.getObject().update({name("Outlines"): root_ref})
def makeDest(pdfw, pg):
    """Build an explicit destination (see S12.3.2 of the PDF spec)
    for page number @pg of the PdfFileWriter @pdfw.

    The result is an array holding the indirect ref of the page
    followed by the name XYZ.

    NOTE(review): the spec's XYZ form also lists left/top/zoom
    specifiers; this code deliberately emits none of them (they used
    to be appended as nulls). Confirm the target viewers accept the
    short form before relying on it.
    """
    page_ref = pdfw.getPage(pg).indirectRef
    dest = PDF.ArrayObject()
    dest.extend((page_ref, name("XYZ")))
    return dest
def main():
    """Command-line entry point: addindex.py thefile.pdf toc.json

    Copies every page of the PDF named by the first argument into a
    new writer, adds the outline described by the JSON file named by
    the second argument, and writes the result to "output1.pdf" in the
    current directory. Exits with a usage message when an argument is
    missing.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: addindex.py thefile.pdf toc.json")

    w = pyPdf.PdfFileWriter()
    # PdfFileWriter needs the original PDF (that the PdfFileReader it
    # depends on depends on) still open at the time of writing. If
    # it's closed, you get a Value I/O error. So the write happens
    # inside the same `with` that holds the input open; the `with`
    # blocks also guarantee every handle is closed if anything raises.
    with open(sys.argv[1], "rb") as f:
        r = pyPdf.PdfFileReader(f)
        for page in r.pages:
            w.addPage(page)
        with open(sys.argv[2], "r") as g:
            oldict = json.load(g)
        addOutline(w, oldict)
        with open("output1.pdf", "wb") as outputFile:
            w.write(outputFile)


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PyPDF2 | |
def PDFmerge(pdfs, output):
    """Append the PDF files in @pdfs, in order, and write the combined
    document to @output.

    @pdfs is an iterable of paths to existing PDF files; @output is the
    path of the file to create (overwritten if present). Returns None.
    """
    pdfMerger = PyPDF2.PdfFileMerger()
    try:
        # Hand the merger the path rather than a short-lived file
        # object: it then opens and owns each input itself, so nothing
        # it may still need is closed before the final write.
        for pdf in pdfs:
            pdfMerger.append(pdf)
        # writing combined pdf to output pdf file
        with open(output, 'wb') as f:
            pdfMerger.write(f)
    finally:
        # Releases every input the merger opened, even on failure.
        pdfMerger.close()
def main():
    """Merge the fixed set of input files into 'merge_file.pdf'.

    The inputs are, in order: '0000.pdf', 'out.pdf', 'out1.pdf'
    through 'out13.pdf', and finally '0029.pdf'.
    """
    numbered = ['out%d.pdf' % idx for idx in range(1, 14)]
    pdfs = ['0000.pdf', 'out.pdf'] + numbered + ['0029.pdf']
    PDFmerge(pdfs=pdfs, output='merge_file.pdf')


if __name__ == "__main__":
    # run the merge only when executed as a script
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyPDF2 import PdfFileReader, PdfFileWriter | |
from PyPDF2.pdf import PageObject | |
# document: https://pythonhosted.org/PyPDF2/
# Places page 0 of "0000.pdf" (left) and page 0 of "0001.pdf" (right)
# side by side on one new page and writes it to 'outc.pdf'.
#
# Both inputs are opened in a `with` so they are always closed, and they
# stay open until the output has been written, because the merged page
# still refers to objects that are read from the source files.
with open("0000.pdf", 'rb') as invoice_file, open("0001.pdf", 'rb') as sup_file:
    reader = PdfFileReader(invoice_file)
    invoice_page = reader.getPage(0)  # the page number of first pdf
    sup_reader = PdfFileReader(sup_file)
    sup_page = sup_reader.getPage(0)  # the page number of second pdf

    invoice_width = invoice_page.mediaBox.getWidth()
    # Use the taller of the two pages so neither one is clipped.
    page_height = max(invoice_page.mediaBox.getHeight(),
                      sup_page.mediaBox.getHeight())

    # create blank page & merge second pdf in right side
    translated_page = PageObject.createBlankPage(
        None, sup_page.mediaBox.getWidth() + invoice_width, page_height)
    translated_page.mergeScaledTranslatedPage(sup_page, 1, invoice_width, 0)
    # merge first pdf into this blank page
    translated_page.mergePage(invoice_page)

    # create page to writer
    writer = PdfFileWriter()
    writer.addPage(translated_page)
    with open('outc.pdf', 'wb') as f:
        writer.write(f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment