Created
April 21, 2019 21:49
-
-
Save marcgeld/e6056f9b5e96d525c0bed218b42615fe to your computer and use it in GitHub Desktop.
Extract images from a pdf file to multipage TIFF (Tesseract-ocr accepts multipage TIFF, but not a pdf file with images as input)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env groovy | |
// Java 9 or later (…for the TIFF ImageIO Plugin) | |
@Grab(group='ch.qos.logback', module='logback-classic', version='1.2.3') | |
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.15') | |
@Grab(group='commons-io', module='commons-io', version='2.6') | |
import org.apache.pdfbox.pdfwriter.* | |
import org.apache.pdfbox.pdmodel.* | |
import org.apache.pdfbox.pdmodel.font.* | |
import org.apache.pdfbox.pdmodel.edit.* | |
import org.apache.pdfbox.pdmodel.graphics.* | |
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject | |
import java.awt.image.BufferedImage | |
import javax.imageio.IIOImage | |
import javax.imageio.ImageIO | |
import javax.imageio.stream.ImageOutputStream | |
import javax.imageio.ImageWriter | |
import javax.imageio.ImageWriteParam | |
import org.apache.pdfbox.cos.COSName | |
import java.nio.* | |
// Name of the generated script class; used in the CLI usage line.
def appName = getClass().getName()

// Mix the static helpers of commons-io FilenameUtils into String so that
// calls such as "some/path.pdf".removeExtension() become available.
String.metaClass.mixin(org.apache.commons.io.FilenameUtils)

// JVM-wide colour-management settings, applied before any image is decoded.
// NOTE(review): presumably chosen to speed up colour conversion in PDFBox;
// confirm that the KCMS provider class exists on the target JDK.
[
    'sun.java2d.cmm'                                       : 'sun.java2d.cmm.kcms.KcmsServiceProvider',
    'org.apache.pdfbox.rendering.UsePureJavaCMYKConversion': 'true'
].each { String key, String value ->
    System.setProperty(key, value)
}
// Command line definition: exactly one of -f/--file or -d/--dir must be given.
def cli = new CliBuilder(
    usage: "${appName} [<options>]",
    header: 'Options:',
    footer: 'Use with file OR dir path'
)
cli.with {
    f(longOpt: 'file', 'filepath', args: 1, required: false)
    d(longOpt: 'dir', 'path to directory', args: 1, required: false)
    h(longOpt: 'help', 'Print help', required: false)
}

// cli.parse returns null when the arguments cannot be parsed.
def opt = cli.parse(args)
if (!opt || opt.h) {
    cli.usage()
    return
}

// An option that was not supplied evaluates to false; a supplied one to its value.
boolean hasFile = opt.f
boolean hasDir = opt.d
if (hasFile == hasDir) {
    // Both given, or neither given: there is no unambiguous input to process.
    cli.usage()
    return
}
// Absolute paths of the PDF files to process: every *.pdf below the given
// directory (-d), or the single file named with -f.
def found = []
if (opt.d) {
    def currentDir = new File(opt.d).getAbsoluteFile()
    if (!currentDir.isDirectory()) {
        // traverse() throws on a missing or non-directory path; report it instead.
        println "Error: directory ${currentDir} not found or not a directory"
        return
    }
    // nameFilter must match the whole file name: escape the dot so only a real
    // ".pdf" extension qualifies, and ignore case so ".PDF" is picked up too.
    currentDir.traverse(type: groovy.io.FileType.FILES, nameFilter: ~/(?i).*\.pdf/) { f ->
        found << f.getAbsoluteFile()
    }
} else {
    found << new File(opt.f).getAbsoluteFile()
}
// For each input PDF, write every image XObject found in the page resources
// into one multipage TIFF placed next to the PDF (same path, ".tiff" extension).
found.each { f ->
    if (f.exists() && f.canRead()) {
        println "Processing: file ${f}"
    } else {
        println "Error: file ${f} not found or not readble"
        return  // skip this file, continue with the next one
    }

    def outFile = "${f.getPath().removeExtension()}.tiff"

    // Fail with a clear message when no TIFF ImageIO plugin is installed.
    Iterator<ImageWriter> tiffWriters = ImageIO.getImageWritersByFormatName("TIFF")
    if (!tiffWriters.hasNext()) {
        throw new IllegalStateException("No ImageIO writer available for format TIFF (Java 9 or later required)")
    }

    // Load the PDF before creating the output file so that an unreadable PDF
    // does not leave an empty .tiff behind. Each withCloseable closes its
    // resource even when the body throws.
    PDDocument.load(f).withCloseable { PDDocument doc ->
        new FileOutputStream(outFile).withCloseable { FileOutputStream fileStream ->
            // Closing the ImageOutputStream flushes its cache to fileStream but
            // does not close fileStream itself, hence the separate outer scope.
            ImageIO.createImageOutputStream(fileStream).withCloseable { ImageOutputStream outputStream ->
                ImageWriter writer = tiffWriters.next()
                try {
                    writer.setOutput(outputStream)
                    ImageWriteParam params = writer.getDefaultWriteParam()
                    params.setCompressionMode(ImageWriteParam.MODE_EXPLICIT)
                    // Compression: None, PackBits, ZLib, Deflate, LZW, JPEG and CCITT variants allowed
                    params.setCompressionType("Deflate")
                    writer.prepareWriteSequence(null)

                    for (PDPage page : doc.getDocumentCatalog().getPages()) {
                        PDResources pdResources = page.getResources()
                        if (pdResources == null) {
                            continue  // page without a resource dictionary: nothing to extract
                        }
                        for (COSName xObjCosName : pdResources.getXObjectNames()) {
                            PDXObject pdxObj = pdResources.getXObject(xObjCosName)
                            if (pdxObj instanceof PDImageXObject) {
                                BufferedImage bufferedImage = ((PDImageXObject) pdxObj).getImage()
                                writer.writeToSequence(new IIOImage(bufferedImage, null, null), params)
                            }
                        }
                    }
                    writer.endWriteSequence()
                } finally {
                    writer.dispose()
                }
            }
        }
    }

    // Reached only when everything above succeeded and the streams are closed.
    println("Created outfile: ${outFile}")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment