Last active
June 22, 2025 11:54
-
-
Save thekid/4222de1b7a72863aeca4255960852d50 to your computer and use it in GitHub Desktop.
PDF to Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use com\adobe\pdf\{PdfReader, Content, Ref, Tokens, CharacterMap}; | |
use io\streams\FileInputStream; | |
use lang\FormatException; | |
use util\Objects; | |
use util\cmd\Console; | |
// Open the PDF named by the first command line argument. The script returns 2
// when the argument is missing or when the reader cannot be created.
if (!isset($argv[1])) {
  Console::$err->writeLine('Usage: ', $argv[0] ?? 'pdf-to-markdown', ' <file.pdf>');
  return 2;
}

try {
  $reader= new PdfReader(new FileInputStream($argv[1]));
} catch (Throwable $e) {
  Console::writeLine($argv[1], ': ', $e);
  return 2;
}
// Read everything the parser yields: objects go into a lookup table keyed by
// the hash code of their ID, trailer dictionaries are merged into one (keys
// already present are kept). One dot per parsed item is written to STDERR.
$objects= [];
$trailer= [];
$progress= Console::$err;
$progress->write('Parsing ', $argv[1], ' [');
foreach ($reader->objects() as $kind => $value) {
  $progress->write('.');
  if ('trailer' === $kind) {
    $trailer+= $value;
    continue;
  }
  if ('object' === $kind) {
    $key= $value['id']->hashCode();
    $objects[$key]= $value['dict'];
  }
}
$progress->writeLine(']');
// Files without a `trailer` entry (e.g. linearized PDFs): use the first object
// whose `Type` is `XRef` as trailer instead.
if (!$trailer) {
  foreach ($objects as $candidate) {
    if ('XRef' !== ($candidate['Type'] ?? null)) continue;

    $trailer= $candidate;
    break;
  }
}
// Show the trailer, then bail out with return code 1 if it references an
// encryption dictionary - encrypted documents are not supported.
Console::writeLine('Trailer: ', $trailer);
if (isset($trailer['Encrypt'])) {
  $key= $trailer['Encrypt']->hashCode();
  Console::writeLine('Cannot handle encrypted PDFs: ', $objects[$key]);
  return 1;
}
// Unpack object streams and add the objects they contain to the lookup table.
// Iterating by value works on a snapshot, so entries added here are not
// visited by this loop themselves.
foreach ($objects as $container) {
  if ('ObjStm' !== ($container['Type'] ?? null)) continue;

  $count= $container['N'];
  $tokens= new Tokens($container['$stream']);

  // The stream starts with `N` pairs of tokens. The first of each pair becomes
  // the lookup key (suffixed with `_0`, presumably matching the format of the
  // ID's hashCode() - verify against Ref). The second is read and discarded.
  $keys= [];
  for ($i= 0; $i < $count; $i++) {
    $keys[]= $tokens->token()[1].'_0';
    $tokens->token();
  }

  // The values follow in the same order as the pairs above
  foreach ($keys as $key) {
    $objects[$key]= $tokens->value();
  }
}
// Print the document information dictionary, if the trailer references one
if ($info= ($trailer['Info'] ?? null)) {
  Console::writeLine('Info: ', $objects[$info->hashCode()]);
}

// Without a `Root` reference there is no catalog and thus no page tree to
// walk. This also covers the case that no trailer could be found at all.
if (!isset($trailer['Root'])) {
  Console::writeLine('Cannot locate document catalog, trailer has no Root: ', $trailer);
  return 1;
}

// Resolve the catalog, then the root of the page tree it points to
$root= $objects[$trailer['Root']->hashCode()];
Console::writeLine('Root: ', $root);
$pages= $objects[$root['Pages']->hashCode()];
Console::writeLine('Pages: ', $pages);
// Resolves a value: references are looked up in the objects table by their
// hash code, anything else is handed back unchanged.
$objectOf= function($arg) use(&$objects) {
  if ($arg instanceof Ref) {
    return $objects[$arg->hashCode()];
  }
  return $arg;
};
// Walks a list of page tree references and yields one entry per page.
//
// - $kids: list of references to `Pages` nodes or pages
// - $suffix: prepended to the 1-based position to form the page number, so
//   nested nodes yield numbers like "3.1"
//
// Yields page number => page dictionary, extended by a `$contents` entry
// holding the list of the page's content streams. Pages without a `Contents`
// entry yield an empty list.
$contentsOf= function($kids, $suffix= '') use(&$contentsOf, &$objectOf, &$objects) {
  foreach ($kids as $i => $ref) {
    $page= $objects[$ref->hashCode()];
    $number= $suffix.($i + 1);

    // Intermediate node: descend, using this node's number as prefix
    if ('Pages' === ($page['Type'] ?? null)) {
      yield from $contentsOf($page['Kids'], $number.'.');
      continue;
    }

    // `Contents` is optional. If present, it is either a single content
    // object or an array of references.
    $contents= $objectOf($page['Contents'] ?? null);
    if (null === $contents) {
      $streams= [];
    } else if ($stream= $contents['$stream'] ?? null) {
      $streams= [$stream];
    } else {
      $streams= [];
      foreach ($contents as $part) {
        $streams[]= $objects[$part->hashCode()]['$stream'];
      }
    }

    yield $number => $page + ['$contents' => $streams];
  }
};
// Converts the bytes of a string operand to text.
//
// - $bytes: raw string bytes
// - $encoding: null, an encoding name, or an encoding dictionary
// - $unicode: optional map with a translate() method, needed for `Identity-H`
//
// Returns the bytes as-is without encoding, converted via iconv() for
// `MacRomanEncoding` and `WinAnsiEncoding`, and translated by the map for
// `Identity-H`. Anything else is treated as a dictionary and handled via its
// `BaseEncoding` entry. Throws a FormatException for `Identity-H` without map.
$stringOf= function($bytes, $encoding, $unicode= null) use(&$stringOf) {

  // NOTE: switch compares loosely, so `case null` also matches '' and []
  switch ($encoding) {
    case null:
      return $bytes;

    case 'MacRomanEncoding':
      return iconv('macintosh', \xp::ENCODING, $bytes);

    case 'WinAnsiEncoding':
      return iconv('cp1252', \xp::ENCODING, $bytes);

    case 'Identity-H':
      if (null === $unicode) {
        throw new FormatException('Cannot handle Identity-H encoding without unicode');
      }
      return $unicode->translate($bytes);

    default:
      // FIXME: Respect differences
      return $stringOf($bytes, $encoding['BaseEncoding'] ?? null, $unicode);
  }
};
// Converts a string token to text using the given text state.
//
// - $token: pair of token kind and value; `hex` values are hex-decoded first
// - $format: text state with the selected font under the `font` key
//
// Throws a FormatException if the font could not be resolved, for Type3 fonts
// without unicode map, and for unknown font subtypes.
//
// TODO: Handle `DescendantFonts`, see
// https://github.com/adobe-type-tools/cmap-resources
$textOf= function($token, array $format) use(&$objectOf, &$stringOf) {
  $font= $format['font'];
  if (!is_array($font)) {
    throw new FormatException('Cannot resolve font '.$font);
  }

  if ('hex' === $token[0]) {

    // Hexadecimal strings may contain whitespace, which is ignored, and may
    // omit the final digit, which is then assumed to be 0. hex2bin() accepts
    // neither, so normalize before decoding.
    $hex= preg_replace('/\s+/', '', $token[1]);
    if (1 === strlen($hex) % 2) {
      $hex.= '0';
    }
    $bytes= hex2bin($hex);
  } else {
    $bytes= $token[1];
  }

  switch ($font['Subtype']) {
    case 'Type0': case 'Type1': case 'TrueType':
      return $stringOf($bytes, $objectOf($font['Encoding'] ?? null), $font['$unicode'] ?? null);

    case 'Type3':
      if ($map= $font['$unicode'] ?? null) {
        return $map->translate($bytes);
      }
      throw new FormatException('Cannot handle type 3 fonts without unicode');

    default:
      throw new FormatException('Unknown font type '.$font['Subtype']);
  }
};
// Walk all pages, printing the text of every text object (BT ... ET) as one
// indented paragraph. Images produce an empty line.
foreach ($contentsOf($pages['Kids']) as $number => $page) {
  Console::writeLine('- Page #', $number, ' contents: ', Objects::stringOf($page['Contents'] ?? null, ' '));

  // Both entries are optional, resolve the resources dictionary only once
  $resources= $objectOf($page['Resources'] ?? null);

  // Resolve fonts, attaching a character map if the font has a `ToUnicode` entry
  $fonts= [];
  if ($references= $resources['Font'] ?? null) {
    foreach ($objectOf($references) as $id => $ref) {
      $dict= $objectOf($ref);

      // Console::writeLine('  Font <', $id, '> := ', Objects::stringOf($dict, '  '));
      if ($unicode= $dict['ToUnicode'] ?? null) {
        $dict['$unicode']= new CharacterMap($objects[$unicode->hashCode()]['$stream']);
      }
      $fonts[$id]= $dict;
    }
  }

  // Resolve objects
  $xobjects= [];
  if ($references= $resources['XObject'] ?? null) {
    foreach ($objectOf($references) as $id => $ref) {
      $xobjects[$id]= $objectOf($ref);
    }
  }

  // DEBUG
  // foreach ($page['$contents'] as $i => $stream) {
  //   file_put_contents('text-block.'.$number.'_'.$i, $stream->bytes());
  // }

  // Select no font for the beginning
  $selected= null;
  $paragraph= '';
  $content= new Content(...$page['$contents']);
  foreach ($content->operations() as $op => $arguments) {

    // DEBUG
    // Console::$err->writeLine("\e[2m", '[', $number, ' | ', $op, ']: ', $arguments, "\e[0m");

    if ('BT' === $op) {
      // NOOP
    } else if ('ET' === $op) {

      // End of text object: flush what was collected, indenting continuation lines
      Console::writeLine(' ', str_replace("\n", "\n ", $paragraph));
      Console::writeLine();
      $paragraph= '';
    } else if ('Tf' === $op) {

      // Fonts missing from the page resources are kept as "@name" placeholder
      $id= $arguments[0][1];
      $selected= ['font' => $fonts[$id] ?? '@'.$id, 'size' => $arguments[1][1]];
    } else if ('Tj' === $op) {
      $paragraph.= $textOf($arguments[0], $selected);
    } else if ('TJ' === $op) {

      // Only the string operands are of interest, everything else is skipped
      foreach ($arguments as $argument) {
        if ('hex' === $argument[0] || 'string' === $argument[0]) {
          $paragraph.= $textOf($argument, $selected);
        }
      }
    } else if ("'" === $op) {
      $paragraph.= "\n".$textOf($arguments[0], $selected);
    } else if ('"' === $op) {

      // The string is the third operand
      $paragraph.= "\n".$textOf($arguments[2], $selected);
    } else if ('TD' === $op || 'T*' === $op) {

      // `T*` moves to the start of the next line, just like the `'` operator
      // handled above does before showing its string
      $paragraph.= "\n";
    } else if ('Do' === $op) {

      // Names not present in the page resources are ignored
      $xobject= $xobjects[$arguments[0][1]] ?? null;
      if ('Image' === ($xobject['Subtype'] ?? null)) {
        Console::writeLine(' ', "\n");
      }
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uses https://github.com/xp-forge/pdf-parser