Skip to content

Instantly share code, notes, and snippets.

@thekid
Last active June 22, 2025 11:54
Show Gist options
  • Save thekid/4222de1b7a72863aeca4255960852d50 to your computer and use it in GitHub Desktop.
Save thekid/4222de1b7a72863aeca4255960852d50 to your computer and use it in GitHub Desktop.
PDF to Markdown
<?php
use com\adobe\pdf\{PdfReader, Content, Ref, Tokens, CharacterMap};
use io\streams\FileInputStream;
use lang\FormatException;
use util\Objects;
use util\cmd\Console;
// Open the PDF named by the first command line argument. A missing argument
// as well as any failure raised while constructing the reader ends the script
// with exit code 2.
if (!isset($argv[1])) {
  Console::$err->writeLine('Missing argument: path to PDF file');
  return 2;
}
try {
  $reader= new PdfReader(new FileInputStream($argv[1]));
} catch (Throwable $e) {
  Console::writeLine($argv[1], ': ', $e);
  return 2;
}
// Read everything the reader yields. Entries of kind "object" go into a lookup
// table keyed by the hash code of their id, entries of kind "trailer" are
// merged into a single trailer. One dot per entry is written to standard
// error as a progress indicator.
$objects= [];
$trailer= [];
Console::$err->write('Parsing ', $argv[1], ' [');
foreach ($reader->objects() as $type => $entry) {
  Console::$err->write('.');
  if ('trailer' === $type) {

    // Array union: keys already present keep their existing value
    $trailer+= $entry;
    continue;
  }
  if ('object' === $type) {
    $key= $entry['id']->hashCode();
    $objects[$key]= $entry['dict'];
  }
}
Console::$err->writeLine(']');
// Handle files without `trailer` entry, e.g. linearized PDFs: the first object
// whose Type is XRef takes the trailer's place.
if (!$trailer) {
  foreach ($objects as $candidate) {
    if ('XRef' !== ($candidate['Type'] ?? null)) continue;

    $trailer= $candidate;
    break;
  }
}
// Handle encryption: dump the trailer, then give up with exit code 1 if it
// carries an Encrypt entry.
Console::writeLine('Trailer: ', $trailer);
if (isset($trailer['Encrypt'])) {

  // The entry may be a reference into the object table or a direct value;
  // only references need to be looked up.
  $encrypt= $trailer['Encrypt'];
  if ($encrypt instanceof Ref) {
    $encrypt= $objects[$encrypt->hashCode()];
  }
  Console::writeLine('Cannot handle encrypted PDFs: ', $encrypt);
  return 1;
}
// Now, unpack object streams and add them to the lookup table. Iteration runs
// over the table as it was when the loop started, entries added here are not
// visited again.
foreach ($objects as $object) {
  if ('ObjStm' !== ($object['Type'] ?? null)) continue;

  $n= $object['N'];
  $tokens= new Tokens($object['$stream']);

  // The stream starts with N pairs of tokens: an object number followed by an
  // offset. Values are read sequentially below, so the offset is consumed but
  // not used.
  $numbers= [];
  for ($i= 0; $i < $n; $i++) {
    $numbers[]= $tokens->token()[1].'_0';
    $tokens->token();
  }

  // Keys have the form "<number>_0" - presumably the format produced by
  // Ref::hashCode() for generation 0; verify against the Ref class.
  foreach ($numbers as $key) {
    $objects[$key]= $tokens->value();
  }
}
// Print the Info object if the trailer references one, then look up and print
// the Root object and the Pages object it points to.
$info= $trailer['Info'] ?? null;
if ($info) {
  Console::writeLine('Info: ', $objects[$info->hashCode()]);
}

$root= $objects[$trailer['Root']->hashCode()];
Console::writeLine('Root: ', $root);

$pages= $objects[$root['Pages']->hashCode()];
Console::writeLine('Pages: ', $pages);
// Resolves an argument: references are looked up in the object table by their
// hash code, any other value is handed back untouched.
$objectOf= function($arg) use(&$objects) {
  if ($arg instanceof Ref) {
    return $objects[$arg->hashCode()];
  }
  return $arg;
};
// Walks the given list of page tree references recursively and yields one
// entry per page. The key is the page's position, nested positions being
// joined by dots (e.g. "2.1"). The value is the page dictionary with an added
// `$contents` entry holding the list of its content streams.
$contentsOf= function($kids, $suffix= '') use(&$contentsOf, &$objectOf, &$objects) {
  foreach ($kids as $i => $ref) {
    $page= $objects[$ref->hashCode()];
    $number= $suffix.($i + 1);

    // Intermediate node: descend, prefixing positions with this node's own
    if ('Pages' === $page['Type']) {
      yield from $contentsOf($page['Kids'], $number.'.');
      continue;
    }

    // Contents is optional - a page without it simply has no streams.
    // Otherwise it is a single content object or an array of references.
    $contents= $objectOf($page['Contents'] ?? []);
    if ($stream= $contents['$stream'] ?? null) {
      $streams= [$stream];
    } else {
      $streams= [];
      foreach ($contents as $part) {
        $streams[]= $objects[$part->hashCode()]['$stream'];
      }
    }
    yield $number => $page + ['$contents' => $streams];
  }
};
// Converts string bytes according to a font encoding. The encoding is either
// null (bytes are returned as-is), one of the names handled below, or any
// other value, in which case its `BaseEncoding` entry - if present - is used
// instead. The unicode map is only consulted for Identity-H.
//
// Throws a FormatException for Identity-H when no unicode map is given.
$stringOf= function($bytes, $encoding, $unicode= null) use(&$stringOf) {
  switch ($encoding) {
    case null: return $bytes;
    case 'MacRomanEncoding': return iconv('macintosh', \xp::ENCODING, $bytes);
    case 'WinAnsiEncoding': return iconv('cp1252', \xp::ENCODING, $bytes);
    case 'Identity-H':

      // Without a map there is nothing to translate with; fail with a
      // descriptive error instead of calling a method on null.
      if (null === $unicode) {
        throw new FormatException('Cannot handle Identity-H encoding without unicode');
      }
      return $unicode->translate($bytes);

    default:
      // FIXME: Respect differences
      return $stringOf($bytes, $encoding['BaseEncoding'] ?? null, $unicode);
  }
};
// Returns the text for a string token, given the current text format (an
// array with the selected `font` and its `size`). Tokens of kind "hex" are
// decoded to bytes first, all others are used verbatim.
//
// Throws a FormatException for type 3 fonts without unicode map and for
// unknown font subtypes.
$textOf= function($token, array $format) use(&$objectOf, &$stringOf) {
  $font= $format['font'];

  // TODO Fonts with DescendantFonts are not handled specially yet, see
  // https://github.com/adobe-type-tools/cmap-resources

  if ('hex' === $token[0]) {

    // PDF allows an odd number of hex digits, the missing final digit being
    // 0. hex2bin() rejects odd lengths, so pad. This is a no-op should the
    // tokenizer already normalize hex strings.
    $hex= $token[1];
    if (strlen($hex) % 2) {
      $hex.= '0';
    }
    $bytes= hex2bin($hex);
  } else {
    $bytes= $token[1];
  }

  $unicode= $font['$unicode'] ?? null;
  switch ($font['Subtype']) {
    case 'Type0': case 'Type1': case 'TrueType':
      return $stringOf($bytes, $objectOf($font['Encoding'] ?? null), $unicode);

    case 'Type3':
      if ($unicode) {
        return $unicode->translate($bytes);
      }
      throw new FormatException('Cannot handle type 3 fonts without unicode');

    default:
      throw new FormatException('Unknown font type '.$font['Subtype']);
  }
};
// Render all pages: print a header line per page, then walk the operations of
// its content streams, writing one paragraph per text object (BT ... ET) and
// a Markdown image placeholder for every image painted via `Do`.
foreach ($contentsOf($pages['Kids']) as $number => $page) {
  Console::writeLine('- Page #', $number, ' contents: ', Objects::stringOf($page['Contents'] ?? null, ' '));

  // A page without content streams has no operations to walk
  if (empty($page['$contents'])) continue;

  // The resources dictionary as well as each of its entries may be given
  // directly or as a reference, resolve all of them via $objectOf
  $resources= $objectOf($page['Resources'] ?? []);

  // Resolve fonts
  $fonts= [];
  foreach ($objectOf($resources['Font'] ?? []) as $id => $ref) {
    $resolved= $objectOf($ref);
    // Console::writeLine(' Font <', $id, '> := ', Objects::stringOf($resolved, ' '));
    if ($map= $resolved['ToUnicode'] ?? null) {
      $resolved['$unicode']= new CharacterMap($objects[$map->hashCode()]['$stream']);
    }
    $fonts[$id]= $resolved;
  }

  // Select no font for the beginning
  $font= null;

  // Resolve objects
  $xobjects= [];
  foreach ($objectOf($resources['XObject'] ?? []) as $id => $ref) {
    $xobjects[$id]= $objectOf($ref);
  }

  // DEBUG
  // foreach ($page['$contents'] as $i => $stream) {
  // file_put_contents('text-block.'.$number.'_'.$i, $stream->bytes());
  // }

  $paragraph= '';
  $content= new Content(...$page['$contents']);
  foreach ($content->operations() as $op => $arguments) {
    // DEBUG
    // Console::$err->writeLine("\e[2m", '[', $number, ' | ', $op, ']: ', $arguments, "\e[0m");
    if ('BT' === $op) {
      // NOOP
    } else if ('ET' === $op) {

      // End of text object: flush the paragraph collected so far
      Console::writeLine(' ', str_replace("\n", "\n ", $paragraph));
      Console::writeLine();
      $paragraph= '';
    } else if ('Tf' === $op) {

      // Font selection; names not found in the resources are kept as "@name"
      $id= $arguments[0][1];
      $size= $arguments[1][1];
      $font= ['font' => $fonts[$id] ?? '@'.$id, 'size' => $size];
    } else if ('Tj' === $op) {
      $paragraph.= $textOf($arguments[0], $font);
    } else if ('TJ' === $op) {

      // Only the string elements carry text, everything else is skipped
      foreach ($arguments as $argument) {
        if ('hex' === $argument[0] || 'string' === $argument[0]) {
          $paragraph.= $textOf($argument, $font);
        }
      }
    } else if ("'" === $op) {
      $paragraph.= "\n".$textOf($arguments[0], $font);
    } else if ('"' === $op) {

      // The string is the third operand
      $paragraph.= "\n".$textOf($arguments[2], $font);
    } else if ('TD' === $op) {
      $paragraph.= "\n";
    } else if ('Do' === $op) {

      // Names missing from the resources are ignored rather than dereferenced
      $id= $arguments[0][1];
      $xobject= $xobjects[$id] ?? null;
      if ('Image' === ($xobject['Subtype'] ?? null)) {
        Console::writeLine(' ', "![{$id}]({$xobject['Subtype']}:{$xobject['Width']}x{$xobject['Height']})\n");
      }
    }
  }
}
@thekid
Copy link
Author

thekid commented Jun 22, 2025

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment