Skip to content

Instantly share code, notes, and snippets.

@yig
Last active June 10, 2025 15:20
Show Gist options
  • Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.
'''
# Author: Yotam Gingold <[email protected]>
# License: CC0
# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>
## About
Converts a PDF file assumed to be a two-column ACM, CGF (Computer Graphics Forum), or CAG (Computers & Graphics) article to text; pick the layout with `-m`. Ignores reviewer red numbering.
## Install poppler for pdftotext, pdfinfo
brew install poppler
## Usage
By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.
python3 sig2text.py file.pdf [out.txt]
or
uv run sig2text.py file.pdf [out.txt]
'''
# /// script
# requires-python = ">=3.11"
# dependencies = [
# ]
# ///
import argparse
import subprocess
# Command-line interface: a required input PDF, an optional output path, and
# the name of the journal layout whose column bounding boxes should be used.
parser = argparse.ArgumentParser(description='Convert two-column ACM articles to text.')
parser.add_argument('inpath', type=str, help='Path to input PDF file.')
parser.add_argument(
    'outpath', type=str, nargs='?', default=None,
    help='Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.',
)
parser.add_argument(
    '-m', '--metrics', type=str, choices=['ACM', 'CGF', 'CAG'], default='ACM',
    help='Which bounding boxes to use. Choices: ACM, CGF (Computer Graphics Forum), CAG (Computers & Graphics). Default is ACM.',
)
# NOTE(review): the next three flags are accepted here, but the pdftotext-based
# extraction that follows does not appear to read them -- confirm whether they
# are kept only for command-line compatibility.
parser.add_argument(
    '-p', '--print-skipped', action='store_true', default=False,
    help='If specified, print skipped text.',
)
parser.add_argument(
    '--no-skip', action='store_true', default=False,
    help='If specified, nothing will be skipped.',
)
parser.add_argument(
    '--print-saved', action='store_true', default=False,
    help='If specified, print saved text.',
)
args = parser.parse_args()

print(f"Using {args.metrics} metrics.")
# Crop boxes for each text column, keyed by the `--metrics` layout name.
# Every box is [ left, right, top, bottom ], with coordinates measured from an
# origin at the top-left of the page.
_COLUMN_1_BOXES = {
    'ACM': [43, 313, 75, 705],
    'CGF': [47, 292, 78, 726],
    'CAG': [30, 308, 65, 760],
}
_COLUMN_2_BOXES = {
    'ACM': [313, 575, 75, 705],
    'CGF': [311, 557, 78, 726],
    'CAG': [310, 570, 65, 760],
}
# Listed in the order Column 1, Column 2.
metrics = [_COLUMN_1_BOXES, _COLUMN_2_BOXES]
# Announce which PDF is about to be processed.
print("Loading:", args.inpath)
def get_pdf_page_count( pdf_path ):
    """Return the number of pages in the PDF at `pdf_path`, as reported by `pdfinfo`.

    Args:
        pdf_path: Path to the PDF file; passed to `pdfinfo` as its only argument.

    Returns:
        The integer that follows `Pages:` in the `pdfinfo` output.

    Raises:
        subprocess.CalledProcessError: If `pdfinfo` exits with a non-zero status.
        ValueError: If no parsable `Pages:` line is found in the output.
    """
    result = subprocess.run(
        ['pdfinfo', pdf_path],
        capture_output=True,
        # pdfinfo writes UTF-8 by default, so decode it as such instead of
        # with the locale's encoding. Only the ASCII `Pages:` line is needed,
        # so undecodable metadata bytes are replaced rather than fatal.
        encoding='utf-8',
        errors='replace',
        check=True,
    )
    # Scan the `Key: value` lines for the page count.
    for line in result.stdout.split('\n'):
        if line.startswith('Pages:'):
            # The number is the field right after "Pages:".
            return int(line.split(':')[1].strip())
    raise ValueError("Could not parse page number from pdfinfo output.")
def get_subpage_text( pdf_path, page_index, left_right_top_bottom ):
    """Extract the text inside one rectangular region of one page using `pdftotext`.

    Args:
        pdf_path: Path to the PDF file.
        page_index: Zero-based page index; `pdftotext` is given `page_index + 1`
            as both its first (`-f`) and last (`-l`) page.
        left_right_top_bottom: Crop box as ( left, right, top, bottom ). It is
            converted to the x / width / y / height form `pdftotext` expects.

    Returns:
        The text `pdftotext` wrote to stdout for that region.

    Raises:
        subprocess.CalledProcessError: If `pdftotext` exits with a non-zero status.
    """
    left, right, top, bottom = left_right_top_bottom[:4]
    page_number = page_index + 1
    command = [
        'pdftotext',
        '-nopgbrk',
        '-f', page_number,
        '-l', page_number,
        '-x', left,
        '-W', right - left,
        '-y', top,
        '-H', bottom - top,
        pdf_path,
        '-',  # write the extracted text to stdout
    ]
    result = subprocess.run(
        [str(argument) for argument in command],
        capture_output=True,
        # pdftotext emits UTF-8 by default; decoding with the locale's
        # encoding would garble or reject non-ASCII text on other locales.
        encoding='utf-8',
        check=True,
    )
    return result.stdout
# Extract the text of every page, one column at a time (Column 1, then Column 2).
parts = []
for page_index in range(get_pdf_page_count(args.inpath)):
    for subpage_metrics in metrics:
        # crop: left, right, top, bottom
        crop = list(subpage_metrics[args.metrics])
        column = get_subpage_text(args.inpath, page_index, crop)
        if args.metrics == 'ACM' and page_index == 0:
            # On the first page of an ACM article, cut the column off where
            # the permission/copyright notice begins.
            notice_start = column.find('Permission to make digital or hard copies')
            if notice_start != -1:
                print("Dropping ACM copyright text:")
                print(column[notice_start:])
                column = column[:notice_start]
        parts.append(column)
text_body = "\n".join(parts)

# Default output path: the input path with its suffix replaced by `.txt`.
if args.outpath is None:
    from pathlib import Path
    args.outpath = Path(args.inpath).with_suffix('.txt')

if args.outpath == '-':
    print(text_body)
else:
    # Write UTF-8 explicitly: the extracted text can contain non-ASCII
    # characters that the platform's default encoding may be unable to encode.
    with open(args.outpath, 'w', encoding='utf-8') as f:
        f.write(text_body)
    print("Saved:", args.outpath)
'''
# Author: Yotam Gingold <[email protected]>
# License: CC0
# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>
## About
Converts a PDF file assumed to be a two-column ACM, CGF (Computer Graphics Forum), or CAG (Computers & Graphics) article to text; pick the layout with `-m`. Ignores reviewer red numbering.
## Install (unless using uv)
pip install pypdf==4.0.2
To remove hyphenation:
pip install spacy==3.8.7
python -m spacy download en_core_web_sm
## Usage
By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.
python3 sig2text.py file.pdf [out.txt]
or
uv run sig2text.py file.pdf [out.txt]
'''
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "pypdf==4.0.2",
# "spacy==3.8.7",
# "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
# ]
# ///
import argparse
# Command-line interface: a required input PDF, an optional output path, the
# journal layout to use, and flags controlling what is skipped or echoed.
parser = argparse.ArgumentParser(description='Convert two-column ACM articles to text.')
parser.add_argument('inpath', type=str, help='Path to input PDF file.')
parser.add_argument(
    'outpath', type=str, nargs='?', default=None,
    help='Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.',
)
parser.add_argument(
    '-m', '--metrics', type=str, choices=['ACM', 'CGF', 'CAG'], default='ACM',
    help='Which bounding boxes to use. Choices: ACM, CGF (Computer Graphics Forum), CAG (Computers & Graphics). Default is ACM.',
)
parser.add_argument(
    '-p', '--print-skipped', action='store_true', default=False,
    help='If specified, print skipped text.',
)
parser.add_argument(
    '--no-skip', action='store_true', default=False,
    help='If specified, nothing will be skipped.',
)
parser.add_argument(
    '--print-saved', action='store_true', default=False,
    help='If specified, print saved text.',
)
args = parser.parse_args()

# pypdf is imported only after the arguments have been parsed.
from pypdf import PdfReader

print("Loading:", args.inpath)
reader = PdfReader(args.inpath)

print(f"Using {args.metrics} metrics.")
# Crop boxes for each text column, keyed by the `--metrics` layout name.
# Every box is [ left, right, top, bottom ], with coordinates measured from an
# origin at the top-left of the page.
_COLUMN_1_BOXES = {
    'ACM': [43, 313, 75, 705],
    'CGF': [47, 292, 78, 726],
    'CAG': [30, 308, 65, 760],
}
_COLUMN_2_BOXES = {
    'ACM': [313, 575, 75, 705],
    'CGF': [311, 557, 78, 726],
    'CAG': [310, 570, 65, 760],
}
# Listed in the order Column 1, Column 2.
metrics = [_COLUMN_1_BOXES, _COLUMN_2_BOXES]
def print_skipped( x, y, text ):
    """Report a skipped text fragment and its position when `--print-skipped` is set.

    Fragments that are empty after stripping whitespace are never reported.
    """
    if not args.print_skipped:
        return
    stripped = text.strip()
    if len(stripped) > 0:
        print(f"Skipping text at ( {x}, {y} ): {stripped}")
# NOTE(review): the nesting below was reconstructed from a copy of this script
# whose indentation had been lost -- confirm that every skip check belongs
# under `not args.no_skip` and that the trailing newline is added per column.

# Font sizes at which a fragment consisting only of digits is dropped, per layout.
_SOLO_NUMBER_FONT_SIZES = {
    'CGF': 5.3798,
    'CAG': 4.7322,
}

# Text fragments kept so far, over all pages and columns, in visiting order.
parts = []
for page_index, page in enumerate(reader.pages):

    def visit_crop(text, user_matrix, tm_matrix, font_dict, font_size):
        """Visitor for `page.extract_text`: keep or skip one text fragment.

        Kept fragments are appended to the current `column` as ( x, y, text ),
        with y measured from the top of the page. Skipped fragments are passed
        to `print_skipped`.
        """
        x, y_from_bottom = tm_matrix[4:6]
        # The text matrix gives y from the bottom; the crop boxes use the top.
        y = page.mediabox[3] - y_from_bottom

        if not args.no_skip:
            left, right, top, bottom = crop
            if x < left or x > right or y < top or y > bottom:
                # Outside the crop box of the current column.
                skip = True
            elif page_index == 0 and text.startswith("Permission to make digital or hard copies"):
                # First-page notice: shrink the crop box so that anything
                # positioned below this fragment is rejected as well.
                crop[3] = y
                skip = True
            elif (
                args.metrics in _SOLO_NUMBER_FONT_SIZES
                and text.strip().isdigit()
                and font_size == _SOLO_NUMBER_FONT_SIZES[args.metrics]
            ):
                # A small number standing on its own (CGF and CAG only).
                # NOTE(review): this is an exact float comparison on font_size.
                skip = True
            else:
                skip = False

            if skip:
                print_skipped(x, y, text)
                return

        if args.print_saved:
            print(f"{x}, {y}: {text.strip()}")
        column.append((x, y, text))

    ## Column 1, Column 2
    for subpage_metrics in metrics:
        # Copy the box, because visit_crop may move its bottom edge.
        crop = list(subpage_metrics[args.metrics])
        column = []
        page.extract_text(visitor_text=visit_crop)
        # Fragments stay in visiting order: sorting them by ( y, x ) was tried
        # and made things worse.
        parts.extend(fragment for _, _, fragment in column)
        if parts:
            parts[-1] += '\n'

# CAG fragments are joined with newlines; the other layouts are concatenated.
joiner = "\n" if args.metrics in ('CAG',) else ""
text_body = joiner.join(parts)
## Remove hyphenation
# Optional step: it only runs when spaCy and its small English model can be loaded.
REMOVE_HYPHENATION = False
try:
    import spacy
    # Load the English NLP model
    nlp = spacy.load("en_core_web_sm")
except Exception as error:
    # Best effort: without spaCy (ImportError) or its model (OSError from
    # spacy.load) the text is left hyphenated. Catching Exception instead of
    # using a bare `except` lets Ctrl-C and SystemExit propagate, and the
    # reason is reported instead of being silently discarded.
    import sys
    print(f"Not removing hyphenation ({type(error).__name__}: {error})", file=sys.stderr)
else:
    # Let's remove hyphenation!
    REMOVE_HYPHENATION = True

if REMOVE_HYPHENATION:
    import re

    def is_english(word):
        """Return whether the spaCy document built from `word` is tagged as English.

        NOTE(review): `Doc.lang_` appears to report the language of the loaded
        pipeline rather than detect the language of the text, so with
        `en_core_web_sm` this presumably always returns True -- confirm.
        """
        doc = nlp(word)
        return doc.lang_ == "en"

    # A word broken across lines: letters, a hyphen, one or more newlines, letters.
    pattern = re.compile(r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)')

    def replace_if_english(match):
        """Rejoin a hyphen-split word, moving it wholly onto the next line.

        Returns the original matched text unchanged when `is_english` rejects
        the rejoined word.
        """
        dehyphen = '\n' + match.group(1) + match.group(2)
        return dehyphen if is_english(dehyphen) else match.group(0)

    original_text_body = text_body
    text_body = pattern.sub(replace_if_english, original_text_body)
# Default output path: the input path with its suffix replaced by `.txt`.
if args.outpath is None:
    from pathlib import Path
    args.outpath = Path(args.inpath).with_suffix('.txt')

if args.outpath == '-':
    # `-` means: print the text to stdout instead of saving it.
    print(text_body)
else:
    # Write UTF-8 explicitly: the extracted text can contain non-ASCII
    # characters that the platform's default encoding may be unable to encode.
    with open(args.outpath, 'w', encoding='utf-8') as f:
        f.write(text_body)
    print("Saved:", args.outpath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment