Skip to content

Instantly share code, notes, and snippets.

@yig
Last active March 25, 2025 14:49
Show Gist options
  • Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.
'''
# Author: Yotam Gingold <[email protected]>
# License: CC0
# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>
## About
Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.
## Install
pip install pypdf==4.0.2
To remove hyphenation:
pip install spacy==3.7.4
python -m spacy download en_core_web_sm
## Usage
By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.
python3 sig2text.py file.pdf [out.txt]
'''
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "pypdf==4.0.2",
# "spacy==3.7.4",
# ]
# ///
import argparse
## Command-line interface: a required input PDF, an optional output path,
## the page layout to assume, and a switch for echoing discarded text.
parser = argparse.ArgumentParser(
    description='Convert two-column ACM articles to text.',
)
parser.add_argument(
    'inpath',
    type=str,
    help='Path to input PDF file.',
)
parser.add_argument(
    'outpath',
    type=str,
    nargs='?',
    default=None,
    help='Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.',
)
parser.add_argument(
    '-m', '--metrics',
    type=str,
    choices=['ACM', 'CGF'],
    default='ACM',
    help='Which bounding boxes to use. Choices: ACM, CGF. Default is ACM.',
)
parser.add_argument(
    '-p', '--print-skipped',
    action='store_true',
    default=False,
    help='Whether to print skipped text.',
)
# Parsed once at start-up; the rest of the script reads its settings from `args`.
args = parser.parse_args()
import sys

from pypdf import PdfReader

## Open the input PDF
# When the extracted text itself is written to stdout (outpath is `-`), send
# the progress messages to stderr so that piped output holds only the text.
status_stream = sys.stderr if args.outpath == '-' else sys.stdout
print("Loading:", args.inpath, file=status_stream)
reader = PdfReader(args.inpath)
print(f"Using {args.metrics} metrics.", file=status_stream)
## Crop boxes for the two text columns, keyed by layout name.
## Each box is [ left, right, top, bottom ]; `metrics[0]` is column 1 and
## `metrics[1]` is column 2.
metrics = [
    ## Column 1
    dict(
        ACM=[43, 313, 75, 705],
        CGF=[47, 292, 78, 726],
    ),
    ## Column 2
    dict(
        ACM=[313, 575, 75, 705],
        CGF=[311, 557, 78, 726],
    ),
]
def print_skipped(text):
    """Report a discarded text fragment when `--print-skipped` is enabled.

    The fragment is printed with surrounding whitespace stripped, prefixed by
    "Skipping text:". Fragments that are empty after stripping are not
    reported.

    When the extracted text is being written to stdout (`outpath` is `-`),
    the report goes to stderr so it does not get mixed into the output text;
    otherwise it goes to stdout.
    """
    import sys

    if not args.print_skipped:
        return

    stripped = text.strip()
    if not stripped:
        return

    stream = sys.stderr if args.outpath == '-' else sys.stdout
    print("Skipping text:", stripped, file=stream)
## Text fragments kept so far, in the order the visitor received them.
parts = []

for page_index, page in enumerate(reader.pages):

    def visit_crop(text, user_matrix, tm_matrix, font_dict, font_size):
        """Visitor for `page.extract_text`: keep `text` only if it is body text.

        Reads the current `crop` box, `page` and `page_index` from the
        enclosing script. Kept fragments are appended to `parts`; rejected
        fragments are passed to `print_skipped`.
        """
        left, right, top, bottom = crop

        x = tm_matrix[4]
        # The matrix's y is measured from the bottom, so flip it
        y = page.mediabox[3] - tm_matrix[5]

        ## Drop anything that starts outside the crop box
        outside = x < left or x > right or y < top or y > bottom
        if outside:
            print_skipped(text)
            return

        ## Noise on the first page: the permission notice. Raising the bottom
        ## edge of the crop box to this fragment also rejects whatever lies
        ## below it for the rest of this column.
        is_permission_notice = text.startswith("Permission to make digital or hard copies")
        if page_index == 0 and is_permission_notice:
            crop[3] = y
            print_skipped(text)
            return

        ## Skip small solo numbers in CGF
        # NOTE: the font size is matched by exact float equality.
        if args.metrics == 'CGF' and text.strip().isdigit() and font_size == 5.3798:
            print_skipped(text)
            return

        parts.append(text)

    ## Column 1, then Column 2
    for column_index in (0, 1):
        # left, right, top, bottom (a copy, because `visit_crop` may modify it)
        crop = list(metrics[column_index][args.metrics])
        page.extract_text(visitor_text=visit_crop)
        # Terminate the column with a newline
        if parts:
            parts[-1] += '\n'

text_body = "".join(parts)
## Remove hyphenation
# Best effort: this step only runs when spaCy and its English model load.
REMOVE_HYPHENATION = False
try:
    import spacy
    # Load the English NLP model
    nlp = spacy.load("en_core_web_sm")
except Exception as error:
    # Keep going without de-hyphenation, but say why instead of failing
    # silently. `Exception` (rather than a bare `except`) lets Ctrl-C and
    # SystemExit propagate.
    import sys
    print("Not removing hyphenation (could not load spaCy model en_core_web_sm):", error, file=sys.stderr)
else:
    # Let's remove hyphenation!
    REMOVE_HYPHENATION = True

if REMOVE_HYPHENATION:
    import re

    def is_english(word):
        """Return True if spaCy reports the language of `word` as English."""
        doc = nlp(word)
        # Check if the language of the word is English
        # NOTE(review): `Doc.lang_` appears to be the language of the loaded
        # pipeline's vocabulary, so with an English model this is presumably
        # always "en" and every candidate is accepted -- confirm.
        return doc.lang_ == "en"

    # A word fragment, a hyphen, one or more newlines, then another fragment.
    pattern = re.compile(r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)')

    def replace_if_english(match):
        """Return the replacement text for one hyphenated line break.

        The two fragments are joined without the hyphen and the newline is
        moved in front of the joined word. If `is_english` rejects the joined
        word, the matched text is returned unchanged.
        """
        dehyphen = '\n' + match.group(1) + match.group(2)
        return dehyphen if is_english(dehyphen) else match.group(0)

    # Keep the text as extracted, before de-hyphenation.
    original_text_body = text_body
    text_body = pattern.sub(replace_if_english, original_text_body)
## Write the result
if args.outpath is None:
    from pathlib import Path
    # Default: the PDF's own path with its suffix replaced by `.txt`.
    args.outpath = Path(args.inpath).with_suffix('.txt')

if args.outpath == '-':
    print(text_body)
else:
    # Write UTF-8 explicitly: the extracted text can contain non-ASCII
    # characters, and the platform's default encoding may be unable to
    # represent them.
    with open(args.outpath, 'w', encoding='utf-8') as f:
        f.write(text_body)
    print("Saved:", args.outpath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment