Last active
June 10, 2025 15:20
-
-
Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
# Author: Yotam Gingold <[email protected]> | |
# License: CC0 | |
# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd> | |
## About | |
Converts a PDF file assumed to be a two-column ACM, CGF (Computer Graphics Forum), or CAG (Computers & Graphics) article to text; choose the layout with `--metrics`. Ignores reviewer red numbering.
## Install poppler for pdftotext, pdfinfo | |
brew install poppler | |
## Usage | |
By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout. | |
python3 sig2text.py file.pdf [out.txt] | |
or | |
uv run sig2text.py file.pdf [out.txt] | |
''' | |
# /// script | |
# requires-python = ">=3.11" | |
# dependencies = [ | |
# ] | |
# /// | |
import argparse | |
import subprocess | |
## Command-line interface: a required input PDF, an optional output path, the page
## layout whose crop boxes should be used, and switches for diagnostic output.
parser = argparse.ArgumentParser( description = 'Convert two-column ACM articles to text.' )
parser.add_argument( 'inpath', type = str, help = 'Path to input PDF file.' )
parser.add_argument(
    'outpath', type = str, nargs = '?', default = None,
    help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.'
)
parser.add_argument(
    '-m', '--metrics', type = str, choices = ['ACM', 'CGF', 'CAG'], default = 'ACM',
    help = 'Which bounding boxes to use. Choices: ACM, CGF (Computer Graphics Forum), CAG (Computers & Graphics). Default is ACM.'
)
## Help text fixed: it used to read "If specific".
parser.add_argument(
    '-p', '--print-skipped', action = 'store_true', default = False,
    help = 'If specified, print skipped text.'
)
parser.add_argument(
    '--no-skip', action = 'store_true', default = False,
    help = 'If specified, nothing will be skipped.'
)
parser.add_argument(
    '--print-saved', action = 'store_true', default = False,
    help = 'If specified, print saved text.'
)
args = parser.parse_args()

print( f"Using {args.metrics} metrics." )
## Crop boxes per layout, each as [ left, right, top, bottom ] with the origin at the
## top-left of the page. For every layout, column 1 is listed first, then column 2.
_boxes_by_layout = {
    'ACM': ( [ 43, 313, 75, 705 ], [ 313, 575, 75, 705 ] ),
    'CGF': ( [ 47, 292, 78, 726 ], [ 311, 557, 78, 726 ] ),
    'CAG': ( [ 30, 308, 65, 760 ], [ 310, 570, 65, 760 ] ),
}

## Regroup by column, so that metrics[ column_index ][ layout ] is that column's crop box.
metrics = [
    { layout: boxes[ column_index ] for layout, boxes in _boxes_by_layout.items() }
    for column_index in range( 2 )
]
## Announce which file is about to be processed.
print( "Loading:", args.inpath )
def get_pdf_page_count( pdf_path ):
    '''
    Return the number of pages of the PDF at `pdf_path`, as reported by `pdfinfo`.

    Raises `subprocess.CalledProcessError` if `pdfinfo` exits with a non-zero status
    (because of `check = True`), and `ValueError` if its output has no `Pages:` line
    or the value on that line is not an integer.
    '''
    completed = subprocess.run( [ 'pdfinfo', pdf_path ], capture_output = True, text = True, check = True )

    ## Collect the text following the first colon on every line starting with `Pages:`.
    page_fields = [
        row.split( ':' )[1]
        for row in completed.stdout.split( '\n' )
        if row.startswith( 'Pages:' )
    ]
    if not page_fields:
        raise ValueError( "Could not parse page number from pdfinfo output." )

    ## Only the first such line is used.
    return int( page_fields[0].strip() )
def get_subpage_text( pdf_path, page_index, left_right_top_bottom ):
    '''
    Return the text `pdftotext` extracts from one rectangular region of one page.

    Parameters:
        pdf_path: path to the PDF file.
        page_index: 0-based page index; `pdftotext` is given `page_index + 1` as both
            its first (`-f`) and last (`-l`) page.
        left_right_top_bottom: crop box as ( left, right, top, bottom ). It is passed
            to `pdftotext` as an origin (`-x`, `-y`) plus a width and height (`-W`, `-H`).

    Returns:
        The captured standard output of `pdftotext`, as a string.

    Raises `subprocess.CalledProcessError` if `pdftotext` exits with a non-zero status.
    '''
    left, right, top, bottom = left_right_top_bottom[:4]
    page_number = page_index + 1

    command = [
        'pdftotext',
        '-nopgbrk',
        ## Request UTF-8 explicitly so the bytes always match the decoding used below.
        '-enc', 'UTF-8',
        '-f', page_number,
        '-l', page_number,
        '-x', left,
        '-W', right - left,
        '-y', top,
        '-H', bottom - top,
        pdf_path,
        ## `-` as the output file sends the text to stdout.
        '-',
    ]

    result = subprocess.run(
        [ str( argument ) for argument in command ],
        capture_output = True,
        ## Decode as UTF-8 instead of the locale's preferred encoding (what `text = True`
        ## would use); a non-UTF-8 locale would otherwise garble or reject the output.
        encoding = 'utf-8',
        check = True
    )
    return result.stdout
## Extract every page column by column and collect the text in that order.
## NOTE(review): nesting was reconstructed from the code's logic; confirm against the original.
parts = []
for page_index in range( get_pdf_page_count( args.inpath ) ):
    ## Column 1, Column 2
    for subpage_metrics in metrics:
        # crop: left, right, top, bottom
        crop = list( subpage_metrics[ args.metrics ] )
        column = get_subpage_text( args.inpath, page_index, crop )

        ## On the first page of an ACM article, drop everything from the permission
        ## notice to the end of the column's text. `--no-skip` disables this, and the
        ## dropped text is only shown with `--print-skipped`; both flags used to be
        ## parsed but ignored here.
        if not args.no_skip and args.metrics == 'ACM' and page_index == 0:
            notice_start = column.find( 'Permission to make digital or hard copies' )
            if notice_start != -1:
                if args.print_skipped:
                    print( "Dropping ACM copyright text:" )
                    print( column[ notice_start: ] )
                column = column[ :notice_start ]

        ## `--print-saved` was likewise accepted but never acted upon.
        if args.print_saved:
            print( column )

        parts.append( column )
## Join the per-column texts with a newline between them.
text_body = "\n".join( parts )

## Default destination: the input path with its suffix replaced by `.txt`.
if args.outpath is None:
    from pathlib import Path
    args.outpath = Path( args.inpath ).with_suffix( '.txt' )

if args.outpath == '-':
    print( text_body )
else:
    ## The extracted text may contain non-ASCII characters, so write UTF-8 explicitly
    ## rather than relying on the platform's default encoding.
    with open( args.outpath, 'w', encoding = 'utf-8' ) as f:
        f.write( text_body )
    print( "Saved:", args.outpath )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
# Author: Yotam Gingold <[email protected]> | |
# License: CC0 | |
# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd> | |
## About | |
Converts a PDF file assumed to be a two-column ACM, CGF (Computer Graphics Forum), or CAG (Computers & Graphics) article to text; choose the layout with `--metrics`. Ignores reviewer red numbering.
## Install (unless using uv) | |
pip install pypdf==4.0.2 | |
To remove hyphenation: | |
pip install spacy==3.8.7 | |
python -m spacy download en_core_web_sm | |
## Usage | |
By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout. | |
python3 sig2text.py file.pdf [out.txt] | |
or | |
uv run sig2text.py file.pdf [out.txt] | |
''' | |
# /// script | |
# requires-python = ">=3.11" | |
# dependencies = [ | |
# "pypdf==4.0.2", | |
# "spacy==3.8.7", | |
# "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", | |
# ] | |
# /// | |
import argparse | |
## Command-line interface: a required input PDF, an optional output path, the page
## layout whose crop boxes should be used, and switches for diagnostic output.
parser = argparse.ArgumentParser( description = 'Convert two-column ACM articles to text.' )
parser.add_argument( 'inpath', type = str, help = 'Path to input PDF file.' )
parser.add_argument(
    'outpath', type = str, nargs = '?', default = None,
    help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.'
)
parser.add_argument(
    '-m', '--metrics', type = str, choices = ['ACM', 'CGF', 'CAG'], default = 'ACM',
    help = 'Which bounding boxes to use. Choices: ACM, CGF (Computer Graphics Forum), CAG (Computers & Graphics). Default is ACM.'
)
## Help text fixed: it used to read "If specific".
parser.add_argument(
    '-p', '--print-skipped', action = 'store_true', default = False,
    help = 'If specified, print skipped text.'
)
parser.add_argument(
    '--no-skip', action = 'store_true', default = False,
    help = 'If specified, nothing will be skipped.'
)
parser.add_argument(
    '--print-saved', action = 'store_true', default = False,
    help = 'If specified, print saved text.'
)
args = parser.parse_args()

## pypdf is imported only after argument parsing, as in the original statement order.
from pypdf import PdfReader

print( "Loading:", args.inpath )
reader = PdfReader( args.inpath )

print( f"Using {args.metrics} metrics." )
## Crop boxes per layout, each as [ left, right, top, bottom ] with the origin at the
## top-left of the page. For every layout, column 1 is listed first, then column 2.
_boxes_by_layout = {
    'ACM': ( [ 43, 313, 75, 705 ], [ 313, 575, 75, 705 ] ),
    'CGF': ( [ 47, 292, 78, 726 ], [ 311, 557, 78, 726 ] ),
    'CAG': ( [ 30, 308, 65, 760 ], [ 310, 570, 65, 760 ] ),
}

## Regroup by column, so that metrics[ column_index ][ layout ] is that column's crop box.
metrics = [
    { layout: boxes[ column_index ] for layout, boxes in _boxes_by_layout.items() }
    for column_index in range( 2 )
]
def print_skipped( x, y, text ):
    '''
    Report a text fragment that is being left out of the output.

    Prints only when `--print-skipped` was given and `text` is not blank. `x` and `y`
    are shown exactly as passed in; `text` is shown with surrounding whitespace stripped.
    '''
    if not args.print_skipped:
        return

    stripped = text.strip()
    if stripped:
        print( f"Skipping text at ( {x}, {y} ): {stripped}" )
import math

## Font sizes of the small stand-alone numbers to drop, per layout.
_SOLO_NUMBER_FONT_SIZE = { 'CGF': 5.3798, 'CAG': 4.7322 }

def _is_small_solo_number( text, font_size ):
    ## True when `text` is only digits set in the current layout's small-number font size.
    ## The size is compared with a tolerance instead of `==`, so a value that differs
    ## from the literal by float rounding still matches.
    expected = _SOLO_NUMBER_FONT_SIZE.get( args.metrics )
    if expected is None or not isinstance( font_size, ( int, float ) ):
        return False
    return text.strip().isdigit() and math.isclose( font_size, expected, abs_tol = 1e-3 )

## Walk every page once per column and keep the text fragments that fall inside that
## column's crop box, in the order pypdf visits them.
## NOTE(review): nesting was reconstructed from the code's logic; confirm against the original.
parts = []
for page_index, page in enumerate( reader.pages ):
    def visit_crop( text, user_matrix, tm_matrix, font_dict, font_size ):
        '''
        Visitor passed to `page.extract_text()`. Appends ( x, y, text ) to `column`
        unless one of the filters below rejects the fragment. Reads `page`,
        `page_index`, `crop` and `column` from the enclosing loops at call time.
        '''
        x, y = tm_matrix[4:6]
        # y is from the bottom, so flip it
        y = page.mediabox[3] - y
        # Now y is from the top.

        if not args.no_skip:
            ## Keep only what's inside the crop box
            ## NOTE(review): both edges are inclusive, and the ACM boxes share x = 313,
            ## so a fragment exactly on that edge would land in both columns — confirm.
            if x < crop[0] or x > crop[1] or y < crop[2] or y > crop[3]:
                print_skipped( x, y, text )
                return

            ## Check for the noise on the first page
            if page_index == 0 and text.startswith( "Permission to make digital or hard copies" ):
                ## Move this column's bottom edge up to the notice, so fragments visited
                ## afterwards that lie below it are cropped away as well.
                crop[3] = y
                print_skipped( x, y, text )
                return

            ## Skip small solo numbers in CGF and CAG
            if _is_small_solo_number( text, font_size ):
                print_skipped( x, y, text )
                return

        if args.print_saved:
            print( f"{x}, {y}: {text.strip()}" )

        column.append( ( x, y, text ) )

    ## Column 1, Column 2
    for subpage_metrics in metrics:
        # crop: left, right, top, bottom
        ## Copied, because visit_crop may modify crop[3].
        crop = list( subpage_metrics[ args.metrics ] )
        column = []
        page.extract_text( visitor_text = visit_crop )

        ## Fragments stay in visiting order: sorting them by position made things worse.
        parts.extend( text for _x, _y, text in column )
        ## Terminate the column with a newline by extending the last fragment collected.
        if parts:
            parts[-1] += '\n'
## CAG fragments are joined with a newline between them; the other layouts are
## concatenated directly.
fragment_separator = "\n" if args.metrics == 'CAG' else ""
text_body = fragment_separator.join( parts )
## Remove hyphenation
## Best effort: only attempted when spaCy and its small English model can be loaded.
REMOVE_HYPHENATION = False
try:
    import spacy
    # Load the English NLP model
    nlp = spacy.load( "en_core_web_sm" )
except Exception:
    ## spaCy or the model is unavailable, so the text is left hyphenated. Unlike the
    ## bare `except:` this replaces, Ctrl-C and SystemExit are no longer swallowed.
    pass
else:
    # Let's remove hyphenation!
    REMOVE_HYPHENATION = True

if REMOVE_HYPHENATION:
    import re

    def is_english( word ):
        '''Return whether the spaCy document built from `word` reports its language as "en".'''
        doc = nlp( word )
        # Check if the language of the word is English
        ## NOTE(review): `Doc.lang_` looks like the pipeline's language rather than a
        ## per-word detection, so this is presumably always True with en_core_web_sm,
        ## in which case every match below gets joined. Confirm.
        return doc.lang_ == "en"

    ## A word fragment, a hyphen, one or more newlines, then another word fragment.
    pattern = re.compile( r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)' )

    def replace_if_english( match ):
        '''Join the two halves (with one newline in front) if accepted; else keep the match as-is.'''
        dehyphen = '\n' + match.group(1) + match.group(2)
        return dehyphen if is_english( dehyphen ) else match.group(0)

    original_text_body = text_body
    text_body = pattern.sub( replace_if_english, original_text_body )
## Default destination: the input path with its suffix replaced by `.txt`.
if args.outpath is None:
    from pathlib import Path
    args.outpath = Path( args.inpath ).with_suffix( '.txt' )

if args.outpath == '-':
    print( text_body )
else:
    ## The extracted text may contain non-ASCII characters, so write UTF-8 explicitly
    ## rather than relying on the platform's default encoding.
    with open( args.outpath, 'w', encoding = 'utf-8' ) as f:
        f.write( text_body )
    print( "Saved:", args.outpath )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment