yig · June 10, 2025 15:20
diff --git a/sig2text-poppler.py b/sig2text-poppler.py
 '''
 # Author: Yotam Gingold <[email protected]>
 # License: CC0
 # URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>

 ## About

 Converts a PDF file assumed to be a two-column ACM article to text. Ignores reviewer red numbering.

 ## Install poppler for pdftotext, pdfinfo

    brew install poppler

 ## Usage

 By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.

    python3 sig2text-poppler.py file.pdf [out.txt]

 or

    uv run sig2text-poppler.py file.pdf [out.txt]
 '''

 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 # ]
 # ///

 import argparse
 import subprocess

 # Command-line interface.
 # NOTE: --print-skipped, --no-skip and --print-saved are accepted here but are
 # not read anywhere else in this poppler-based script.
 parser = argparse.ArgumentParser( description = 'Convert two-column ACM articles to text.' )
 parser.add_argument( 'inpath', type = str, help = 'Path to input PDF file.' )
 parser.add_argument( 'outpath', type = str, nargs = '?', default = None, help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.' )
 parser.add_argument( '-m', '--metrics', type = str, choices = ['ACM', 'CGF', 'CAG'], default = 'ACM', help = 'Which bounding boxes to use. Choices: ACM, CGF (Computer Graphics Forum), CAG (Computers & Graphics). Default is ACM.' )
 parser.add_argument( '-p', '--print-skipped', action = 'store_true', default = False, help = 'If specified, print skipped text.' )
 parser.add_argument( '--no-skip', action = 'store_true', default = False, help = 'If specified, nothing will be skipped.' )
 parser.add_argument( '--print-saved', action = 'store_true', default = False, help = 'If specified, print saved text.' )

 args = parser.parse_args()

 print( f"Using {args.metrics} metrics." )

 # Per-column crop boxes, keyed by the value of --metrics.
 # The list holds one dict per column, in reading order.
 metrics = [
    ## Column 1
    # left, right, top, bottom. Coordinates measured with origin at top-left.
    {
        'ACM': [ 43, 313, 75, 705 ],
        'CGF': [ 47, 292, 78, 726 ],
        'CAG': [ 30, 308, 65, 760 ],
    },
    ## Column 2
    # left, right, top, bottom. Coordinates measured with origin at top-left.
    {
        'ACM': [ 313, 575, 75, 705 ],
        'CGF': [ 311, 557, 78, 726 ],
        'CAG': [ 310, 570, 65, 760 ],
    }
    ]

 print( "Loading:", args.inpath )
 def get_pdf_page_count( pdf_path ):
    """Return the page count that the `pdfinfo` command reports for `pdf_path`.

    Raises subprocess.CalledProcessError when pdfinfo exits with a non-zero
    status, and ValueError when its output has no line starting with 'Pages:'.
    """
    report = subprocess.run( [ 'pdfinfo', pdf_path ], capture_output = True, text = True, check = True ).stdout
    
    # Only the first 'Pages:' line is considered.
    candidates = ( row for row in report.split('\n') if row.startswith('Pages:') )
    pages_row = next( candidates, None )
    if pages_row is None:
        raise ValueError( "Could not parse page number from pdfinfo output." )
    
    # The count is the field right after the first colon.
    return int( pages_row.split(':')[1].strip() )

 def get_subpage_text( pdf_path, page_index, left_right_top_bottom ):
    """Return the text `pdftotext` extracts from one rectangle of one page.

    pdf_path: path to the PDF file.
    page_index: zero-based page index; pdftotext is given page_index + 1.
    left_right_top_bottom: crop box as [ left, right, top, bottom ]. It is
        passed to pdftotext as an origin ( -x, -y ) plus a size ( -W, -H ).

    Raises subprocess.CalledProcessError when pdftotext exits with a non-zero
    status.
    """
    left, right, top, bottom = left_right_top_bottom[:4]
    page_number = page_index + 1
    
    command = [ str(x) for x in [
        'pdftotext',
        '-nopgbrk',
        # Ask for UTF-8 explicitly so the decoding below always matches.
        '-enc', 'UTF-8',
        '-f', page_number,
        '-l', page_number,
        '-x', left,
        '-W', right - left,
        '-y', top,
        '-H', bottom - top,
        pdf_path,
        '-'
        ] ]
    
    # Decode as UTF-8 rather than the locale's preferred encoding (which is
    # what text = True alone would use), so non-ASCII text survives everywhere.
    result = subprocess.run(
        command,
        capture_output = True,
        encoding = 'utf-8',
        check = True
    )
    
    return result.stdout

 # Extract every page column by column, in reading order.
 parts = []
 for page_index in range( get_pdf_page_count( args.inpath ) ):
    
    ## Column 1, Column 2
    for subpage_metrics in metrics:
        # crop: left, right, top, bottom
        crop = list(subpage_metrics[args.metrics])
        
        column = get_subpage_text( args.inpath, page_index, crop )
        
        # On the first page with ACM metrics, cut the column at the permission notice.
        if args.metrics == 'ACM' and page_index == 0:
            notice_start = column.find( 'Permission to make digital or hard copies' )
            if notice_start != -1:
                print( "Dropping ACM copyright text:" )
                print( column[ notice_start: ] )
                column = column[ :notice_start ]
        
        parts.append( column )

 text_body = "\n".join( parts )

 # Default output path: the input path with its suffix replaced by .txt.
 if args.outpath is None:
    from pathlib import Path
    args.outpath = Path(args.inpath).with_suffix( '.txt' )

 if args.outpath == '-':
    print( text_body )
 else:
    # Write UTF-8 explicitly; the extracted text is not limited to ASCII and
    # the platform default encoding may be unable to represent it.
    with open( args.outpath, 'w', encoding = 'utf-8' ) as f:
        f.write( text_body )
    print( "Saved:", args.outpath )
diff --git a/sig2text.py b/sig2text.py
 '''
 # Author: Yotam Gingold <[email protected]>
 # License: CC0
 # URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>

 ## About

 Converts a PDF file assumed to be a two-column ACM article to text. Ignores reviewer red numbering.

 ## Install (unless using uv)

    pip install pypdf==4.0.2

 To remove hyphenation:

    pip install spacy==3.8.7
    python -m spacy download en_core_web_sm

 ## Usage

 By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.

    python3 sig2text.py file.pdf [out.txt]

 or

    uv run sig2text.py file.pdf [out.txt]
 '''

 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #     "pypdf==4.0.2",
 #     "spacy==3.8.7",
 #     "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
 # ]
 # ///

 import argparse
 # Command-line interface.
 parser = argparse.ArgumentParser( description = 'Convert two-column ACM articles to text.' )
 parser.add_argument( 'inpath', type = str, help = 'Path to input PDF file.' )
 parser.add_argument( 'outpath', type = str, nargs = '?', default = None, help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.' )
 parser.add_argument( '-m', '--metrics', type = str, choices = ['ACM', 'CGF', 'CAG'], default = 'ACM', help = 'Which bounding boxes to use. Choices: ACM, CGF (Computer Graphics Forum), CAG (Computers & Graphics). Default is ACM.' )
 parser.add_argument( '-p', '--print-skipped', action = 'store_true', default = False, help = 'If specified, print skipped text.' )
 parser.add_argument( '--no-skip', action = 'store_true', default = False, help = 'If specified, nothing will be skipped.' )
 parser.add_argument( '--print-saved', action = 'store_true', default = False, help = 'If specified, print saved text.' )

 args = parser.parse_args()

 # Imported after argument parsing, so pypdf is not needed for --help or usage errors.
 from pypdf import PdfReader
 print( "Loading:", args.inpath )
 reader = PdfReader( args.inpath )

 print( f"Using {args.metrics} metrics." )

 # Per-column crop boxes, keyed by the value of --metrics.
 # The list holds one dict per column, in reading order.
 metrics = [
    ## Column 1
    # left, right, top, bottom. Coordinates measured with origin at top-left.
    {
        'ACM': [ 43, 313, 75, 705 ],
        'CGF': [ 47, 292, 78, 726 ],
        'CAG': [ 30, 308, 65, 760 ],
    },
    ## Column 2
    # left, right, top, bottom. Coordinates measured with origin at top-left.
    {
        'ACM': [ 313, 575, 75, 705 ],
        'CGF': [ 311, 557, 78, 726 ],
        'CAG': [ 310, 570, 65, 760 ],
    }
    ]

 def print_skipped( x, y, text ):
    """Print a dropped fragment and its position when --print-skipped was given.

    Fragments that are empty after stripping whitespace are not reported.
    """
    if not args.print_skipped:
        return
    fragment = text.strip()
    if fragment:
        print( f"Skipping text at ( {x}, {y} ): {fragment}" )

 # Text fragments kept from every page and column, in the order they were visited.
 parts = []
 for page_index, page in enumerate( reader.pages ):
    
    def visit_crop( text, user_matrix, tm_matrix, font_dict, font_size ):
        """Visitor passed to `page.extract_text`: keep or skip one text fragment.
        
        Reads `page`, `page_index`, `crop` and `column` from the enclosing
        scope at call time, so it acts on whichever column is currently being
        extracted. Kept fragments are appended to `column` as `( x, y, text )`.
        `user_matrix` and `font_dict` are not used.
        """
        x, y = tm_matrix[4:6]
        # y is from the bottom, so flip it
        y = page.mediabox[3] - y
        # Now y is from the top.
        
        if not args.no_skip:
            
            ## Keep only what's inside the crop box
            if x < crop[0] or x > crop[1] or y < crop[2] or y > crop[3]:
                print_skipped( x, y, text )
                return
            
            ## Check for the noise on the first page
            if page_index == 0 and text.startswith( "Permission to make digital or hard copies" ):
                # Move the bottom of the crop box up to this fragment so that
                # later fragments below it fail the crop test above. `crop` is
                # rebuilt for every column, so this only affects the current one.
                crop[3] = y
                print_skipped( x, y, text )
                return
            
            ## Skip small solo numbers in CGF
            # NOTE(review): 5.3798 is presumably the font size of the numbers to
            # drop in CGF PDFs — confirm. The comparison is an exact float match.
            if args.metrics in ('CGF',) and text.strip().isdigit() and font_size == 5.3798:
                print_skipped( x, y, text )
                return
            
            ## Skip small solo numbers in CAG
            # NOTE(review): 4.7322 is presumably the equivalent font size for
            # CAG PDFs — confirm. The comparison is an exact float match.
            if args.metrics in ('CAG',) and text.strip().isdigit() and font_size == 4.7322:
                print_skipped( x, y, text )
                return
        
        if args.print_saved:
            print( f"{x}, {y}: {text.strip()}" )
        
        column.append( ( x, y, text ) )
    
    ## Column 1, Column 2
    # The whole page is visited once per column; `crop` selects what is kept.
    for subpage_metrics in metrics:
        # crop: left, right, top, bottom
        # Copied into a fresh list because visit_crop may modify crop[3].
        crop = list(subpage_metrics[args.metrics])
        column = []
        page.extract_text( visitor_text = visit_crop )
        ## Sorting makes things worse
        # column.sort( key = lambda x_y_text: ( x_y_text[1], x_y_text[0] ) ) # sort by y and then x
        parts.extend( [ text for x, y, text in column ] )
        # End the column with a newline by appending it to the most recent
        # fragment (one from an earlier column if this column kept nothing).
        if len( parts ) > 0: parts[-1] += '\n'

 # Computers & Graphics fragments are joined with newlines; all other metrics
 # are concatenated directly.
 text_body = ( "\n" if args.metrics == 'CAG' else "" ).join( parts )

 ## Remove hyphenation
 # Optional step: it only runs when spaCy and its small English model load.
 REMOVE_HYPHENATION = False
 try:
    import spacy
    # Load the English NLP model
    nlp = spacy.load("en_core_web_sm")
 except Exception as error:
    # Still best effort, but KeyboardInterrupt/SystemExit are no longer
    # swallowed and the reason for skipping the step is reported.
    import sys
    print( f"Not removing hyphenation ({error}).", file = sys.stderr )
 else:
    # Let's remove hyphenation!
    REMOVE_HYPHENATION = True

 if REMOVE_HYPHENATION:
    def is_english(word):
        """Return whether spaCy labels the text `word` as English."""
        doc = nlp(word)
        # Check if the language of the word is English
        # NOTE(review): `doc.lang_` looks like it reports the language of the
        # loaded pipeline rather than detecting the language of `word`, which
        # would make this always true with an English model — confirm.
        return doc.lang_ == "en"
    
    import re
    # A word fragment, a hyphen, one or more newlines, then another word fragment.
    pattern = re.compile(r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)')
    
    def replace_if_english( match ):
        """Return the joined word on a new line, or the original match if rejected."""
        # The line break is moved in front of the rejoined word.
        dehyphen = '\n' + match.group(1) + match.group(2)
        return dehyphen if is_english( dehyphen ) else match.group(0)
    
    original_text_body = text_body
    text_body = pattern.sub( replace_if_english, original_text_body )

 # Default output path: the input path with its suffix replaced by .txt.
 if args.outpath is None:
    from pathlib import Path
    args.outpath = Path(args.inpath).with_suffix( '.txt' )

 if args.outpath == '-':
    print( text_body )
 else:
    # Write UTF-8 explicitly; the extracted text is not limited to ASCII and
    # the platform default encoding may be unable to represent it.
    with open( args.outpath, 'w', encoding = 'utf-8' ) as f:
        f.write( text_body )
    print( "Saved:", args.outpath )
	'''
	# Author: Yotam Gingold <[email protected]>
	# License: CC0
	# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>

	## About

	Converts a PDF file assumed to be a two-column ACM article to text. Ignores reviewer red numbering.

	## Install poppler for pdftotext, pdfinfo

	brew install poppler

	## Usage

	By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.

	python3 sig2text.py file.pdf [out.txt]

	or

	uv run sig2text.py file.pdf [out.txt]
	'''

	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# ]
	# ///

	import argparse
	import subprocess

	# Command-line interface.
	# NOTE: --print-skipped, --no-skip and --print-saved are accepted here but are
	# not read anywhere else in this poppler-based script.
	parser = argparse.ArgumentParser( description = 'Convert two-column ACM articles to text.' )
	parser.add_argument( 'inpath', type = str, help = 'Path to input PDF file.' )
	parser.add_argument( 'outpath', type = str, nargs = '?', default = None, help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.' )
	parser.add_argument( '-m', '--metrics', type = str, choices = ['ACM', 'CGF', 'CAG'], default = 'ACM', help = 'Which bounding boxes to use. Choices: ACM, CGF (Computer Graphics Forum), CAG (Computers & Graphics). Default is ACM.' )
	parser.add_argument( '-p', '--print-skipped', action = 'store_true', default = False, help = 'If specified, print skipped text.' )
	parser.add_argument( '--no-skip', action = 'store_true', default = False, help = 'If specified, nothing will be skipped.' )
	parser.add_argument( '--print-saved', action = 'store_true', default = False, help = 'If specified, print saved text.' )

	args = parser.parse_args()

	print( f"Using {args.metrics} metrics." )

	# Per-column crop boxes, keyed by the value of --metrics.
	# The list holds one dict per column, in reading order.
	metrics = [
	    ## Column 1
	    # left, right, top, bottom. Coordinates measured with origin at top-left.
	    {
	        'ACM': [ 43, 313, 75, 705 ],
	        'CGF': [ 47, 292, 78, 726 ],
	        'CAG': [ 30, 308, 65, 760 ],
	    },
	    ## Column 2
	    # left, right, top, bottom. Coordinates measured with origin at top-left.
	    {
	        'ACM': [ 313, 575, 75, 705 ],
	        'CGF': [ 311, 557, 78, 726 ],
	        'CAG': [ 310, 570, 65, 760 ],
	    }
	]

	print( "Loading:", args.inpath )
	def get_pdf_page_count( pdf_path ):
	    """Return the page count that the `pdfinfo` command reports for `pdf_path`.

	    Raises subprocess.CalledProcessError when pdfinfo exits with a non-zero
	    status, and ValueError when its output has no line starting with 'Pages:'.
	    """
	    # Run pdfinfo command
	    result = subprocess.run( [ 'pdfinfo', pdf_path ], capture_output = True, text = True, check = True )

	    # Parse the output to find the Pages line
	    for line in result.stdout.split('\n'):
	        if line.startswith('Pages:'):
	            # Extract the number after "Pages:"
	            pages = int( line.split(':')[1].strip() )
	            return pages

	    raise ValueError( "Could not parse page number from pdfinfo output." )

	def get_subpage_text( pdf_path, page_index, left_right_top_bottom ):
	    """Return the text `pdftotext` extracts from one rectangle of one page.

	    pdf_path: path to the PDF file.
	    page_index: zero-based page index; pdftotext is given page_index + 1.
	    left_right_top_bottom: crop box as [ left, right, top, bottom ]. It is
	        passed to pdftotext as an origin ( -x, -y ) plus a size ( -W, -H ).

	    Raises subprocess.CalledProcessError when pdftotext exits with a non-zero
	    status.
	    """
	    # Run pdftotext command
	    result = subprocess.run(
	        [str(x) for x in [
	            'pdftotext',
	            '-nopgbrk',
	            # Ask for UTF-8 explicitly so the decoding below always matches.
	            '-enc', 'UTF-8',
	            '-f', page_index+1,
	            '-l', page_index+1,
	            '-x', left_right_top_bottom[0],
	            '-W', left_right_top_bottom[1] - left_right_top_bottom[0],
	            '-y', left_right_top_bottom[2],
	            '-H', left_right_top_bottom[3] - left_right_top_bottom[2],
	            pdf_path,
	            '-'
	        ]],
	        capture_output = True,
	        # Decode as UTF-8 rather than the locale's preferred encoding.
	        encoding = 'utf-8',
	        check = True
	    )

	    return result.stdout

	# Extract every page column by column, in reading order.
	parts = []
	for page_index in range( get_pdf_page_count( args.inpath ) ):

	    ## Column 1, Column 2
	    for subpage_metrics in metrics:
	        # crop: left, right, top, bottom
	        crop = list(subpage_metrics[args.metrics])

	        column = get_subpage_text( args.inpath, page_index, crop )

	        # On the first page with ACM metrics, cut the column at the permission notice.
	        if args.metrics == 'ACM' and page_index == 0:
	            notice_start = column.find( 'Permission to make digital or hard copies' )
	            if notice_start != -1:
	                print( "Dropping ACM copyright text:" )
	                print( column[ notice_start: ] )
	                column = column[ :notice_start ]

	        parts.append( column )

	text_body = "\n".join( parts )

	# Default output path: the input path with its suffix replaced by .txt.
	if args.outpath is None:
	    from pathlib import Path
	    args.outpath = Path(args.inpath).with_suffix( '.txt' )

	if args.outpath == '-':
	    print( text_body )
	else:
	    # Write UTF-8 explicitly; the extracted text is not limited to ASCII and
	    # the platform default encoding may be unable to represent it.
	    with open( args.outpath, 'w', encoding = 'utf-8' ) as f:
	        f.write( text_body )
	    print( "Saved:", args.outpath )