-
-
Save pkvip9999/fc2fde7d909bf4c7eec3b6d0f11a39d6 to your computer and use it in GitHub Desktop.
Rename pdf files to their content's titles (for Python 3; with wildcard support; default rename all pdfs)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Extracts title from PDF files (Python 3). | |
Depends on: pdf, pyPDF2, PDFMiner3k, unidecode. | |
Usage: | |
pdftitle -d tmp --rename *.pdf{} | |
""" | |
from io import StringIO | |
import getopt, os, re, string, sys, glob, unidecode | |
from operator import index | |
from PyPDF2 import PdfFileReader | |
from PyPDF2.utils import PdfReadError | |
from pdfminer.pdfparser import PDFParser | |
from pdfminer.pdfdocument import PDFDocument | |
from pdfminer.pdfpage import PDFPage | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import PDFPageAggregator | |
from pdfminer.layout import LAParams, LTChar, LTText, LTFigure, LTTextBox, LTTextLine | |
from tfpdf import tfidf | |
import logging | |
__all__ = ['pdf_title'] | |
def make_parsing_state(*sequential, **named): | |
enums = dict(zip(sequential, range(len(sequential))), **named) | |
return type('ParsingState', (), enums) | |
def log(text): | |
if IS_LOG_ON: | |
print('-------- ' + text) | |
CHAR_PARSING_STATE = make_parsing_state('INIT_X', 'INIT_D', 'INSIDE_WORD') | |
IS_LOG_ON = False | |
ONE_CLICK_MODE = True | |
MIN_CHARS = 6 | |
MAX_WORDS = 30 | |
MIN_LONGEST_WORD = 4 | |
large_text = [] | |
def max_word_length(text): | |
return max(len(w) for w in text.split(' ')) | |
def sanitize(filename): | |
"""Turn string into a valid file name. | |
""" | |
# If the title was picked up from text, it may be too large. | |
# Preserve a certain number of words | |
words = filename.split(' ') | |
filename = ' '.join(words[0:MAX_WORDS]) | |
return filename | |
# Preserve letters with diacritics | |
# try: | |
# filename = unidecode.unidecode(filename.encode('utf-8').decode('utf-8')) | |
# except UnicodeDecodeError: | |
# print("*** Skipping invalid title decoding***") | |
# | |
# # Preserve subtitle separator | |
# filename = re.sub(r':', ' -', filename) | |
# | |
# valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) | |
# return "".join([c for c in filename if c in valid_chars]) | |
def meta_title(filename): | |
"""Title from pdf metadata. | |
""" | |
fp = open(filename, 'rb') | |
docinfo = PdfFileReader(fp).getDocumentInfo() | |
fp.close() | |
if docinfo is None: | |
return "" | |
return docinfo.title if docinfo.title else "" | |
def junk_line(line): | |
"""Judge if a line is not appropriate for a title. | |
""" | |
too_small = len(line.strip()) < MIN_CHARS | |
has_no_words = bool(re.search(r'^[0-9 \t-]+$|^(\(cid:[0-9 \t-]*\))+|^(abstract|unknown|title|untitled):?$', line.strip().lower())) | |
is_copyright_info = bool(re.search(r'technical\s+report|proceedings|preprint|to\s+appear|submission|(integrated|international).*conference|transactions\s+on|symposium\s+on|downloaded\s+from\s+http', line.lower())) | |
return too_small or has_no_words or is_copyright_info | |
def empty_str(s): | |
return len(s.strip()) == 0 | |
def update_largest_text(line, size, largest_text): | |
size = round(size, 2) | |
log('update size :' + str(size)) | |
log('largest_text size: ' + str(largest_text['size'])) | |
if not empty_str(line): | |
if (size > largest_text['size']): | |
largest_text = { | |
'contents': line, | |
'size': size | |
} | |
# Title spans multiple lines | |
elif (size == largest_text['size']): | |
largest_text['contents'] = largest_text['contents'] + line | |
if len(large_text) < 3: | |
large_text.append({ | |
'contents': line, | |
'size': size | |
}) | |
return largest_text | |
def extract_largest_text(obj, largest_text): | |
for child in obj: | |
if isinstance(child, LTTextLine): | |
log('lt_obj child line: ' + str(child)) | |
for child2 in child: | |
if isinstance(child2, LTChar): | |
largest_text = update_largest_text(child.get_text(), child2.size, largest_text) | |
break | |
elif isinstance(child, LTChar): | |
largest_text = update_largest_text(obj.get_text(), child.size, largest_text) | |
break | |
return largest_text | |
def extract_figure_text(lt_obj, largest_text): | |
""" | |
Extract text contained in a `LTFigure`. | |
Since text is encoded in `LTChar` elements, we detect separate lines | |
by keeping track of changes in font size. | |
""" | |
text = "" | |
line = "" | |
size = 0 | |
char_distance = 0 | |
char_previous_x1 = 0 | |
state = CHAR_PARSING_STATE.INIT_X | |
for child in lt_obj: | |
log('child: ' + str(child)) | |
# Ignore other elements | |
if not isinstance(child, LTChar): | |
continue | |
char_size = child.size | |
char_text = child.get_text() | |
# decoded_char_text = unidecode.unidecode(char_text.encode('utf-8').decode('utf-8')) | |
decoded_char_text = char_text | |
log('char: ' + str(char_size) + ' ' + str(decoded_char_text)) | |
# A new line was detected | |
if char_size != size: | |
log('new line') | |
largest_text = update_largest_text(line, size, largest_text) | |
text += line + '\n' | |
line = char_text | |
size = char_size | |
char_previous_x1 = child.x1 | |
state = CHAR_PARSING_STATE.INIT_D | |
# The same line | |
else: | |
# Spaces may not be present as `LTChar` elements, | |
# so we manually add them. | |
# NOTE: A word starting with lowercase can't be | |
# distinguished from the current word. | |
char_current_distance = abs(child.x0 - char_previous_x1) | |
log('char_current_distance: ' + str(char_current_distance)) | |
log('char_distance: ' + str(char_distance)) | |
log('state: ' + str(state)) | |
# Initialization | |
if state == CHAR_PARSING_STATE.INIT_X: | |
char_previous_x1 = child.x1 | |
state = CHAR_PARSING_STATE.INIT_D | |
elif state == CHAR_PARSING_STATE.INIT_D: | |
# Update distance only if no space is detected | |
if (char_distance > 0) and (char_current_distance < char_distance * 2.5): | |
char_distance = char_current_distance | |
if (char_distance < 0.1): | |
char_distance = 0.1 | |
state = CHAR_PARSING_STATE.INSIDE_WORD | |
# If the x-position decreased, then it's a new line | |
if (state == CHAR_PARSING_STATE.INSIDE_WORD) and (child.x1 < char_previous_x1): | |
log('x-position decreased') | |
line += ' ' | |
char_previous_x1 = child.x1 | |
state = CHAR_PARSING_STATE.INIT_D | |
# Large enough distance: it's a space | |
elif (state == CHAR_PARSING_STATE.INSIDE_WORD) and (char_current_distance > char_distance * 8.5): | |
log('space detected') | |
log('char_current_distance: ' + str(char_current_distance)) | |
log('char_distance: ' + str(char_distance)) | |
line += ' ' | |
char_previous_x1 = child.x1 | |
# When larger distance is detected between chars, use it to | |
# improve our heuristic | |
elif (state == CHAR_PARSING_STATE.INSIDE_WORD) and (char_current_distance > char_distance) and ( | |
char_current_distance < char_distance * 2.5): | |
char_distance = char_current_distance | |
char_previous_x1 = child.x1 | |
# Chars are sequential | |
else: | |
char_previous_x1 = child.x1 | |
line += child.get_text() | |
return largest_text, text | |
def pdf_text(filename): | |
fp = open(filename, 'rb') | |
parser = PDFParser(fp) | |
doc = PDFDocument(parser) | |
rsrcmgr = PDFResourceManager() | |
laparams = LAParams(all_texts=True) | |
device = PDFPageAggregator(rsrcmgr, laparams=laparams) | |
interpreter = PDFPageInterpreter(rsrcmgr, device) | |
interpreter.debug = True | |
text = "" | |
largest_text = { | |
'contents': "", | |
'size': 0 | |
} | |
pageno = 0 | |
for page in PDFPage.create_pages(doc): | |
interpreter.process_page(page) | |
layout = device.get_result() | |
for lt_obj in layout: | |
log('lt_obj: ' + str(lt_obj)) | |
if isinstance(lt_obj, (LTFigure, LTTextBox, LTTextLine)): | |
if isinstance(lt_obj, LTFigure): | |
(largest_text, figure_text) = extract_figure_text(lt_obj, largest_text) | |
text += figure_text | |
else: | |
largest_text = extract_largest_text(lt_obj, largest_text) | |
text += lt_obj.get_text() + '\n' | |
pageno += 1 | |
# if title valid or page number is larger than 3 is break | |
if bool(re.search(r'[a-zA-Z]+', text.strip(), flags=re.U)) or pageno > 3: | |
fp.close() | |
break | |
return largest_text, text | |
def pdf_full_text(filename): | |
doc = open(filename, 'rb') | |
rsrcmgr = PDFResourceManager() | |
laparams = LAParams(all_texts=True) | |
device = PDFPageAggregator(rsrcmgr, laparams=laparams) | |
interpreter = PDFPageInterpreter(rsrcmgr, device) | |
pageno = 0 | |
documents = [] | |
for page in PDFPage.get_pages(doc): | |
interpreter.process_page(page) | |
text = "" | |
# receive the LTPage object for the page. | |
layout = device.get_result() | |
for lt_obj in layout: | |
if isinstance(lt_obj, ( LTTextBox, LTTextLine)): | |
text += re.sub(r'\n', ' ', lt_obj.get_text()) | |
documents.append(text) | |
pageno += 1 | |
if pageno >= 20: | |
doc.close() | |
break | |
return documents | |
def title_start(lines): | |
for i, line in enumerate(lines): | |
if not empty_str(line) and not junk_line(line): | |
return i | |
return 0 | |
def title_end(lines, start, max_lines=2): | |
for i, line in enumerate(lines[start + 1:start + max_lines + 1], start + 1): | |
if empty_str(line): | |
return i | |
return start + 1 | |
def text_title(filename): | |
"""Extract title from PDF's text. | |
""" | |
(largest_text, lines_joined) = pdf_text(filename) | |
lines = lines_joined.strip().split('\n') | |
if empty_str(largest_text['contents']): | |
i = title_start(lines) | |
j = title_end(lines, i) | |
text = ' '.join(line.strip() for line in lines[i:j]) | |
else: | |
text = largest_text['contents'].strip() | |
# Strip dots, which conflict with os.path's splittext() | |
text = re.sub(r'\.', '', text) | |
# title = ' '.join(largest_text['contents'].split()) | |
# print(title.lower().split(" ")) | |
# documents = pdf_full_text(filename) | |
# t = tfidf(documents, title.lower().split(" ")) | |
# print(t) | |
return text | |
def valid_title(title): | |
return not empty_str(title) and max_word_length(title) > MIN_LONGEST_WORD and not junk_line(title) and empty_str( | |
os.path.splitext(title)[1]) | |
def pdf_title(filename): | |
"""Extract title using one of multiple strategies. | |
""" | |
title = "" | |
# try: | |
# title = meta_title(filename) | |
# if valid_title(title): | |
# return title | |
# except: | |
# print("*** Skipping invalid metadata! ***") | |
# try: | |
title = text_title(filename) | |
if valid_title(title): | |
return title | |
# except: | |
# print("*** Skipping invalid parsing! ***") | |
if valid_title(title): | |
return title | |
return os.path.basename(os.path.splitext(filename)[0]) | |
def process_file(directory, filename, rename, dry_run): | |
title = pdf_title(filename) | |
title = sanitize(' '.join(title.split())) | |
if rename: | |
new_name = os.path.join(directory, title + ".pdf") | |
print("%s" % new_name) | |
if not dry_run: | |
if os.path.exists(new_name): | |
print("*** Target %s already exists! ***" % new_name) | |
else: | |
stat = os.stat(filename) | |
os.rename(filename, new_name) | |
os.utime(new_name, (stat.st_atime, stat.st_mtime)) | |
else: | |
print("%s" % title) | |
return 0 | |
def path_leaf(path): | |
head, tail = os.path.split(path) | |
return tail or os.path.basename(head) | |
if __name__ == "__main__": | |
# a_logger = logging.getLogger() | |
# a_logger.setLevel(logging.INFO) | |
# output_file_handler = logging.FileHandler("output.log") | |
# stdout_handler = logging.StreamHandler(sys.stdout) | |
# a_logger.addHandler(output_file_handler) | |
# a_logger.addHandler(stdout_handler) | |
opts, args = getopt.getopt(sys.argv[1:], 'nd:', ['dry-run', 'rename', 'test']) | |
dry_run = False | |
rename = False | |
target_dir = "." | |
for opt, arg in opts: | |
if opt in ['-n', '--dry-run']: | |
dry_run = True | |
elif opt in ['-r', '--rename']: | |
rename = True | |
elif opt in ['-d']: | |
target_dir = arg | |
elif opt in ['-t', '--test']: | |
IS_LOG_ON = True | |
if len(args) == 0: | |
print("Usage: %s [-d output] [--dry-run] [--rename] [filenames]\n" % path_leaf(sys.argv[0])) | |
if ONE_CLICK_MODE: | |
args = ['*.pdf'] | |
rename = True | |
else: | |
sys.exit(1) | |
for filename in args: | |
if "*" in filename: | |
for filenameexpanded in glob.glob(filename): | |
process_file(target_dir, filenameexpanded, rename, dry_run) | |
else: | |
process_file(target_dir, filename, rename, dry_run) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment