Instantly share code, notes, and snippets.
Last active
February 23, 2025 09:17
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
Save nobucshirai/49421e7953140287233431f7f1ee1d9a to your computer and use it in GitHub Desktop.
Academic Paper Renamer: Extracts text, scans for DOI/arXiv IDs, retrieves metadata from Crossref or arXiv, and renames the file systematically.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Rename a PDF based on the DOI/arXiv ID extracted from its content and author information retrieved via the Crossref or arXiv APIs. | |
""" | |
import argparse | |
import os | |
import sys | |
import re | |
import requests | |
import shutil | |
import feedparser | |
import unicodedata | |
from typing import Tuple, List, Optional, Union, Dict | |
from pypdf import PdfReader | |
def log(message: str, verbose: bool) -> None:
    """Write ``message`` to standard output, but only in verbose mode."""
    if not verbose:
        return
    print(message)
def extract_text(pdf_file: str, verbose: bool) -> str:
    """
    Extract the text of every page of a PDF file.

    Page texts are joined with a newline so that a token ending one page
    (for example a DOI printed in a footer) cannot fuse with the first word
    of the following page.

    Args:
        pdf_file: The path to the PDF file.
        verbose: Flag to enable verbose logging.

    Returns:
        The text of all pages in page order, separated by newlines. A page
        without extractable text contributes an empty string.

    Raises:
        SystemExit: With exit status 1 if the file cannot be opened or read.
    """
    log(f"Extracting text from {pdf_file}", verbose)
    try:
        with open(pdf_file, "rb") as f:
            reader = PdfReader(f)
            # Extract while the file is still open: the reader was built on this handle.
            page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Top-level boundary for any I/O or PDF parsing failure. The message
        # goes to stderr unconditionally so the exit is never silent.
        print(f"Error reading {pdf_file}: {e}", file=sys.stderr)
        sys.exit(1)
    return "\n".join(page_texts)
# Ordered (compiled regex, source) pairs; DOI forms take priority over arXiv
# IDs. Group 1 is the identifier. The arXiv patterns capture an optional
# version suffix ("v2") in group 2.
_IDENTIFIER_PATTERNS = (
    # https://doi.org/..., http://doi.org/..., http(s)://dx.doi.org/...
    (re.compile(r'https?://(?:dx\.)?doi\.org/(10\.\d{4,9}/\S+)', re.IGNORECASE), "crossref"),
    # "doi: 10.xxxx/..." in any letter case
    (re.compile(r'doi:\s*(10\.\d{4,9}/\S+)', re.IGNORECASE), "crossref"),
    # New arXiv format: YYMM.NNNN (before 2015) or YYMM.NNNNN
    (re.compile(r'arXiv:\s*(\d{4}\.\d{4,5})(v\d+)?', re.IGNORECASE), "arxiv"),
    # Old arXiv format: archive[.SC]/YYMMNNN, e.g. hep-th/9901001 or math.AG/0601001
    (re.compile(r'arXiv:\s*([a-z-]+(?:\.[a-z]{2})?/\d{7})(v\d+)?', re.IGNORECASE), "arxiv"),
)

# Closing brackets mapped to their openers, used to detect unbalanced closers.
_BRACKET_PAIRS = {')': '(', ']': '[', '}': '{', '>': '<'}


def _clean_doi(candidate: str) -> str:
    """Strip sentence punctuation and unbalanced closing brackets from the end of a DOI match."""
    trailing = '.,;:\'"'
    doi = candidate.rstrip(trailing)
    # A closer with no matching opener inside the DOI belongs to the
    # surrounding sentence, e.g. "(doi:10.1000/xyz)".
    while (doi and doi[-1] in _BRACKET_PAIRS
           and doi.count(doi[-1]) > doi.count(_BRACKET_PAIRS[doi[-1]])):
        doi = doi[:-1].rstrip(trailing)
    return doi


def extract_doi(text: str, verbose: bool) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Extract a DOI or arXiv ID from the provided text.

    Only well-formed identifiers are accepted: a DOI must look like
    ``10.<registrant>/<suffix>``, and trailing punctuation that belongs to
    the surrounding sentence is removed from it.

    Args:
        text: The text content extracted from the PDF.
        verbose: Flag to enable verbose logging.

    Returns:
        A tuple containing:
            - The extracted identifier (DOI or arXiv ID),
            - The source ('crossref' or 'arxiv'),
            - The arXiv version string such as 'v2' ('' if absent or for DOIs).
        If no identifier is found, returns (None, None, None).
    """
    log("Extracting DOI/arXiv ID from the text", verbose)
    for pattern, source in _IDENTIFIER_PATTERNS:
        for match in pattern.finditer(text):
            if source == "arxiv":
                identifier = match.group(1)
                arxiv_version = match.group(2) or ""
            else:
                identifier = _clean_doi(match.group(1))
                arxiv_version = ""
                if identifier.endswith('/'):
                    # Nothing but punctuation followed the prefix; try the next match.
                    continue
            log(f"Found {source.upper()} ID: {identifier}", verbose)
            return identifier, source, arxiv_version
    log("DOI/arXiv ID not found.", verbose)
    return None, None, None
# Seconds to wait for a metadata server before giving up on a request.
_REQUEST_TIMEOUT = 30

# Crossref date fields, ordered from most to least representative of the
# publication date. 'created' is the date the DOI record was deposited and is
# only a last resort.
_CROSSREF_DATE_FIELDS = ('issued', 'published', 'published-print', 'published-online', 'created')


def _crossref_year(work: dict) -> str:
    """Return the publication year of a Crossref work as a string, or '' if unknown."""
    for field in _CROSSREF_DATE_FIELDS:
        date_parts = (work.get(field) or {}).get('date-parts') or []
        # Guard against empty or null-filled date-parts such as [[None]].
        if date_parts and date_parts[0] and date_parts[0][0]:
            return str(date_parts[0][0])
    return ''


def _fetch_crossref(doi: str, verbose: bool) -> Tuple[List[Dict[str, str]], Union[str, int], str]:
    """Look up a DOI in Crossref; returns ([], '', '') on any failure."""
    from urllib.parse import quote

    # DOIs may contain characters such as '#', '?' or '<' that must be
    # percent-encoded to survive as a URL path.
    url = 'https://api.crossref.org/works/' + quote(doi, safe='/')
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        log(f"Error contacting Crossref API: {e}", verbose)
        return [], '', ''
    if response.status_code == 404:
        log(f"DOI not found in Crossref database: {doi}", verbose)
        return [], '', ''
    if response.status_code != 200:
        log(f"Error fetching data from Crossref API. Status Code: {response.status_code}", verbose)
        return [], '', ''
    try:
        work = response.json().get('message', {})
    except ValueError as e:
        log(f"Invalid JSON received from Crossref API: {e}", verbose)
        return [], '', ''
    return work.get('author', []), _crossref_year(work), ''


def _fetch_arxiv(arxiv_id: str, verbose: bool) -> Tuple[List[Dict[str, str]], Union[str, int], str]:
    """Look up an arXiv ID via the arXiv API; returns ([], '', '') on any failure."""
    url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        log(f"Error contacting arXiv API: {e}", verbose)
        return [], '', ''
    if response.status_code != 200:
        log(f"Error fetching data from arXiv API. Status Code: {response.status_code}", verbose)
        return [], '', ''
    feed = feedparser.parse(response.content)
    if not feed.entries:
        log("No entries found for this arXiv ID.", verbose)
        return [], '', ''
    entry = feed.entries[0]
    authors = []
    for author in entry.get('authors', []):
        name_parts = author.get('name', '').split()
        if not name_parts:
            continue
        # The feed gives one display name per author; treat the last word as the family name.
        authors.append({'family': name_parts[-1], 'given': " ".join(name_parts[:-1])})
    published = entry.get('published_parsed')
    year = published.tm_year if published else ''
    return authors, year, ''


def get_author_info(identifier: str, source: str, verbose: bool) -> Tuple[List[Dict[str, str]], Union[str, int], str]:
    """
    Retrieve author information and publication year from the appropriate API.

    Lookup failures (network error, timeout, non-200 status, unparsable or
    empty response) are logged in verbose mode and reported to the caller as
    an empty author list, for both Crossref and arXiv.

    Args:
        identifier: The DOI or arXiv ID.
        source: The source, either 'crossref' or 'arxiv'.
        verbose: Flag to enable verbose logging.

    Returns:
        A tuple containing:
            - A list of author dictionaries (empty if the lookup failed),
            - The publication year (str for Crossref, int for arXiv, '' if unknown),
            - An unused field (empty string).

    Raises:
        SystemExit: With exit status 1 if ``source`` is neither 'crossref' nor 'arxiv'.
    """
    log(f"Fetching author information from {source.upper()} for ID: {identifier}", verbose)
    if source == 'crossref':
        return _fetch_crossref(identifier, verbose)
    if source == 'arxiv':
        return _fetch_arxiv(identifier, verbose)
    print("Invalid source. Must be either 'crossref' or 'arxiv'.", file=sys.stderr)
    sys.exit(1)
# Latin letters without a Unicode decomposition; NFKD plus ASCII encoding
# would otherwise drop them entirely ("Møller" -> "Mller").
_ASCII_FALLBACKS = str.maketrans({
    'ø': 'o', 'Ø': 'O', 'ł': 'l', 'Ł': 'L', 'đ': 'd', 'Đ': 'D',
    'ð': 'd', 'Ð': 'D', 'þ': 'th', 'Þ': 'Th', 'ß': 'ss',
    'æ': 'ae', 'Æ': 'AE', 'œ': 'oe', 'Œ': 'OE', 'ı': 'i',
})

# Path separators, control characters and characters that are invalid in
# file names on common platforms.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _ascii_filename_part(value: str) -> str:
    """Reduce text to ASCII characters that are safe inside a file name."""
    folded = unicodedata.normalize('NFKD', value.translate(_ASCII_FALLBACKS))
    ascii_text = folded.encode('ascii', 'ignore').decode('ascii')
    return _UNSAFE_FILENAME_CHARS.sub('', ascii_text).strip()


def _select_authors(authors: List[Dict[str, str]], num_authors: Optional[int],
                    prioritize_last: bool) -> List[Dict[str, str]]:
    """Pick which authors appear in the file name (``authors`` must be non-empty)."""
    if num_authors is None:
        # Legacy default: a sole author, or the first and last of several.
        return authors if len(authors) == 1 else [authors[0], authors[-1]]
    limit = max(num_authors, 1)
    if len(authors) <= limit:
        return authors
    if prioritize_last and limit > 1:
        return authors[:limit - 1] + [authors[-1]]
    return authors[:limit]


def format_authors(authors: List[Dict[str, str]], year: Union[str, int], arxiv_version: str,
                   verbose: bool, underscores: bool = False,
                   num_authors: Optional[int] = None, prioritize_last: bool = True) -> str:
    """
    Format author names and publication year into a new filename.

    The name is built from the selected authors' family names (falling back
    to the 'name' key when an author has no 'family' key), followed by the
    year and, if given, ``arXiv<version>``. Missing parts are skipped, so no
    dangling separator is produced. The result is folded to ASCII and
    stripped of characters that are unsafe in file names.

    Author selection: with ``num_authors`` of N and ``prioritize_last`` True,
    the first N-1 authors and the last author are used; with
    ``prioritize_last`` False, the first N authors. Values below 1 are
    treated as 1. If there are no more than N authors, all are used.

    Args:
        authors: List of author dictionaries.
        year: The publication year.
        arxiv_version: The version string from arXiv (if any).
        verbose: Flag to enable verbose logging.
        underscores: Flag to insert underscores between names and between name and year.
        num_authors: The number of authors to include in the filename.
            If None, uses one author for a single-author paper and the
            first and last authors otherwise.
        prioritize_last: Whether to always include the last author when num_authors > 1.

    Returns:
        A string representing the new filename (without file extension).

    Raises:
        SystemExit: If ``authors`` is empty or no selected author has a name
            that survives ASCII folding.
    """
    log("Formatting author names and publication year", verbose)
    if not authors:
        sys.exit("Error: Author information could not be retrieved. File will not be renamed.")

    selected_authors = _select_authors(authors, num_authors, prioritize_last)
    names = [
        _ascii_filename_part(author.get('family') or author.get('name') or '')
        for author in selected_authors
    ]
    parts = [name for name in names if name]
    if not parts:
        # A year-only file name would be meaningless and collide across papers.
        sys.exit("Error: Author information could not be retrieved. File will not be renamed.")

    year_str = _ascii_filename_part(str(year))
    if year_str:
        parts.append(year_str)
    if arxiv_version:
        parts.append(f"arXiv{arxiv_version}")

    delimiter = "_" if underscores else ""
    formatted_name = delimiter.join(parts)
    log(f"Formatted name: {formatted_name}", verbose)
    return formatted_name
def rename_pdf(original_pdf: str, new_name: str, verbose: bool, unique: bool = False) -> None:
    """
    Rename the original PDF file to the new filename, in the same directory.

    If the target name is already taken, either a free ``<name>_<n>.pdf`` is
    chosen (``unique=True``) or the user is asked whether to overwrite. Any
    answer other than 'y', including end-of-input on a non-interactive
    stdin, cancels the rename.

    Args:
        original_pdf: The original PDF file path.
        new_name: The new filename (without extension).
        verbose: Flag to enable verbose logging.
        unique: If True, automatically append a numerical suffix to avoid filename conflicts.
    """
    directory, original_name = os.path.split(original_pdf)
    target_name = new_name + ".pdf"
    new_pdf = os.path.join(directory, target_name)

    if original_name == target_name:
        print(f"{original_name} is already formatted. No changes made.")
        return

    # ``exists`` rather than ``isfile``: a directory occupying the target name
    # is also a conflict (shutil.move would otherwise move the PDF into it).
    if os.path.exists(new_pdf):
        if unique:
            suffix = 1
            while os.path.exists(new_pdf):
                new_pdf = os.path.join(directory, f"{new_name}_{suffix}.pdf")
                suffix += 1
        elif not os.path.isfile(new_pdf):
            print(f"{new_pdf} already exists and is not a regular file. Skipping.")
            return
        else:
            try:
                overwrite = input(f"{new_pdf} already exists. Overwrite? (y)/(n): ")
            except EOFError:
                # No interactive stdin: never overwrite without confirmation.
                overwrite = ''
            if overwrite.strip().lower() != 'y':
                log("Operation cancelled by user.", verbose)
                return

    shutil.move(original_pdf, new_pdf)
    print(f"Renamed '{original_name}' to '{os.path.basename(new_pdf)}'")
def process_pdf(pdf_file: str, args: argparse.Namespace) -> None:
    """
    Process a single PDF file: extract text, retrieve metadata, format a new name, and rename the file.

    Files in which no DOI/arXiv ID can be found are reported and skipped.

    Args:
        pdf_file: Path of the PDF to process.
        args: Parsed command-line options. Reads ``verbose``, ``doi_based``,
            ``underscores``, ``num_authors``, ``no_priority_last``,
            ``all_authors`` and ``unique``.
    """
    text = extract_text(pdf_file, args.verbose)
    doi, source, arxiv_version = extract_doi(text, args.verbose)
    if not doi:
        print(f"Could not extract DOI/arXiv ID from {pdf_file}. Skipping.")
        return

    if args.doi_based:
        # Replace '/' and the other characters that are invalid in file names
        # on common platforms; DOIs may legally contain ':', '<', '>' etc.
        formatted_name = re.sub(r'[\\/:*?"<>|]', '_', doi)
        log(f"Using DOI-based naming: {formatted_name}", args.verbose)
    else:
        authors, year, _ = get_author_info(doi, source, args.verbose)
        # --all-authors overrides --num-authors by asking for every author.
        num_authors = len(authors) if args.all_authors else args.num_authors
        formatted_name = format_authors(
            authors, year, arxiv_version, args.verbose,
            underscores=args.underscores,
            num_authors=num_authors,
            prioritize_last=not args.no_priority_last,
        )

    rename_pdf(pdf_file, formatted_name, args.verbose, unique=args.unique)
def main() -> None:
    """Parse the command line and process each given PDF file in order."""
    parser = argparse.ArgumentParser(
        description='Rename a PDF based on the DOI/arXiv ID and associated author information.'
    )

    # Positional argument: one or more input files.
    parser.add_argument('pdf_files', nargs='+', type=str, help='One or more PDF files to process.')

    # Optional arguments, registered in the order they appear in --help.
    option_specs = [
        (('-v', '--verbose'),
         {'action': 'store_true', 'help': 'Enable verbose logging'}),
        (('--doi-based',),
         {'action': 'store_true',
          'help': 'Use DOI for naming instead of author-year format'}),
        (('--underscores',),
         {'action': 'store_true',
          'help': 'Insert underscores between names and between name and year in the filename'}),
        (('--num-authors',),
         {'type': int, 'default': 2,
          'help': 'Specify the number of authors to include in the filename. Default is 2 (first and last author).'}),
        (('--no-priority-last',),
         {'action': 'store_true',
          'help': 'Do not prioritize the last author when selecting authors; use the first N authors instead.'}),
        (('--all-authors',),
         {'action': 'store_true',
          'help': 'Include all author information in the filename'}),
        (('--unique',),
         {'action': 'store_true',
          'help': 'Automatically append a numerical suffix to avoid filename conflicts'}),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    cli_args = parser.parse_args()
    for path in cli_args.pdf_files:
        process_pdf(path, cli_args)
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment