Skip to content

Instantly share code, notes, and snippets.

@nobucshirai
Last active February 23, 2025 09:17
Show Gist options
  • Save nobucshirai/49421e7953140287233431f7f1ee1d9a to your computer and use it in GitHub Desktop.
Save nobucshirai/49421e7953140287233431f7f1ee1d9a to your computer and use it in GitHub Desktop.
Academic Paper Renamer: Extracts text, scans for DOI/arXiv IDs, retrieves metadata from Crossref or arXiv, and renames the file systematically.
#!/usr/bin/env python3
"""
Rename a PDF based on the DOI/arXiv ID extracted from its content and author information retrieved via the Crossref or arXiv APIs.
"""
import argparse
import os
import sys
import re
import requests
import shutil
import feedparser
import unicodedata
from typing import Tuple, List, Optional, Union, Dict
from pypdf import PdfReader
def log(message: str, verbose: bool) -> None:
    """Write *message* to standard output, but only when verbose mode is on."""
    if not verbose:
        return
    print(message)
def extract_text(pdf_file: str, verbose: bool) -> str:
    """
    Extract the text of every page of a PDF file using pypdf.

    Args:
        pdf_file: The path to the PDF file.
        verbose: Flag to enable verbose logging.

    Returns:
        The text of all pages joined with newlines. A page for which pypdf
        returns no text contributes an empty string.

    Raises:
        SystemExit: With exit status 1 if the file cannot be opened or parsed.
    """
    log(f"Extracting text from {pdf_file}", verbose)
    try:
        with open(pdf_file, "rb") as f:
            reader = PdfReader(f)
            # Join pages with a newline so an identifier at the very end of
            # one page is not fused with the first word of the next page.
            return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Report the failure unconditionally: exiting with status 1 without
        # any message (when verbose is off) leaves the user guessing.
        print(f"Error reading {pdf_file}: {e}", file=sys.stderr)
        sys.exit(1)
def _strip_trailing_punctuation(doi: str) -> str:
    """Remove sentence punctuation and unbalanced closing brackets from the end of *doi*."""
    openers = {")": "(", "]": "[", "}": "{", ">": "<"}
    while doi:
        last = doi[-1]
        if last in ".,;:'\"":
            doi = doi[:-1]
        elif last in openers and doi.count(last) > doi.count(openers[last]):
            # A closer without a matching opener belongs to the surrounding
            # prose, e.g. "(doi:10.1000/xyz)"; balanced ones are kept because
            # legitimate DOIs may contain parentheses.
            doi = doi[:-1]
        else:
            break
    return doi


def extract_doi(text: str, verbose: bool) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Extract a DOI or arXiv ID from the provided text.

    DOI forms (a doi.org / dx.doi.org URL, then a "doi:" label) are tried
    before arXiv forms, so a paper carrying both is resolved through its DOI.

    Args:
        text: The text content extracted from the PDF.
        verbose: Flag to enable verbose logging.

    Returns:
        A tuple containing:
            - The extracted identifier (DOI or arXiv ID),
            - The source ('crossref' or 'arxiv'),
            - The arXiv version string such as 'v2' ('' when absent or when
              the identifier is a DOI).
        If no identifier is found, returns (None, None, None).
    """
    log("Extracting DOI/arXiv ID from the text", verbose)

    # A DOI is "10.<registrant>/<suffix>". The previous pattern wrapped this
    # in [...] which made it a character class equal to \S+ and accepted any
    # token following "doi:".
    doi_core = r'(10\.\d{4,9}/\S+)'
    doi_patterns = [
        r'(?:https?://)?(?:dx\.)?doi\.org/' + doi_core,
        r'doi:\s?' + doi_core,
    ]
    for pattern in doi_patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            identifier = _strip_trailing_punctuation(match.group(1))
            if not identifier.partition('/')[2]:
                # Only punctuation followed the registrant prefix.
                continue
            source = "crossref"
            log(f"Found {source.upper()} ID: {identifier}", verbose)
            return identifier, source, ""

    arxiv_patterns = [
        r'arXiv:(\d{4}\.\d{4,5})(v\d+)?',                  # New format (4 or 5 digit sequence number)
        r'arXiv:([a-z-]+(?:\.[A-Z]{2})?/\d{7})(v\d+)?',    # Old format, optional subject class
    ]
    for pattern in arxiv_patterns:
        match = re.search(pattern, text)
        if match:
            identifier = match.group(1)
            source = "arxiv"
            arxiv_version = match.group(2) or ""
            log(f"Found {source.upper()} ID: {identifier}", verbose)
            return identifier, source, arxiv_version

    log("DOI/arXiv ID not found.", verbose)
    return None, None, None
def get_author_info(identifier: str, source: str, verbose: bool) -> Tuple[List[Dict[str, str]], Union[str, int], str]:
    """
    Retrieve author information and publication year from the appropriate API.

    Any lookup failure (network error, non-200 status, unparsable or empty
    response) is logged in verbose mode and reported to the caller as an
    empty author list, for both Crossref and arXiv.

    Args:
        identifier: The DOI or arXiv ID.
        source: The source, either 'crossref' or 'arxiv'.
        verbose: Flag to enable verbose logging.

    Returns:
        A tuple containing:
            - A list of author dictionaries (with 'family' and 'given' keys
              for arXiv; whatever Crossref supplies for DOIs),
            - The publication year ('' if unknown),
            - An unused field (empty string).

    Raises:
        SystemExit: If *source* is neither 'crossref' nor 'arxiv'.
    """
    # Local import keeps this block self-contained; the module does not
    # import urllib at the top of the file.
    from urllib.parse import quote

    log(f"Fetching author information from {source.upper()} for ID: {identifier}", verbose)
    timeout_seconds = 30  # without a timeout, requests can block indefinitely

    if source == 'crossref':
        base_url = 'https://api.crossref.org/works/'
        try:
            # DOIs may contain characters such as '#', '?', '<' or '>' that
            # are not valid in a URL path, so percent-encode everything but '/'.
            response = requests.get(base_url + quote(identifier, safe='/'), timeout=timeout_seconds)
        except requests.RequestException as e:
            log(f"Error contacting Crossref API: {e}", verbose)
            return [], '', ''
        if response.status_code == 404:
            log(f"DOI not found in Crossref database: {identifier}", verbose)
            return [], '', ''
        elif response.status_code != 200:
            log(f"Error fetching data from Crossref API. Status Code: {response.status_code}", verbose)
            return [], '', ''
        try:
            data = response.json().get('message', {})
        except ValueError as e:
            log(f"Invalid JSON received from Crossref API: {e}", verbose)
            return [], '', ''
        authors = data.get('author', [])
        # Prefer real publication dates; 'created' is when the Crossref
        # record was deposited and is only used as a last resort.
        year = ''
        for date_key in ('issued', 'published-print', 'published-online', 'created'):
            date_parts = (data.get(date_key) or {}).get('date-parts') or []
            if date_parts and date_parts[0] and date_parts[0][0]:
                year = str(date_parts[0][0])
                break
        return authors, year, ''

    elif source == 'arxiv':
        base_url = f'https://export.arxiv.org/api/query?id_list={identifier}'
        try:
            response = requests.get(base_url, timeout=timeout_seconds)
        except requests.RequestException as e:
            log(f"Error contacting arXiv API: {e}", verbose)
            return [], '', ''
        if response.status_code != 200:
            log(f"Error fetching data from arXiv API. Status Code: {response.status_code}", verbose)
            return [], '', ''
        feed = feedparser.parse(response.content)
        if not feed.entries:
            log("No entries found for this arXiv ID.", verbose)
            return [], '', ''
        entry = feed.entries[0]
        authors = []
        for author in entry.get('authors', []):
            name_parts = author.get('name', '').split()
            if not name_parts:
                continue
            # arXiv gives one display name; treat the last word as the
            # family name and everything before it as given names.
            authors.append({'family': name_parts[-1], 'given': " ".join(name_parts[:-1])})
        published = entry.get('published_parsed')
        year = published.tm_year if published else ''
        return authors, year, ''

    else:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Invalid source. Must be either 'crossref' or 'arxiv'.")
def _ascii_filename_part(text: str) -> str:
    """Reduce *text* to ASCII and drop characters that are unsafe in filenames."""
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Path separators would redirect the rename into another directory; the
    # remaining characters are rejected by Windows filesystems.
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', ascii_text).strip()


def format_authors(authors: List[Dict[str, str]], year: Union[str, int], arxiv_version: str,
                   verbose: bool, underscores: bool = False,
                   num_authors: Optional[int] = None, prioritize_last: bool = True) -> str:
    """
    Format author names and publication year into a new filename.

    Author selection:
        - num_authors is None: a single author is used as is; otherwise the
          first and the last author are used.
        - num_authors >= number of authors: all authors are used.
        - prioritize_last is True and num_authors > 1: the first
          (num_authors - 1) authors plus the last author.
        - otherwise: the first num_authors authors in order.
    Values of num_authors below 1 are treated as 1.

    Args:
        authors: List of author dictionaries. The 'family' key is used,
            falling back to 'name' for entries without a family name.
        year: The publication year ('' if unknown).
        arxiv_version: The version string from arXiv (if any).
        verbose: Flag to enable verbose logging.
        underscores: Flag to insert underscores between names and between name and year.
        num_authors: The number of authors to include in the filename.
        prioritize_last: Whether to always include the last author when num_authors > 1.

    Returns:
        A string representing the new filename (without file extension),
        restricted to ASCII characters.

    Raises:
        SystemExit: If *authors* is empty or nothing usable remains after
            ASCII normalization.
    """
    log("Formatting author names and publication year", verbose)
    if not authors:
        sys.exit("Error: Author information could not be retrieved. File will not be renamed.")

    # Determine which authors to include based on num_authors and prioritize_last.
    if num_authors is None:
        selected_authors = authors if len(authors) == 1 else [authors[0], authors[-1]]
    else:
        num_authors = max(num_authors, 1)
        if len(authors) <= num_authors:
            selected_authors = authors
        elif prioritize_last and num_authors > 1:
            selected_authors = authors[:num_authors - 1] + [authors[-1]]
        else:
            selected_authors = authors[:num_authors]

    # Build the name from its components and drop empty ones, so a missing
    # year or a name that vanishes during ASCII normalization does not
    # leave a dangling or doubled underscore.
    raw_parts = [author.get('family') or author.get('name') or '' for author in selected_authors]
    raw_parts.append(str(year))
    if arxiv_version:
        raw_parts.append(f"arXiv{arxiv_version}")
    cleaned_parts = [_ascii_filename_part(part) for part in raw_parts]

    delimiter = "_" if underscores else ""
    formatted_name = delimiter.join(part for part in cleaned_parts if part)
    if not formatted_name:
        # Renaming to a bare ".pdf" would create a hidden, meaningless file.
        sys.exit("Error: No usable characters left for the new filename. File will not be renamed.")

    log(f"Formatted name: {formatted_name}", verbose)
    return formatted_name
def rename_pdf(original_pdf: str, new_name: str, verbose: bool, unique: bool = False) -> None:
    """
    Rename the original PDF file to the new filename, in the same directory.

    If the target name is already taken by another file, either a free
    "<new_name>_<n>.pdf" name is chosen (unique=True) or the user is asked
    whether to overwrite. When no answer can be read (non-interactive
    stdin), the file is left untouched.

    Args:
        original_pdf: The original PDF file path.
        new_name: The new filename (without extension).
        verbose: Flag to enable verbose logging.
        unique: If True, automatically append a numerical suffix to avoid filename conflicts.
    """
    directory, original_name = os.path.split(original_pdf)
    target_name = new_name + ".pdf"
    new_pdf = os.path.join(directory, target_name)

    if original_name == target_name:
        print(f"{original_name} is already formatted. No changes made.")
        return

    if os.path.isfile(new_pdf):
        # On case-insensitive filesystems "smith2020.pdf" and "Smith2020.pdf"
        # are the same file; that is a plain rename, not a conflict.
        case_only_rename = False
        if original_name.lower() == target_name.lower():
            try:
                case_only_rename = os.path.samefile(original_pdf, new_pdf)
            except OSError:
                case_only_rename = False

        if case_only_rename:
            pass
        elif unique:
            suffix = 1
            while True:
                candidate = os.path.join(directory, f"{new_name}_{suffix}.pdf")
                if not os.path.isfile(candidate):
                    new_pdf = candidate
                    break
                suffix += 1
        else:
            try:
                overwrite = input(f"{new_pdf} already exists. Overwrite? (y)/(n): ")
            except EOFError:
                # No terminal to ask (e.g. run from a pipe or cron job):
                # never overwrite without explicit consent.
                print(f"{new_pdf} already exists and no answer could be read. "
                      f"'{original_name}' was not renamed.", file=sys.stderr)
                return
            if overwrite.strip().lower() not in ('y', 'yes'):
                log("Operation cancelled by user.", verbose)
                return

    shutil.move(original_pdf, new_pdf)
    print(f"Renamed '{original_name}' to '{os.path.basename(new_pdf)}'")
def process_pdf(pdf_file: str, args) -> None:
    """
    Process a single PDF file: extract text, retrieve metadata, format a new name, and rename the file.

    The file is skipped, with a message on standard output, when no
    DOI/arXiv ID is found or when no author metadata can be retrieved.

    Args:
        pdf_file: Path of the PDF to process.
        args: Parsed command-line options; the attributes read here are
            verbose, doi_based, no_priority_last, all_authors, num_authors,
            underscores and unique.
    """
    text = extract_text(pdf_file, args.verbose)
    doi, source, arxiv_version = extract_doi(text, args.verbose)
    if not doi:
        print(f"Could not extract DOI/arXiv ID from {pdf_file}. Skipping.")
        return

    if args.doi_based:
        # Replace '/' with '_' to ensure a valid filename.
        formatted_name = doi.replace('/', '_')
        log(f"Using DOI-based naming: {formatted_name}", args.verbose)
    else:
        authors, year, _ = get_author_info(doi, source, args.verbose)
        if not authors:
            # format_authors() terminates the program on an empty author
            # list; skipping here lets the remaining files be processed.
            print(f"Could not retrieve author information for {pdf_file}. Skipping.")
            return
        prioritize_last = not args.no_priority_last
        num_authors = len(authors) if args.all_authors else args.num_authors
        formatted_name = format_authors(
            authors, year, arxiv_version, args.verbose,
            underscores=args.underscores,
            num_authors=num_authors,
            prioritize_last=prioritize_last,
        )

    rename_pdf(pdf_file, formatted_name, args.verbose, unique=args.unique)
def main() -> None:
    """Parse the command-line options and process every PDF given, in order."""
    cli = argparse.ArgumentParser(
        description='Rename a PDF based on the DOI/arXiv ID and associated author information.'
    )

    # (flags, add_argument options) pairs, registered in --help order.
    # The first entry accepts one or more PDF files.
    option_specs = [
        (('pdf_files',),
         dict(nargs='+', type=str, help='One or more PDF files to process.')),
        (('-v', '--verbose'),
         dict(action='store_true', help='Enable verbose logging')),
        (('--doi-based',),
         dict(action='store_true',
              help='Use DOI for naming instead of author-year format')),
        (('--underscores',),
         dict(action='store_true',
              help='Insert underscores between names and between name and year in the filename')),
        (('--num-authors',),
         dict(type=int, default=2,
              help='Specify the number of authors to include in the filename. Default is 2 (first and last author).')),
        (('--no-priority-last',),
         dict(action='store_true',
              help='Do not prioritize the last author when selecting authors; use the first N authors instead.')),
        (('--all-authors',),
         dict(action='store_true',
              help='Include all author information in the filename')),
        (('--unique',),
         dict(action='store_true',
              help='Automatically append a numerical suffix to avoid filename conflicts')),
    ]
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    parsed = cli.parse_args()
    for path in parsed.pdf_files:
        process_pdf(path, parsed)
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script,
    # not when this module is imported.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment