nobucshirai · February 23, 2025 09:17
diff --git a/author_year_renamer.py b/author_year_renamer.py
 #!/usr/bin/env python3
 """
 Rename a PDF based on the DOI/arXiv ID extracted from its content and author information retrieved via the Crossref or arXiv APIs.
 """

 import argparse
 import os
 import sys
 import re
 import requests
 import shutil
 import feedparser
 import unicodedata
 from typing import Tuple, List, Optional, Union, Dict
 from pypdf import PdfReader

 def log(message: str, verbose: bool) -> None:
    """Write ``message`` to standard output when ``verbose`` is truthy; otherwise do nothing."""
    if not verbose:
        return
    print(message)

 def extract_text(pdf_file: str, verbose: bool) -> str:
    """
    Extract the text of every page of a PDF file.

    Args:
        pdf_file: The path to the PDF file.
        verbose: Flag to enable verbose logging.

    Returns:
        The text of all pages concatenated in page order. A page whose
        extraction yields a falsy result contributes nothing.

    Raises:
        SystemExit: With exit status 1 and an explanatory message on stderr
            if the file cannot be opened or parsed.
    """
    log(f"Extracting text from {pdf_file}", verbose)
    try:
        with open(pdf_file, "rb") as f:
            reader = PdfReader(f)
            # join() avoids repeated string reallocation on long documents.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberately broad: opening and parsing can fail in many ways and
        # all of them are fatal here. The message is handed to sys.exit so it
        # reaches stderr (exit status 1) even when verbose logging is off.
        sys.exit(f"Error reading {pdf_file}: {e}")

 def _strip_doi_trailer(doi: str) -> str:
    """Drop sentence punctuation and unbalanced closing brackets from the end of a DOI match."""
    closers = {')': '(', ']': '[', '}': '{', '>': '<'}
    while doi:
        last = doi[-1]
        if last in '.,;:\'"':
            doi = doi[:-1]
        elif last in closers and doi.count(last) > doi.count(closers[last]):
            # A closer with no matching opener belongs to the surrounding
            # prose, e.g. "(doi:10.1000/xyz)"; balanced ones are kept.
            doi = doi[:-1]
        else:
            break
    return doi

 def extract_doi(text: str, verbose: bool) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Extract a DOI or arXiv ID from the provided text.

    Patterns are tried in a fixed priority order (doi.org URLs, "doi:" /
    "DOI:" labels, then arXiv IDs); the first pattern that yields a usable
    identifier wins.

    Args:
        text: The text content extracted from the PDF.
        verbose: Flag to enable verbose logging.

    Returns:
        A tuple containing:
            - The extracted identifier (DOI or arXiv ID),
            - The source ('crossref' or 'arxiv'),
            - The arXiv version string such as 'v2', or '' when absent or
              when the identifier is a DOI.
        If no identifier is found, returns (None, None, None).
    """
    log("Extracting DOI/arXiv ID from the text", verbose)
    # A DOI is "10.", a numeric registrant code, "/", then a suffix. Spelling
    # this out (rather than a bracketed character class, which matches any
    # run of non-space characters) keeps arbitrary words after "doi:" from
    # being reported as identifiers.
    doi_body = r'(10\.\d{4,9}/\S+)'
    patterns = [
        (r'https://doi\.org/' + doi_body, 'crossref'),
        (r'http://dx\.doi\.org/' + doi_body, 'crossref'),
        (r'doi:\s?' + doi_body, 'crossref'),
        (r'DOI:\s?' + doi_body, 'crossref'),
        # New arXiv format: four digits after the dot up to 2014, five since.
        (r'arXiv:(\d{4}\.\d{4,5})(v\d+)?', 'arxiv'),
        # Old arXiv format, e.g. hep-th/9901001.
        (r'arXiv:([a-z-]+/\d{7})(v\d+)?', 'arxiv'),
    ]
    for pattern, source in patterns:
        match = re.search(pattern, text)
        if not match:
            continue
        identifier = match.group(1)
        arxiv_version = ""
        if source == "arxiv":
            arxiv_version = match.group(2) or ""
        else:
            identifier = _strip_doi_trailer(identifier)
            if identifier.endswith('/'):
                # The suffix was nothing but punctuation; not a real DOI.
                continue
        log(f"Found {source.upper()} ID: {identifier}", verbose)
        return identifier, source, arxiv_version

    log("DOI/arXiv ID not found.", verbose)
    return None, None, None

 def _crossref_year(record: Dict) -> str:
    """Return the publication year of a Crossref work record as a string, or '' if none is usable."""
    # 'created' is the DOI registration date, which can be years later than
    # publication, so it is only consulted as a last resort.
    for field in ('issued', 'published-print', 'published-online', 'created'):
        date_parts = (record.get(field) or {}).get('date-parts') or []
        if date_parts and date_parts[0] and date_parts[0][0]:
            return str(date_parts[0][0])
    return ''

 def get_author_info(identifier: str, source: str, verbose: bool,
                     timeout: float = 10.0) -> Tuple[List[Dict[str, str]], Union[str, int], str]:
    """
    Retrieve author information and publication year from the appropriate API.

    Every lookup failure (network error, non-200 status, undecodable or
    empty response) is logged in verbose mode and reported to the caller as
    ``([], '', '')`` for both sources.

    Args:
        identifier: The DOI or arXiv ID.
        source: The source, either 'crossref' or 'arxiv'.
        verbose: Flag to enable verbose logging.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A tuple containing:
            - A list of author dictionaries (Crossref's own records, or
              {'family', 'given'} dictionaries built from arXiv names),
            - The publication year (str for Crossref, int for arXiv, '' if unknown),
            - An unused field (empty string).

    Raises:
        SystemExit: If ``source`` is neither 'crossref' nor 'arxiv'.
    """
    log(f"Fetching author information from {source.upper()} for ID: {identifier}", verbose)
    if source == 'crossref':
        api_name = 'Crossref'
        url = 'https://api.crossref.org/works/' + identifier
    elif source == 'arxiv':
        api_name = 'arXiv'
        url = f'http://export.arxiv.org/api/query?id_list={identifier}'
    else:
        sys.exit("Invalid source. Must be either 'crossref' or 'arxiv'.")

    try:
        # Without a timeout a stalled server would hang the whole run.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        log(f"Error contacting {api_name} API: {e}", verbose)
        return [], '', ''

    if source == 'crossref' and response.status_code == 404:
        log(f"DOI not found in Crossref database: {identifier}", verbose)
        return [], '', ''
    if response.status_code != 200:
        log(f"Error fetching data from {api_name} API. Status Code: {response.status_code}", verbose)
        return [], '', ''

    if source == 'crossref':
        try:
            data = response.json().get('message', {})
        except ValueError as e:
            log(f"Error decoding Crossref API response: {e}", verbose)
            return [], '', ''
        return data.get('author', []), _crossref_year(data), ''

    feed = feedparser.parse(response.content)
    if not feed.entries:
        log("No entries found for this arXiv ID.", verbose)
        return [], '', ''
    entry = feed.entries[0]
    authors = []
    for author in entry.get('authors', []):
        name_parts = author.get('name', '').split()
        if not name_parts:
            continue
        # arXiv gives one display name; treat the last token as the family name.
        authors.append({'family': name_parts[-1], 'given': " ".join(name_parts[:-1])})
    published = entry.get('published_parsed')
    year = published.tm_year if published else ''
    return authors, year, ''

 def format_authors(authors: List[Dict[str, str]], year: Union[str, int], arxiv_version: str,
                    verbose: bool, underscores: bool = False,
                    num_authors: Optional[int] = None, prioritize_last: bool = True) -> str:
    """
    Format author names and publication year into a new filename.

    Each author contributes their 'family' value, falling back to 'name'
    when 'family' is missing or empty; authors with neither are ignored.
    From the remaining names:
      * if num_authors is None, the first and last author are used (or the
        only author when there is just one);
      * otherwise at most num_authors names are used (values below 1 are
        treated as 1). With prioritize_last and num_authors > 1 these are
        the first (num_authors - 1) authors plus the last one; without it,
        the first num_authors authors in order.

    Args:
        authors: List of author dictionaries.
        year: The publication year; omitted from the result when empty.
        arxiv_version: The version string from arXiv (if any).
        verbose: Flag to enable verbose logging.
        underscores: Flag to insert underscores between names and between name and year.
        num_authors: The number of authors to include in the filename, or None
                     for the first-and-last default.
        prioritize_last: Whether to always include the last author when num_authors > 1.

    Returns:
        A string representing the new filename (without file extension),
        reduced to ASCII and stripped of path separators.

    Raises:
        SystemExit: If no author yields a usable name.
    """
    log("Formatting author names and publication year", verbose)

    names = []
    for author in authors:
        label = (author.get('family') or author.get('name') or '').strip()
        if label:
            names.append(label)
    if not names:
        sys.exit("Error: Author information could not be retrieved. File will not be renamed.")

    # Determine which authors to include based on num_authors and prioritize_last.
    if num_authors is None:
        limit, keep_last = 2, True
    else:
        limit, keep_last = max(num_authors, 1), prioritize_last

    if len(names) <= limit:
        selected = names
    elif keep_last and limit > 1:
        selected = names[:limit - 1] + [names[-1]]
    else:
        selected = names[:limit]

    # Build the parts list first so an empty year or version never leaves a
    # dangling or doubled delimiter behind.
    parts = list(selected)
    year_str = str(year) if year else ''
    if year_str:
        parts.append(year_str)
    if arxiv_version:
        parts.append(f"arXiv{arxiv_version}")
    delimiter = "_" if underscores else ""
    formatted_name = delimiter.join(parts)

    # Normalize to ASCII to avoid encoding issues in filenames; characters
    # with no ASCII decomposition are dropped.
    formatted_name = unicodedata.normalize('NFKD', formatted_name).encode('ascii', 'ignore').decode('ascii')
    # A path separator or NUL inside a name would redirect or break the rename.
    formatted_name = formatted_name.translate({ord(ch): None for ch in ('/', '\\', '\0')})
    log(f"Formatted name: {formatted_name}", verbose)
    return formatted_name

 def rename_pdf(original_pdf: str, new_name: str, verbose: bool, unique: bool = False) -> None:
    """
    Move a PDF to ``<new_name>.pdf`` inside its current directory.

    Nothing happens when the file already carries the target name. When
    another file occupies the target path, either a free ``_<n>`` suffix is
    chosen (``unique``) or the user is asked whether to overwrite.

    Args:
        original_pdf: The original PDF file path.
        new_name: The new filename (without extension).
        verbose: Flag to enable verbose logging.
        unique: If True, automatically append a numerical suffix to avoid filename conflicts.
    """
    folder, current_name = os.path.split(original_pdf)
    target_name = new_name + ".pdf"

    if current_name == target_name:
        print(f"{current_name} is already formatted. No changes made.")
        return

    destination = os.path.join(folder, target_name)
    if os.path.isfile(destination):
        if not unique:
            answer = input(f"{destination} already exists. Overwrite? (y)/(n): ")
            if answer.lower() != 'y':
                log("Operation cancelled by user.", verbose)
                return
        else:
            # Count upwards until an unused "<name>_<n>.pdf" is found.
            counter = 1
            destination = os.path.join(folder, f"{new_name}_{counter}.pdf")
            while os.path.isfile(destination):
                counter += 1
                destination = os.path.join(folder, f"{new_name}_{counter}.pdf")

    shutil.move(original_pdf, destination)
    print(f"Renamed '{current_name}' to '{os.path.basename(destination)}'")

 def process_pdf(pdf_file: str, args) -> None:
    """
    Run the full pipeline for one PDF: read its text, find a DOI/arXiv ID,
    derive the new filename and rename the file.

    The file is skipped (with a message) when no identifier is found. With
    ``args.doi_based`` the identifier itself becomes the filename; otherwise
    author metadata is fetched and formatted according to the CLI options.

    Args:
        pdf_file: Path of the PDF to process.
        args: Parsed command-line namespace; reads verbose, doi_based,
              no_priority_last, all_authors, num_authors, underscores and unique.
    """
    content = extract_text(pdf_file, args.verbose)
    identifier, origin, version = extract_doi(content, args.verbose)
    if not identifier:
        print(f"Could not extract DOI/arXiv ID from {pdf_file}. Skipping.")
        return

    if not args.doi_based:
        author_list, pub_year, _ = get_author_info(identifier, origin, args.verbose)
        keep_last = not args.no_priority_last
        author_count = len(author_list) if args.all_authors else args.num_authors
        target_stem = format_authors(
            author_list,
            pub_year,
            version,
            args.verbose,
            underscores=args.underscores,
            num_authors=author_count,
            prioritize_last=keep_last,
        )
    else:
        # '/' cannot appear in a filename, so it is swapped for '_'.
        target_stem = identifier.replace('/', '_')
        log(f"Using DOI-based naming: {target_stem}", args.verbose)

    rename_pdf(pdf_file, target_stem, args.verbose, unique=args.unique)

 def main() -> None:
    """Parse the command line and process each PDF given, in order."""
    parser = argparse.ArgumentParser(
        description='Rename a PDF based on the DOI/arXiv ID and associated author information.'
    )
    # (flags, keyword options) for every argument, in the order shown by --help.
    argument_specs = [
        # Accept one or more PDF files.
        (('pdf_files',),
         dict(nargs='+', type=str, help='One or more PDF files to process.')),
        (('-v', '--verbose'),
         dict(action='store_true', help='Enable verbose logging')),
        (('--doi-based',),
         dict(action='store_true',
              help='Use DOI for naming instead of author-year format')),
        (('--underscores',),
         dict(action='store_true',
              help='Insert underscores between names and between name and year in the filename')),
        (('--num-authors',),
         dict(type=int, default=2,
              help='Specify the number of authors to include in the filename. Default is 2 (first and last author).')),
        (('--no-priority-last',),
         dict(action='store_true',
              help='Do not prioritize the last author when selecting authors; use the first N authors instead.')),
        (('--all-authors',),
         dict(action='store_true',
              help='Include all author information in the filename')),
        (('--unique',),
         dict(action='store_true',
              help='Automatically append a numerical suffix to avoid filename conflicts')),
    ]
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    cli_args = parser.parse_args()

    for path in cli_args.pdf_files:
        process_pdf(path, cli_args)

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Rename a PDF based on the DOI/arXiv ID extracted from its content and author information retrieved via the Crossref or arXiv APIs.
	"""

	import argparse
	import os
	import sys
	import re
	import requests
	import shutil
	import feedparser
	import unicodedata
	from typing import Tuple, List, Optional, Union, Dict
	from pypdf import PdfReader

	def log(message: str, verbose: bool) -> None:
	    """Write ``message`` to standard output when ``verbose`` is truthy; otherwise do nothing."""
	    # Body indentation restored: the flattened lines were not a valid function.
	    if verbose:
	        print(message)

	def extract_text(pdf_file: str, verbose: bool) -> str:
	    """
	    Extract the text of every page of a PDF file.

	    Args:
	        pdf_file: The path to the PDF file.
	        verbose: Flag to enable verbose logging.

	    Returns:
	        The text of all pages concatenated in page order. A page whose
	        extraction yields a falsy result contributes nothing.

	    Raises:
	        SystemExit: With exit status 1 and an explanatory message on stderr
	            if the file cannot be opened or parsed.
	    """
	    log(f"Extracting text from {pdf_file}", verbose)
	    try:
	        with open(pdf_file, "rb") as f:
	            reader = PdfReader(f)
	            # join() avoids repeated string reallocation on long documents.
	            return "".join(page.extract_text() or "" for page in reader.pages)
	    except Exception as e:
	        # Deliberately broad: opening and parsing can fail in many ways and
	        # all of them are fatal here. The message is handed to sys.exit so it
	        # reaches stderr (exit status 1) even when verbose logging is off.
	        sys.exit(f"Error reading {pdf_file}: {e}")

	def _strip_doi_trailer(doi: str) -> str:
	    """Drop sentence punctuation and unbalanced closing brackets from the end of a DOI match."""
	    closers = {')': '(', ']': '[', '}': '{', '>': '<'}
	    while doi:
	        last = doi[-1]
	        if last in '.,;:\'"':
	            doi = doi[:-1]
	        elif last in closers and doi.count(last) > doi.count(closers[last]):
	            # A closer with no matching opener belongs to the surrounding
	            # prose, e.g. "(doi:10.1000/xyz)"; balanced ones are kept.
	            doi = doi[:-1]
	        else:
	            break
	    return doi

	def extract_doi(text: str, verbose: bool) -> Tuple[Optional[str], Optional[str], Optional[str]]:
	    """
	    Extract a DOI or arXiv ID from the provided text.

	    Patterns are tried in a fixed priority order (doi.org URLs, "doi:" /
	    "DOI:" labels, then arXiv IDs); the first pattern that yields a usable
	    identifier wins.

	    Args:
	        text: The text content extracted from the PDF.
	        verbose: Flag to enable verbose logging.

	    Returns:
	        A tuple containing:
	            - The extracted identifier (DOI or arXiv ID),
	            - The source ('crossref' or 'arxiv'),
	            - The arXiv version string such as 'v2', or '' when absent or
	              when the identifier is a DOI.
	        If no identifier is found, returns (None, None, None).
	    """
	    log("Extracting DOI/arXiv ID from the text", verbose)
	    # A DOI is "10.", a numeric registrant code, "/", then a suffix. Spelling
	    # this out (rather than a bracketed character class, which matches any
	    # run of non-space characters) keeps arbitrary words after "doi:" from
	    # being reported as identifiers.
	    doi_body = r'(10\.\d{4,9}/\S+)'
	    patterns = [
	        (r'https://doi\.org/' + doi_body, 'crossref'),
	        (r'http://dx\.doi\.org/' + doi_body, 'crossref'),
	        (r'doi:\s?' + doi_body, 'crossref'),
	        (r'DOI:\s?' + doi_body, 'crossref'),
	        # New arXiv format: four digits after the dot up to 2014, five since.
	        (r'arXiv:(\d{4}\.\d{4,5})(v\d+)?', 'arxiv'),
	        # Old arXiv format, e.g. hep-th/9901001.
	        (r'arXiv:([a-z-]+/\d{7})(v\d+)?', 'arxiv'),
	    ]
	    for pattern, source in patterns:
	        match = re.search(pattern, text)
	        if not match:
	            continue
	        identifier = match.group(1)
	        arxiv_version = ""
	        if source == "arxiv":
	            arxiv_version = match.group(2) or ""
	        else:
	            identifier = _strip_doi_trailer(identifier)
	            if identifier.endswith('/'):
	                # The suffix was nothing but punctuation; not a real DOI.
	                continue
	        log(f"Found {source.upper()} ID: {identifier}", verbose)
	        return identifier, source, arxiv_version

	    log("DOI/arXiv ID not found.", verbose)
	    return None, None, None

	def _crossref_year(record: Dict) -> str:
	    """Return the publication year of a Crossref work record as a string, or '' if none is usable."""
	    # 'created' is the DOI registration date, which can be years later than
	    # publication, so it is only consulted as a last resort.
	    for field in ('issued', 'published-print', 'published-online', 'created'):
	        date_parts = (record.get(field) or {}).get('date-parts') or []
	        if date_parts and date_parts[0] and date_parts[0][0]:
	            return str(date_parts[0][0])
	    return ''

	def get_author_info(identifier: str, source: str, verbose: bool,
	                    timeout: float = 10.0) -> Tuple[List[Dict[str, str]], Union[str, int], str]:
	    """
	    Retrieve author information and publication year from the appropriate API.

	    Every lookup failure (network error, non-200 status, undecodable or
	    empty response) is logged in verbose mode and reported to the caller as
	    ``([], '', '')`` for both sources.

	    Args:
	        identifier: The DOI or arXiv ID.
	        source: The source, either 'crossref' or 'arxiv'.
	        verbose: Flag to enable verbose logging.
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Returns:
	        A tuple containing:
	            - A list of author dictionaries (Crossref's own records, or
	              {'family', 'given'} dictionaries built from arXiv names),
	            - The publication year (str for Crossref, int for arXiv, '' if unknown),
	            - An unused field (empty string).

	    Raises:
	        SystemExit: If ``source`` is neither 'crossref' nor 'arxiv'.
	    """
	    log(f"Fetching author information from {source.upper()} for ID: {identifier}", verbose)
	    if source == 'crossref':
	        api_name = 'Crossref'
	        url = 'https://api.crossref.org/works/' + identifier
	    elif source == 'arxiv':
	        api_name = 'arXiv'
	        url = f'http://export.arxiv.org/api/query?id_list={identifier}'
	    else:
	        sys.exit("Invalid source. Must be either 'crossref' or 'arxiv'.")

	    try:
	        # Without a timeout a stalled server would hang the whole run.
	        response = requests.get(url, timeout=timeout)
	    except requests.RequestException as e:
	        log(f"Error contacting {api_name} API: {e}", verbose)
	        return [], '', ''

	    if source == 'crossref' and response.status_code == 404:
	        log(f"DOI not found in Crossref database: {identifier}", verbose)
	        return [], '', ''
	    if response.status_code != 200:
	        log(f"Error fetching data from {api_name} API. Status Code: {response.status_code}", verbose)
	        return [], '', ''

	    if source == 'crossref':
	        try:
	            data = response.json().get('message', {})
	        except ValueError as e:
	            log(f"Error decoding Crossref API response: {e}", verbose)
	            return [], '', ''
	        return data.get('author', []), _crossref_year(data), ''

	    feed = feedparser.parse(response.content)
	    if not feed.entries:
	        log("No entries found for this arXiv ID.", verbose)
	        return [], '', ''
	    entry = feed.entries[0]
	    authors = []
	    for author in entry.get('authors', []):
	        name_parts = author.get('name', '').split()
	        if not name_parts:
	            continue
	        # arXiv gives one display name; treat the last token as the family name.
	        authors.append({'family': name_parts[-1], 'given': " ".join(name_parts[:-1])})
	    published = entry.get('published_parsed')
	    year = published.tm_year if published else ''
	    return authors, year, ''

	def format_authors(authors: List[Dict[str, str]], year: Union[str, int], arxiv_version: str,
	                   verbose: bool, underscores: bool = False,
	                   num_authors: Optional[int] = None, prioritize_last: bool = True) -> str:
	    """
	    Format author names and publication year into a new filename.

	    Each author contributes their 'family' value, falling back to 'name'
	    when 'family' is missing or empty; authors with neither are ignored.
	    From the remaining names:
	      * if num_authors is None, the first and last author are used (or the
	        only author when there is just one);
	      * otherwise at most num_authors names are used (values below 1 are
	        treated as 1). With prioritize_last and num_authors > 1 these are
	        the first (num_authors - 1) authors plus the last one; without it,
	        the first num_authors authors in order.

	    Args:
	        authors: List of author dictionaries.
	        year: The publication year; omitted from the result when empty.
	        arxiv_version: The version string from arXiv (if any).
	        verbose: Flag to enable verbose logging.
	        underscores: Flag to insert underscores between names and between name and year.
	        num_authors: The number of authors to include in the filename, or None
	                     for the first-and-last default.
	        prioritize_last: Whether to always include the last author when num_authors > 1.

	    Returns:
	        A string representing the new filename (without file extension),
	        reduced to ASCII and stripped of path separators.

	    Raises:
	        SystemExit: If no author yields a usable name.
	    """
	    log("Formatting author names and publication year", verbose)

	    names = []
	    for author in authors:
	        label = (author.get('family') or author.get('name') or '').strip()
	        if label:
	            names.append(label)
	    if not names:
	        sys.exit("Error: Author information could not be retrieved. File will not be renamed.")

	    # Determine which authors to include based on num_authors and prioritize_last.
	    if num_authors is None:
	        limit, keep_last = 2, True
	    else:
	        limit, keep_last = max(num_authors, 1), prioritize_last

	    if len(names) <= limit:
	        selected = names
	    elif keep_last and limit > 1:
	        selected = names[:limit - 1] + [names[-1]]
	    else:
	        selected = names[:limit]

	    # Build the parts list first so an empty year or version never leaves a
	    # dangling or doubled delimiter behind.
	    parts = list(selected)
	    year_str = str(year) if year else ''
	    if year_str:
	        parts.append(year_str)
	    if arxiv_version:
	        parts.append(f"arXiv{arxiv_version}")
	    delimiter = "_" if underscores else ""
	    formatted_name = delimiter.join(parts)

	    # Normalize to ASCII to avoid encoding issues in filenames; characters
	    # with no ASCII decomposition are dropped.
	    formatted_name = unicodedata.normalize('NFKD', formatted_name).encode('ascii', 'ignore').decode('ascii')
	    # A path separator or NUL inside a name would redirect or break the rename.
	    formatted_name = formatted_name.translate({ord(ch): None for ch in ('/', '\\', '\0')})
	    log(f"Formatted name: {formatted_name}", verbose)
	    return formatted_name

	def rename_pdf(original_pdf: str, new_name: str, verbose: bool, unique: bool = False) -> None:
	    """
	    Move a PDF to ``<new_name>.pdf`` inside its current directory.

	    Nothing happens when the file already carries the target name. When
	    another file occupies the target path, either a free ``_<n>`` suffix is
	    chosen (``unique``) or the user is asked whether to overwrite.

	    Args:
	        original_pdf: The original PDF file path.
	        new_name: The new filename (without extension).
	        verbose: Flag to enable verbose logging.
	        unique: If True, automatically append a numerical suffix to avoid filename conflicts.
	    """
	    # Nesting restored from the statement order of the flattened original.
	    directory, original_name = os.path.split(original_pdf)
	    new_pdf = os.path.join(directory, new_name + ".pdf")

	    if original_name == new_name + ".pdf":
	        print(f"{original_name} is already formatted. No changes made.")
	        return

	    if os.path.isfile(new_pdf):
	        if unique:
	            # Count upwards until an unused "<name>_<n>.pdf" is found.
	            suffix = 1
	            new_pdf = os.path.join(directory, f"{new_name}_{suffix}.pdf")
	            while os.path.isfile(new_pdf):
	                suffix += 1
	                new_pdf = os.path.join(directory, f"{new_name}_{suffix}.pdf")
	        else:
	            overwrite = input(f"{new_pdf} already exists. Overwrite? (y)/(n): ")
	            if overwrite.lower() != 'y':
	                log("Operation cancelled by user.", verbose)
	                return
	    shutil.move(original_pdf, new_pdf)
	    print(f"Renamed '{original_name}' to '{os.path.basename(new_pdf)}'")

	def process_pdf(pdf_file: str, args) -> None:
	    """
	    Run the full pipeline for one PDF: read its text, find a DOI/arXiv ID,
	    derive the new filename and rename the file.

	    The file is skipped (with a message) when no identifier is found. With
	    ``args.doi_based`` the identifier itself becomes the filename; otherwise
	    author metadata is fetched and formatted according to the CLI options.

	    Args:
	        pdf_file: Path of the PDF to process.
	        args: Parsed command-line namespace; reads verbose, doi_based,
	              no_priority_last, all_authors, num_authors, underscores and unique.
	    """
	    # Nesting restored from the statement order of the flattened original.
	    text = extract_text(pdf_file, args.verbose)
	    doi, source, arxiv_version = extract_doi(text, args.verbose)

	    if not doi:
	        print(f"Could not extract DOI/arXiv ID from {pdf_file}. Skipping.")
	        return

	    if args.doi_based:
	        # Replace '/' with '_' to ensure a valid filename.
	        formatted_name = doi.replace('/', '_')
	        log(f"Using DOI-based naming: {formatted_name}", args.verbose)
	    else:
	        authors, year, _ = get_author_info(doi, source, args.verbose)
	        prioritize_last = not args.no_priority_last
	        num_authors = len(authors) if args.all_authors else args.num_authors
	        formatted_name = format_authors(
	            authors, year, arxiv_version, args.verbose,
	            underscores=args.underscores,
	            num_authors=num_authors,
	            prioritize_last=prioritize_last,
	        )

	    rename_pdf(pdf_file, formatted_name, args.verbose, unique=args.unique)

	def main() -> None:
	    """Parse the command line and process each PDF given, in order."""
	    # Nesting restored from the statement order of the flattened original.
	    parser = argparse.ArgumentParser(
	        description='Rename a PDF based on the DOI/arXiv ID and associated author information.'
	    )
	    # Accept one or more PDF files.
	    parser.add_argument('pdf_files', nargs='+', type=str, help='One or more PDF files to process.')
	    parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose logging')
	    parser.add_argument('--doi-based', action='store_true',
	                        help='Use DOI for naming instead of author-year format')
	    parser.add_argument('--underscores', action='store_true',
	                        help='Insert underscores between names and between name and year in the filename')
	    parser.add_argument('--num-authors', type=int, default=2,
	                        help='Specify the number of authors to include in the filename. Default is 2 (first and last author).')
	    parser.add_argument('--no-priority-last', action='store_true',
	                        help='Do not prioritize the last author when selecting authors; use the first N authors instead.')
	    parser.add_argument('--all-authors', action='store_true',
	                        help='Include all author information in the filename')
	    parser.add_argument('--unique', action='store_true',
	                        help='Automatically append a numerical suffix to avoid filename conflicts')
	    args = parser.parse_args()

	    for pdf_file in args.pdf_files:
	        process_pdf(pdf_file, args)

	if __name__ == "__main__":
	    main()