amazingvince · June 28, 2025 21:15 · pszemraj · Jun 29, 2025 · pszemraj · Jun 29, 2025
diff --git a/Build_commands.md b/Build_commands.md
diff --git a/huggingface-upload-script.py b/huggingface-upload-script.py
 #!/usr/bin/env python3
 """
 Upload Survivor Library text files to Hugging Face Hub
 Based on https://gist.github.com/pszemraj/502cf50767e8c363947f5e93cb95cc07
 """

 import os
 import json
 import argparse
 from pathlib import Path
 from typing import Dict, List, Optional
 from datetime import datetime
 import hashlib

 try:
    from huggingface_hub import HfApi, create_repo, upload_folder
    from huggingface_hub.utils import RepositoryNotFoundError
 except ImportError:
    print("Please install huggingface_hub: pip install huggingface-hub")
    exit(1)


 def _size_category(total_files: int) -> str:
    """Return the Hugging Face ``size_categories`` label for a file count.

    Args:
        total_files: Number of text files in the dataset.

    Returns:
        A bucket label such as ``"n<1K"`` or ``"10K<n<100K"``.
    """
    # (exclusive upper bound, label) pairs in ascending order.
    buckets = (
        (10**3, "n<1K"),
        (10**4, "1K<n<10K"),
        (10**5, "10K<n<100K"),
        (10**6, "100K<n<1M"),
        (10**7, "1M<n<10M"),
        (10**8, "10M<n<100M"),
        (10**9, "100M<n<1B"),
        (10**10, "1B<n<10B"),
        (10**11, "10B<n<100B"),
        (10**12, "100B<n<1T"),
    )
    for upper_bound, label in buckets:
        if total_files < upper_bound:
            return label
    return "n>1T"


 def generate_dataset_card(stats: Dict[str, int], total_files: int, total_size: int) -> str:
    """Generate a README.md dataset card for Hugging Face.

    Args:
        stats: Mapping of category name to the number of files in it.
        total_files: Total number of text files across all categories.
        total_size: Combined size of all text files, in bytes.

    Returns:
        The full card text: YAML front matter followed by Markdown. The
        literal placeholder ``YOUR_USERNAME`` is left in the usage and
        citation sections for the caller to substitute.
    """
    # Derive the size bucket from the real file count instead of hard-coding it.
    size_category = _size_category(total_files)

    # One Markdown table row per category, most populated categories first.
    table_rows = "\n".join(
        f"| {cat} | {count:,} |"
        for cat, count in sorted(stats.items(), key=lambda x: x[1], reverse=True)
    )

    return f"""---
 language:
 - en
 license: cc0-1.0
 task_categories:
 - text-generation
 - text2text-generation
 tags:
 - survival
 - historical
 - knowledge
 - library
 - public-domain
 pretty_name: Survivor Library Text Corpus
 size_categories:
 - {size_category}
 ---

 # Survivor Library Text Corpus

 This dataset contains OCR-extracted text from the [Survivor Library](https://www.survivorlibrary.com/), 
 a collection of public domain books focused on practical knowledge and skills from the pre-industrial 
 and early industrial era.

 ## Dataset Description

 The Survivor Library is a collection of books that would be useful in rebuilding civilization after 
 a catastrophic event. It focuses on practical, hands-on knowledge from the 1800s and early 1900s.

 ### Dataset Summary

 - **Total Files**: {total_files:,}
 - **Total Size**: {total_size / (1024**3):.2f} GB
 - **Categories**: {len(stats)}
 - **Source**: [survivorlibrary.com](https://www.survivorlibrary.com/)
 - **Processing Date**: {datetime.now().strftime('%Y-%m-%d')}

 ### Categories and File Counts

 | Category | Files |
 |----------|-------|
 """ + table_rows + """

 ## Dataset Structure

 The dataset is organized by category, with each category containing text files extracted from PDFs:

 ```
 text_outputs/
 ├── Accounting/
 │   ├── book1.txt
 │   ├── book2.txt
 │   └── ...
 ├── Agriculture/
 │   └── ...
 └── ...
 ```

 ## Data Processing

 1. Original PDFs downloaded from survivorlibrary.com
 2. Text extracted using OCR/PDF text extraction
 3. Organized by category
 4. Uploaded to Hugging Face Hub

 ## Usage

 ```python
 from datasets import load_dataset

 # Load the entire dataset
 dataset = load_dataset("YOUR_USERNAME/survivor-library-text")

 # Load specific categories
 accounting_texts = dataset.filter(lambda x: x['category'] == 'Accounting')
 ```

 ## Considerations

 - These texts are historical and may contain outdated or potentially dangerous information
 - Always verify information with modern sources before practical application
 - Some OCR errors may be present in the extracted text
 - Original formatting may not be perfectly preserved

 ## License

 The original books are in the public domain. This dataset compilation is released under CC0 1.0 Universal.

 ## Citation

 If you use this dataset, please cite:

 ```bibtex
 @misc{survivor_library_text,
  title={Survivor Library Text Corpus},
  author={Survivor Library},
  year={2024},
  publisher={Hugging Face},
  url={https://huggingface.co/datasets/YOUR_USERNAME/survivor-library-text}
 }
 ```
 """


 def create_metadata_file(text_dir: Path) -> Dict:
    """Build the metadata dictionary describing every text file under *text_dir*.

    Only ``*.txt`` files that sit directly inside a first-level subdirectory
    (one subdirectory per category) are recorded; files at the root of
    *text_dir* and non-``.txt`` files are ignored. Nothing is written to disk
    here - the caller decides where to save the result.

    Args:
        text_dir: Root directory containing one subdirectory per category.

    Returns:
        A dict with dataset-level fields (``dataset_name``, ``version``,
        ``created``), a ``categories`` mapping of category name to its file
        count and file names, a flat ``files`` list (filename, category,
        relative path, size and MD5 per file), plus ``total_files`` and
        ``total_size_bytes``.
    """
    metadata = {
        "dataset_name": "Survivor Library Text Corpus",
        "version": "1.0.0",
        "created": datetime.now().isoformat(),
        "categories": {},
        "files": []
    }

    total_size = 0
    # Hash in fixed-size chunks so a large file is never held in memory whole.
    chunk_size = 1024 * 1024

    for category_dir in sorted(text_dir.iterdir()):
        if not category_dir.is_dir():
            continue

        category_name = category_dir.name
        category_files = []

        for text_file in sorted(category_dir.glob("*.txt")):
            file_size = text_file.stat().st_size
            total_size += file_size

            # Calculate file hash for integrity
            digest = hashlib.md5()
            with open(text_file, 'rb') as f:
                for chunk in iter(lambda: f.read(chunk_size), b""):
                    digest.update(chunk)

            metadata["files"].append({
                "filename": text_file.name,
                "category": category_name,
                # Forward slashes on every platform, so the recorded path
                # does not depend on the OS that generated the metadata.
                "path": text_file.relative_to(text_dir).as_posix(),
                "size_bytes": file_size,
                "md5": digest.hexdigest()
            })
            category_files.append(text_file.name)

        metadata["categories"][category_name] = {
            "file_count": len(category_files),
            "files": category_files
        }

    metadata["total_files"] = len(metadata["files"])
    metadata["total_size_bytes"] = total_size

    return metadata


 def upload_to_huggingface(
    text_dir: str,
    repo_name: str,
    username: Optional[str] = None,
    private: bool = False,
    token: Optional[str] = None
 ):
    """Upload text files to Hugging Face Hub.

    Creates the dataset repository when it does not exist yet, writes a
    temporary ``metadata.json`` and ``README.md`` (dataset card) into
    *text_dir*, uploads the whole folder, and finally removes the two
    generated files again.

    Args:
        text_dir: Directory containing the per-category text files.
        repo_name: Name of the dataset repository (without the owner prefix).
        username: Repository owner; looked up via ``whoami`` when None.
        private: Whether a newly created repository should be private.
        token: Hugging Face API token.

    Raises:
        ValueError: If *text_dir* does not exist.
        Exception: Any error raised during the upload is printed and re-raised.
    """

    text_path = Path(text_dir)
    if not text_path.exists():
        raise ValueError(f"Directory {text_dir} does not exist")

    # Initialize HF API
    api = HfApi(token=token)

    # Get username if not provided
    if username is None:
        user_info = api.whoami()
        username = user_info["name"]

    repo_id = f"{username}/{repo_name}"

    print(f"Uploading to: {repo_id}")

    # Create repository if it doesn't exist
    try:
        api.repo_info(repo_id=repo_id, repo_type="dataset")
        print(f"Repository {repo_id} already exists")
    except RepositoryNotFoundError:
        print(f"Creating repository: {repo_id}")
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=private,
            token=token
        )

    # NOTE(review): both files are written into the upload folder and deleted
    # afterwards, so a pre-existing README.md / metadata.json there is lost.
    metadata_path = text_path / "metadata.json"
    readme_path = text_path / "README.md"

    # The cleanup in ``finally`` covers generation as well as upload, so a
    # failure halfway through never leaves a stray generated file behind.
    try:
        # Generate metadata
        print("Generating metadata...")
        metadata = create_metadata_file(text_path)

        # Save metadata
        with open(metadata_path, 'w', encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        # Calculate statistics
        stats = {cat: info["file_count"] for cat, info in metadata["categories"].items()}
        total_files = metadata["total_files"]
        total_size = metadata["total_size_bytes"]

        # Generate and save dataset card
        print("Generating dataset card...")
        dataset_card = generate_dataset_card(stats, total_files, total_size)
        dataset_card = dataset_card.replace("YOUR_USERNAME", username)

        # Explicit UTF-8: the card contains non-ASCII box-drawing characters
        # that a platform default encoding may be unable to represent.
        with open(readme_path, 'w', encoding="utf-8") as f:
            f.write(dataset_card)

        # Upload everything
        print(f"Uploading {total_files} files ({total_size / (1024**3):.2f} GB)...")

        try:
            upload_folder(
                folder_path=str(text_path),
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
                commit_message=f"Upload Survivor Library text corpus - {total_files} files",
                ignore_patterns=["*.pdf", "*.zip", "__pycache__", ".DS_Store"]
            )

            print(f"\n✅ Successfully uploaded to: https://huggingface.co/datasets/{repo_id}")

        except Exception as e:
            print(f"\n❌ Error uploading: {str(e)}")
            raise

    finally:
        # Clean up generated files
        if metadata_path.exists():
            metadata_path.unlink()
        if readme_path.exists():
            readme_path.unlink()


 def main():
    """Parse command-line arguments and run the upload.

    The API token is taken from ``--token`` or, failing that, from the
    ``HF_TOKEN`` environment variable. When neither is set, a hint is printed
    and the process exits with status 1.

    Raises:
        SystemExit: With code 1 when no token is available.
    """
    parser = argparse.ArgumentParser(description="Upload Survivor Library texts to Hugging Face")
    parser.add_argument("--text-dir", type=str, default="text_outputs",
                        help="Directory containing the text files")
    parser.add_argument("--repo-name", type=str, default="survivor-library-text",
                        help="Name of the Hugging Face dataset repository")
    parser.add_argument("--username", type=str, default=None,
                        help="Hugging Face username (defaults to logged-in user)")
    parser.add_argument("--private", action="store_true",
                        help="Make the dataset private")
    parser.add_argument("--token", type=str, default=None,
                        help="Hugging Face API token (or set HF_TOKEN env var)")

    args = parser.parse_args()

    # Get token from environment if not provided
    token = args.token or os.environ.get("HF_TOKEN")

    if not token:
        print("Please provide a Hugging Face token via --token or HF_TOKEN environment variable")
        print("You can get a token from: https://huggingface.co/settings/tokens")
        # The ``exit`` builtin is injected by the ``site`` module for
        # interactive use and may be missing; SystemExit is always available
        # and yields the same exit status.
        raise SystemExit(1)

    upload_to_huggingface(
        text_dir=args.text_dir,
        repo_name=args.repo_name,
        username=args.username,
        private=args.private,
        token=token
    )


 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()
diff --git a/main.py b/main.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 Standalone Asynchronous Nanonets-OCR-s Inference Script using vLLM and PyMuPDF.

 This script processes PDF files from an input directory using the
 nanonets/Nanonets-OCR-s model served locally by vLLM via its OpenAI-compatible API.
 It renders each page, sends API requests concurrently for OCR, extracts the
 structured markdown/HTML text, and saves the combined text for each PDF into a
 corresponding .md file in the specified output directory.

 This version uses asyncio and the AsyncOpenAI client to significantly speed up
 processing by sending multiple page OCR requests to the vLLM server concurrently.

 **IMPORTANT:** Requires a separate vLLM server running with the Nanonets-OCR-s model.
 Start the server BEFORE running this script, for example:

   vllm serve nanonets/Nanonets-OCR-s --max-num-seqs 256 --gpu-memory-utilization 0.9

 Dependencies (vLLM - see vLLM docs for specific CUDA versions):
    pip install ninja vllm flash-attn

 Dependencies (Script):
    pip install "openai>=1.0" PyMuPDF Pillow fire tqdm pypdf "tqdm[asyncio]" joblib

 Example Usage:
  # 1. Start the vLLM server in a separate terminal:
  #    vllm serve nanonets/Nanonets-OCR-s

  # 2. Run this script:
  python main.py \
    --input_dir ./my_pdfs \
    --output_dir ./output_text \
    --model_id nanonets/Nanonets-OCR-s \
    --max_pages 100 \
    --overwrite \
    --api_base_url http://localhost:8000/v1 \
    --concurrency_limit 16
 """

 import asyncio
 import base64
 import io
 import logging
 import os
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

 import fire

 # REMOVED: mdformat is no longer needed as Nanonets produces structured output.
 # import mdformat
 from joblib import Parallel, delayed
 from PIL import Image
 from pypdf import PdfReader
 from pypdf.errors import PdfReadError
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio

 try:
    from openai import APIConnectionError, APIStatusError, AsyncOpenAI, RateLimitError
 except ImportError:
    print("=" * 80)
    print("ERROR: openai library >= 1.0 not found.")
    print("Please install it: pip install 'openai>=1.0'")
    print("=" * 80)
    exit(1)

 try:
    import fitz  # PyMuPDF
 except ImportError:
    print("=" * 80)
    print("ERROR: PyMuPDF library not found.")
    print("Please install it: pip install PyMuPDF")
    print("=" * 80)
    exit(1)


 # --- Configuration ---
 # Root logging setup: INFO level; each record carries a timestamp, the level
 # and the name of the function that emitted it.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(funcName)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
 )
 # Module-level logger used by the functions below.
 logger = logging.getLogger(__name__)

 # Reduce noise from underlying libraries
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("openai").setLevel(logging.WARNING)
 logging.getLogger("httpcore").setLevel(logging.WARNING)

 # --- CHANGED: Updated model ID, prompt, and default parameters for Nanonets-OCR-s ---
 # Default model identifier sent with every API request.
 DEFAULT_MODEL_ID: str = "nanonets/Nanonets-OCR-s"
 # Instruction text sent together with each page image.
 NANONETS_PROMPT: str = (
    "Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."
 )
 # Default target for the longest side of a rendered page image.
 DEFAULT_TARGET_IMAGE_DIM: int = 1024
 # Default base URL of the OpenAI-compatible endpoint.
 DEFAULT_API_BASE_URL: str = "http://localhost:8000/v1"
 # Default API key passed to the client.
 DEFAULT_API_KEY: str = "EMPTY"
 # Default cap on simultaneous OCR requests.
 DEFAULT_CONCURRENCY_LIMIT: int = 16
 # Default generation budget (max_tokens) for a single page.
 DEFAULT_MAX_TOKENS_PER_PAGE: int = 10000
 # Default sampling temperature for the OCR requests.
 DEFAULT_TEMPERATURE: float = 0.0
 # Default frequency penalty for the OCR requests.
 DEFAULT_FREQ_PENALTY: float = 0.1


 def render_pdf_page_to_pil_fitz(
    pdf_path: Path,
    page_num: int,
    target_longest_image_dim: int = DEFAULT_TARGET_IMAGE_DIM,
 ) -> Optional[Image.Image]:
    """
    Renders a single page of a PDF to a PIL Image using PyMuPDF (fitz).

    The page is scaled down so that its longest side equals
    target_longest_image_dim; pages that are already smaller are rendered at
    their native size (never upscaled).

    NOTE(review): the scale is derived from ``page.rect``, presumably measured
    in PDF points, so pages smaller than the target render at zoom 1.0 -
    confirm that resolution is adequate for OCR.

    Args:
        pdf_path: Path to the PDF file.
        page_num: The 1-based page number to render.
        target_longest_image_dim: Target size for the longest dimension.

    Returns:
        A PIL Image object of the rendered page, or None if rendering fails
        (every failure is logged, none is raised).
    """
    doc: Optional[fitz.Document] = None
    try:
        doc = fitz.open(pdf_path)
        if not 0 < page_num <= doc.page_count:
            logger.error(
                f"Invalid page number {page_num} for {pdf_path.name} "
                f"({doc.page_count} pages)."
            )
            return None

        page = doc.load_page(page_num - 1)  # fitz uses 0-based index
        page_rect = page.rect
        width, height = page_rect.width, page_rect.height
        longest_side = max(width, height)

        if longest_side <= 0:
            logger.error(
                f"Invalid page dimensions ({width}x{height}) for "
                f"{pdf_path.name} page {page_num}."
            )
            return None

        # Only shrink: a page already within the target keeps zoom 1.0.
        zoom_factor: float = 1.0
        if longest_side > target_longest_image_dim:
            zoom_factor = target_longest_image_dim / longest_side

        matrix = fitz.Matrix(zoom_factor, zoom_factor)
        pix = page.get_pixmap(matrix=matrix, alpha=False)

        if pix.width == 0 or pix.height == 0:
            logger.error(
                f"Rendered pixmap has zero dimension for {pdf_path.name} "
                f"page {page_num}."
            )
            return None

        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    except Exception as e:
        # Resolve PyMuPDF's own FileNotFoundError defensively: naming
        # ``fitz.fitz.FileNotFoundError`` in an ``except`` clause raises
        # AttributeError at handling time on builds without that submodule.
        missing_file_errors = (
            FileNotFoundError,
            getattr(fitz, "FileNotFoundError", FileNotFoundError),
        )
        if isinstance(e, missing_file_errors):
            logger.error(f"PyMuPDF could not find file: {pdf_path}")
        else:
            logger.error(
                f"PyMuPDF error rendering {pdf_path.name} page {page_num}: "
                f"{type(e).__name__} - {e}"
            )
        return None
    finally:
        # Compare against None explicitly: relying on the document's
        # truthiness could skip the close for a document with zero pages.
        if doc is not None:
            try:
                doc.close()
            except Exception as e:
                logger.warning(f"Error closing PDF {pdf_path.name}: {e}")


 def get_pdf_page_count(pdf_path: Path) -> Optional[int]:
    """
    Counts the pages of a PDF, asking pypdf first and PyMuPDF (fitz) second.

    fitz is consulted in two situations: when pypdf raises PdfReadError, and
    when pypdf reports zero pages.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The page count; 0 when pypdf reports no pages and fitz cannot open
        the file; None when the count could not be determined at all.
    """

    def _count_with_fitz() -> int:
        # Second opinion from PyMuPDF; any exception propagates to the caller.
        with fitz.open(pdf_path) as document:
            return document.page_count

    try:
        page_total = len(PdfReader(pdf_path, strict=False).pages)
    except PdfReadError as e:
        logger.error(f"pypdf failed to read {pdf_path.name}: {e}. Trying fitz.")
        try:
            return _count_with_fitz()
        except Exception as fitz_e:
            logger.error(
                f"Both pypdf and fitz failed page count for {pdf_path.name}: {fitz_e}"
            )
            return None
    except FileNotFoundError:
        logger.error(f"File not found for page count: {pdf_path}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error getting page count for {pdf_path.name}: {e}")
        return None

    if page_total != 0:
        return page_total

    # pypdf saw an empty document: double-check with fitz before believing it.
    try:
        return _count_with_fitz()
    except Exception:
        logger.warning(
            f"pypdf reported 0 pages, fitz failed to open "
            f"{pdf_path.name}. Assuming 0 pages."
        )
        return 0


 def encode_pil_to_base64(image: Image.Image, format: str = "PNG") -> str:
    """
    Serializes a PIL image into an in-memory buffer and base64-encodes it.

    Args:
        image: The PIL Image object to serialize.
        format: Image format handed to ``image.save`` (e.g., "PNG", "JPEG").

    Returns:
        The base64 text (decoded as UTF-8) of the serialized image bytes.
    """
    sink = io.BytesIO()
    image.save(sink, format=format)
    return base64.b64encode(sink.getvalue()).decode("utf-8")


 async def ocr_page_api(
    client: AsyncOpenAI,
    model_id: str,
    img_base64: str,
    page_num: int,
    pdf_name: str,
    semaphore: asyncio.Semaphore,
    temperature: float = DEFAULT_TEMPERATURE,
    max_tokens: int = DEFAULT_MAX_TOKENS_PER_PAGE,
    frequency_penalty: float = DEFAULT_FREQ_PENALTY,
 ) -> str:
    """
    Runs OCR on one page image through the chat-completions API.

    The request is issued while holding ``semaphore``, which bounds how many
    requests are in flight at once. API failures are never raised; each one
    is logged and reported to the caller as a bracketed marker string.

    Args:
        client: The initialized AsyncOpenAI client.
        model_id: The model identifier for the API call.
        img_base64: Base64 text of the page image (sent as a PNG data URL).
        page_num: The 1-based page number (used in log messages only).
        pdf_name: The name of the PDF file (used in log messages only).
        semaphore: The asyncio.Semaphore that limits concurrent requests.
        temperature: Sampling temperature for the model.
        max_tokens: Maximum tokens to generate for the page.
        frequency_penalty: Frequency penalty forwarded to the API.

    Returns:
        The stripped text returned by the model, "[API_EMPTY_RESPONSE]" when
        the model returned no content, or an error marker such as
        "[API_CONNECTION_ERROR]" when the call failed.
    """
    async with semaphore:  # Hold a slot for the whole request
        # One user message: the page image first, then the OCR instructions.
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img_base64}"},
        }
        prompt_part = {"type": "text", "text": NANONETS_PROMPT}
        user_message = {"role": "user", "content": [image_part, prompt_part]}

        try:
            response = await client.chat.completions.create(
                model=model_id,
                messages=[user_message],
                temperature=temperature,
                max_tokens=max_tokens,
                frequency_penalty=frequency_penalty,
            )
            page_text = response.choices[0].message.content
            if not page_text:
                return "[API_EMPTY_RESPONSE]"
            return page_text.strip()
        except APIConnectionError as e:
            logger.error(
                f"API Connect Error page {page_num} ({pdf_name}): {e}. "
                f"Is server at {client.base_url} running?"
            )
            return "[API_CONNECTION_ERROR]"
        except RateLimitError as e:
            logger.warning(
                f"API Rate Limit Error page {page_num} ({pdf_name}): {e}. "
                f"Server busy or concurrency too high? Retrying may be needed."
            )
            # Brief pause while still holding the slot; the page is not retried.
            await asyncio.sleep(2)
            return "[API_RATE_LIMIT_ERROR]"
        except APIStatusError as e:
            logger.error(
                f"API Status Error page {page_num} ({pdf_name}): "
                f"Status={e.status_code}, Response={e.response}"
            )
            return f"[API_STATUS_ERROR_{e.status_code}]"
        except Exception as e:
            logger.exception(f"Unexpected API Error page {page_num} ({pdf_name}): {e}")
            return "[API_UNEXPECTED_ERROR]"


 def render_and_encode_single_page(
    pdf_file: Path, page_num: int, target_image_dim: int, pdf_name: str
 ) -> tuple:
    """
    Renders one PDF page and base64-encodes it in a single call, so the pair
    of steps can be dispatched as one unit of parallel work.

    Args:
        pdf_file: Path to the PDF file
        page_num: Page number to render (1-based)
        target_image_dim: Target size for longest dimension
        pdf_name: Name of PDF file (used in log messages only)

    Returns:
        tuple: (page_num, payload), where payload is the base64 string on
        success, "[PAGE_RENDER_ERROR]" when rendering failed, or
        "[IMAGE_ENCODE_ERROR]" when encoding failed.
    """
    rendered = render_pdf_page_to_pil_fitz(pdf_file, page_num, target_image_dim)

    if rendered:
        try:
            encoded = encode_pil_to_base64(rendered)
        except Exception as e:
            logger.error(f"Failed to encode page {page_num} ({pdf_name}): {e}")
            return page_num, "[IMAGE_ENCODE_ERROR]"
        return page_num, encoded

    logger.warning(f"Failed to render page {page_num} ({pdf_name})")
    return page_num, "[PAGE_RENDER_ERROR]"


 # --- Main Processing Logic ---


 async def process_directory(
    input_dir: str,
    output_dir: Optional[str] = None,
    model_id: str = DEFAULT_MODEL_ID,
    api_base_url: str = DEFAULT_API_BASE_URL,
    api_key: str = DEFAULT_API_KEY,
    target_image_dim: int = DEFAULT_TARGET_IMAGE_DIM,
    max_pages: Optional[int] = None,
    temperature: float = DEFAULT_TEMPERATURE,
    max_tokens_per_page: int = DEFAULT_MAX_TOKENS_PER_PAGE,
    overwrite: bool = False,
    concurrency_limit: int = DEFAULT_CONCURRENCY_LIMIT,
 ) -> None:
    """
    Processes PDF files asynchronously using Nanonets-OCR-s via vLLM's OpenAI API.

    For every ``*.pdf`` directly inside input_dir: renders and encodes the
    pages in parallel, sends one concurrent OCR request per rendered page,
    joins the per-page results in page order with a form-feed separator and
    writes them to ``<pdf stem>.md`` in the output directory.

    Args:
        input_dir: Path to the directory containing input PDF files.
        output_dir: Path to the directory for output .md files. If None, a
                    sibling of input_dir named
                    ``output-pdftotext-<input dir name>`` is used.
        model_id: Model ID for the vLLM server API.
        api_base_url: Base URL of the vLLM OpenAI-compatible API endpoint.
        api_key: API key for the endpoint (usually 'EMPTY' for local vLLM).
        target_image_dim: Target size for the longest dimension of page images.
        max_pages: Max pages to process per PDF (None for all pages).
        temperature: Sampling temperature for the model (0.0 recommended for Nanonets).
        max_tokens_per_page: Max tokens the model can generate per page.
        overwrite: If True, overwrite existing output files.
        concurrency_limit: Maximum number of concurrent API requests.

    Raises:
        AssertionError: If input_dir is not an existing directory. Errors
            raised after that point are logged rather than propagated.
    """
    input_path = Path(input_dir).resolve()
    # NOTE(review): this check disappears under ``python -O``; kept as an
    # assert so the exception type seen by existing callers does not change.
    assert (
        input_path.is_dir()
    ), f"Input directory not found or is not a directory: {input_path}"

    output_path = (
        Path(output_dir).resolve()
        if output_dir is not None
        else input_path.parent / f"output-pdftotext-{input_path.name}"
    )
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Input directory:  {input_path}")
    logger.info(f"Output directory: {output_path}")
    logger.info(f"Model API:        {model_id} at {api_base_url}")
    logger.info(f"Concurrency:      {concurrency_limit}")
    logger.info(f"Target Image Dim: {target_image_dim}")
    logger.info(f"Overwrite:        {overwrite}")

    # Matches a page result that consists solely of a bracketed marker such as
    # "[PAGE_RENDER_ERROR]". Compiled once instead of once per PDF.
    error_marker_pattern = re.compile(r"^\s*\[[A-Z0-9_]+\]\s*$")

    client: Optional[AsyncOpenAI] = None
    try:
        client = AsyncOpenAI(api_key=api_key, base_url=api_base_url)
        logger.info(f"AsyncOpenAI client initialized for {api_base_url}")

        pdf_files: List[Path] = sorted(input_path.glob("*.pdf"))
        if not pdf_files:
            logger.warning(f"No PDF files found in {input_path}")
            return
        logger.info(f"Found {len(pdf_files)} PDF files.")

        # A single semaphore shared by all PDFs caps in-flight API requests.
        semaphore = asyncio.Semaphore(concurrency_limit)

        for pdf_file in tqdm(
            pdf_files, desc="Processing PDFs", unit="pdf", mininterval=1.0
        ):
            output_txt_path = output_path / (pdf_file.stem + ".md")

            if not overwrite and output_txt_path.exists():
                logger.info(f"Skipping {pdf_file.name}, output exists.")
                continue

            logger.info(f"Starting processing for {pdf_file.name}")

            page_count = get_pdf_page_count(pdf_file)
            if page_count is None:
                logger.warning(f"Skipping {pdf_file.name}, failed to get page count.")
                output_txt_path.write_text("[ERROR_READING_PDF]", encoding="utf-8")
                continue
            if page_count == 0:
                logger.warning(f"Skipping {pdf_file.name}, contains 0 pages.")
                output_txt_path.write_text("", encoding="utf-8")  # Empty file
                continue

            num_pages_to_process = page_count
            if max_pages is not None and 0 < max_pages < page_count:
                logger.info(f"Limiting to first {max_pages} pages of {pdf_file.name}")
                num_pages_to_process = max_pages

            # --- Preprocessing: Render and Encode Pages ---
            logger.debug(
                f"Rendering/encoding {num_pages_to_process} pages for {pdf_file.name} in parallel"
            )
            n_jobs = min(64, os.cpu_count() or 1)
            logger.info(f"Using {n_jobs} cores for parallel page rendering")
            parallel_results = Parallel(
                n_jobs=n_jobs, verbose=0
            )(  # Set verbose=0 to avoid clutter
                delayed(render_and_encode_single_page)(
                    pdf_file, page_num, target_image_dim, pdf_file.name
                )
                for page_num in range(1, num_pages_to_process + 1)
            )

            # page_num -> base64 payload, or a "[...]" marker when rendering
            # or encoding failed.
            page_render_encode_data: Dict[int, str] = {}
            valid_pages_for_api = 0
            for page_num, result in parallel_results:
                page_render_encode_data[page_num] = result
                if not result.startswith("["):
                    valid_pages_for_api += 1

            all_page_texts: Dict[int, str]
            if valid_pages_for_api == 0:
                logger.warning(
                    f"No pages successfully rendered/encoded for {pdf_file.name}. "
                    "Skipping API calls."
                )
                # Every page failed locally: carry the markers through as-is.
                all_page_texts = dict(page_render_encode_data)
            else:
                # --- Asynchronous API Calls ---
                tasks: List[Tuple[int, asyncio.Task[str]]] = []
                logger.info(
                    f"Submitting {valid_pages_for_api} pages to API for {pdf_file.name}"
                )
                for page_num in range(1, num_pages_to_process + 1):
                    img_data = page_render_encode_data.get(page_num)
                    if img_data and not img_data.startswith("["):
                        task = asyncio.create_task(
                            ocr_page_api(
                                client=client,
                                model_id=model_id,
                                img_base64=img_data,
                                page_num=page_num,
                                pdf_name=pdf_file.name,
                                semaphore=semaphore,
                                temperature=temperature,
                                max_tokens=max_tokens_per_page,
                            ),
                            name=f"OCR_{pdf_file.stem}_p{page_num}",
                        )
                        tasks.append((page_num, task))

                api_results: List[str] = await tqdm_asyncio.gather(
                    *(task for _, task in tasks),
                    desc=f"  OCR Pages ({pdf_file.name[:20]})",
                    unit="page",
                    leave=False,
                    mininterval=5.0,  # Update every 5 seconds max
                )

                # --- Combine Results ---
                # Local failures keep their marker; submitted pages take the
                # API result (results come back in task order).
                all_page_texts = {
                    pn: data
                    for pn, data in page_render_encode_data.items()
                    if data.startswith("[")
                }
                for (page_num, _), api_text in zip(tasks, api_results):
                    all_page_texts[page_num] = api_text

            if not all_page_texts:
                logger.warning(f"No text results generated for {pdf_file.name}.")
                output_txt_path.write_text("", encoding="utf-8")
                continue

            ordered_texts: List[str] = [
                all_page_texts.get(pn, f"[PAGE_{pn}_MISSING_UNEXPECTEDLY]")
                for pn in range(1, num_pages_to_process + 1)
            ]

            # A PDF whose pages are all blank or error markers yields an
            # empty output file.
            has_usable_text = any(
                text.strip() and not error_marker_pattern.match(text.strip())
                for text in ordered_texts
            )
            if not has_usable_text:
                logger.warning(f"All pages were filtered out for {pdf_file.name}.")
                output_txt_path.write_text("", encoding="utf-8")
                continue

            # Use form feed character (\f) as page separator.
            # NOTE(review): the join covers every page, so error markers of
            # individual failed pages end up in the output (which keeps page
            # positions aligned) - confirm this is intended.
            final_text: str = "\n\f\n".join(ordered_texts)

            try:
                output_txt_path.write_text(final_text, encoding="utf-8")
                logger.info(f"Successfully wrote output: {output_txt_path.name}")
            except Exception as e:
                logger.error(f"Failed to write output file {output_txt_path}: {e}")

    except Exception as e:
        logger.exception(f"An unexpected error occurred during processing: {e}")
    finally:
        if client:
            await client.close()
            logger.info("AsyncOpenAI client closed.")
        logger.info("Processing run finished.")


 def main(**kwargs: Any) -> None:
    """
    Synchronous command-line wrapper around the async `process_directory`.

    The fire library turns command-line flags into keyword arguments, so any
    parameter of `process_directory` can be supplied on the command line,
    e.g., `--input_dir ./pdfs --max_pages 5`. A Ctrl-C is logged instead of
    surfacing as a traceback.

    Args:
        **kwargs: Arguments passed from the command line via fire.
    """
    pipeline = process_directory(**kwargs)
    try:
        asyncio.run(pipeline)
    except KeyboardInterrupt:
        logger.info("Processing interrupted by user.")


 # When executed as a script, hand `main` to fire, which builds the
 # command-line interface from it.
 if __name__ == "__main__":
    fire.Fire(main)
diff --git a/survivor-library-downloader.sh b/survivor-library-downloader.sh
 #!/bin/bash

 # Survivor Library Bulk Download and Process Script
 # This script downloads all zip files from survivorlibrary.com, extracts PDFs, and converts them to text

 # Configuration
 BASE_URL="https://www.survivorlibrary.com/library"
 OUTPUT_BASE_DIR="text_outputs"
 TEMP_BASE_DIR="temp_pdfs"
 LOG_FILE="survivor_library_download.log"
 COMPLETED_FILE="survivor_library_completed.txt"
 MAX_PARALLEL_DOWNLOADS=2
 PARALLEL_CONNECTIONS=2  # Number of connections per file for pget
 RETRY_ATTEMPTS=3
 RETRY_DELAY=5
 USE_PGET=true  # Set to false to use wget instead

 # Log function: prints a timestamped message to stdout and appends it to
 # $LOG_FILE. It must be defined before its first use - the .hf_config
 # loader below calls it, and bash resolves function names at call time.
 log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
 }

 # Load Hugging Face configuration if exists.
 # It is sourced after the defaults above, so it may override them.
 if [ -f ".hf_config" ]; then
    source .hf_config
    log "Loaded Hugging Face configuration from .hf_config"
 fi


 # Create necessary directories
 mkdir -p "$OUTPUT_BASE_DIR"
 mkdir -p "$TEMP_BASE_DIR"

 # Initialize completed file if it doesn't exist
 touch "$COMPLETED_FILE"

 # Function to check if category is already completed.
 # Succeeds (exit status 0) when a line of $COMPLETED_FILE equals the category
 # name exactly. -F treats the name as a literal string rather than a regular
 # expression, -x requires the whole line to match, and -- keeps a name that
 # starts with "-" from being parsed as a grep option.
 is_completed() {
    local category="$1"
    grep -Fxq -- "$category" "$COMPLETED_FILE"
 }

 # Function to mark category as completed.
 # Appends the category name as one line to $COMPLETED_FILE. printf is used
 # instead of echo so a name that looks like an echo option (for example "-n")
 # is still written verbatim.
 mark_completed() {
    local category="$1"
    printf '%s\n' "$category" >> "$COMPLETED_FILE"
 }

 # Fetch a URL into a local file, retrying on failure.
 #   $1 - URL to download
 #   $2 - destination path
 # Makes up to $RETRY_ATTEMPTS tries with pget (when USE_PGET is "true") or
 # wget otherwise, pausing $RETRY_DELAY seconds between tries. The downloader's
 # stderr is appended to $LOG_FILE. Returns 0 on the first success, 1 once all
 # tries have failed.
 download_with_retry() {
    local url="$1"
    local output="$2"
    local attempt

    for ((attempt = 1; attempt <= RETRY_ATTEMPTS; attempt++)); do
        case "$USE_PGET" in
            true)
                pget -p "$PARALLEL_CONNECTIONS" -o "$output" "$url" 2>>"$LOG_FILE" && return 0
                ;;
            *)
                wget -q --show-progress -O "$output" "$url" 2>>"$LOG_FILE" && return 0
                ;;
        esac

        # Only announce a retry when another attempt will actually follow.
        if ((attempt < RETRY_ATTEMPTS)); then
            log "Download failed, retrying in ${RETRY_DELAY} seconds... (Attempt $attempt/$RETRY_ATTEMPTS)"
            sleep $RETRY_DELAY
        fi
    done

    return 1
 }

 # List of all categories from the Survivor Library
 CATEGORIES=(
    "Accounting"
    "Aeroplanes"
    "Airships"
    "Archery"
    "Architecture"
    "Astronomy"
    "Baking"
    "Banking"
    "Basketry"
    "Bee_Journal_American"
    "Bee_Journal_British"
    "Beekeeping"
    "Berries"
    "Boilermaker"
    "Bookbinding"
    "Books_for_Boys_and_Girls"
    "Books_for_Young_Children"
    "Botany"
    "Boy_Scout_Manuals"
    "Brewing_and_Distilling"
    "Bridges_and_Dams"
    "Butchering"
    "Canning"
    "Cheese_and_Butter"
    "Chemistry"
    "Christmas"
    "Clockmaking"
    "Coal_and_Mining"
    "Coffee_and_Tea"
    "Concrete"
    "Conduct_of_Life"
    "Construction"
    "Cooking_and_Cookbooks"
    "Cotton"
    "Cycles_Bi_Tri_Motor"
    "Dentistry"
    "Dogs"
    "Drilling"
    "Economics"
    "Embalming"
    "Encyclopedias"
    "Engineering_Drainage"
    "Engineering_Electrical"
    "Engineering_General"
    "Engineering_Hydraulics"
    "Engraving_and_Woodcuts"
    "Ethics"
    "Farming"
    "Farming_Corn"
    "Farming_Fish"
    "Farming_Potato_and_Sweet_Potato"
    "Firearms_Books"
    "Firearms_Manuals"
    "Fishing"
    "Food"
    "Forestry"
    "Forging_and_Casting"
    "Formulas"
    "Fuels"
    "Geodesy"
    "Geography"
    "Glassmaking"
    "Grapes_Wine_Raisins"
    "Great_Books"
    "Gunpowder_and_Explosives"
    "Hatmaking"
    "Heating"
    "Heavy_Industrial_Machinery"
    "Hemp_and_Flax"
    "Herbalism"
    "History_American"
    "Home_Economics"
    "Horses"
    "Journalism"
    "Knitting_Lace_Needlepoint"
    "Laundry"
    "Law"
    "Leather"
    "Leisure_Games_and_Sports"
    "Leisure_Recreation_Magazine"
    "Leisure_Whist"
    "Lithography"
    "Livestock_Cattle"
    "Livestock_Rabbits_and_Cavies"
    "Livestock_Sheep"
    "Livestock_Swine"
    "Machine_Tools"
    "Machinerys_Reference"
    "Masterpieces_of_Eloquence"
    "Mathematics"
    "Mechanical_Drawing"
    "Medical_Anesthesia"
    "Medical_Courses_US_Army"
    "Medical_Diagnostics"
    "Medical_Emergency"
    "Medical_Hypnotism"
    "Medical_Medicine_1900-1922"
    "Medical_Microscopy"
    "Medical_Nursing"
    "Medical_Obstetrics_1900-1922"
    "Medical_Surgery_1900-1922"
    "Medical_Surgery_2"
    "Medical_X_Rays"
    "Meteorology"
    "Mimeograph"
    "Miscellaneous"
    "Monasticism"
    "Morality"
    "Mushrooms"
    "Musical_Instruments"
    "NBC"
    "Navigation"
    "Opium"
    "Optometry"
    "Painting"
    "Papermaking"
    "Photography"
    "Pottery"
    "Poultry"
    "Primers"
    "Printing"
    "Radio"
    "Radio_73_Magazine"
    "Railroads"
    "Rat_Control"
    "Refrigeration"
    "Sanitation"
    "Scientific_American_Series_1"
    "Scientific_American_Series_2"
    "Sewage"
    "Sewing"
    "Shelter"
    "Shipbuilding"
    "Shoemaking"
    "Shorthand"
    "Silk_Culture"
    "Sliderules"
    "Smithing"
    "Steam_Engines"
    "Stone_and_Masonry"
    "Surveying"
    "Survival_Individual"
    "Teaching"
    "Teaching_Arithmetic"
    "Teaching_Civics"
    "Teaching_Phonics"
    "Teaching_Readers"
    "Teaching_Readers_McGuffey"
    "Telegraph_and_Telephone"
    "Thanksgiving"
    "Tobacco"
    "Toys"
    "Trapping_and_Hunting"
    "Turpentine_Glue_Solvents"
    "Veterinary"
    "Wagons_and_Coaches"
    "Weaving"
    "Welding"
    "Wind_and_Water"
    "Wood_Carpentry"
    "Wood_Carving"
    "Wood_Furniture"
    "World_Depression"
 )

 # Function to download and process a single category.
 #   $1 - category name (spaces are mapped to underscores for URLs and paths)
 # Steps: download ${BASE_URL}/<category>.zip, unzip it flat into a per-category
 # temp directory, run main.py to convert the PDFs into
 # ${OUTPUT_BASE_DIR}/<category>, then record the category in $COMPLETED_FILE.
 # A category whose archive contains no PDFs is also recorded as completed.
 # Returns 0 when the category is completed or was already completed, and 1 on
 # any download/extract/convert failure so the category is retried on a later
 # run.
 process_category() {
    local category="$1"
    local safe_category="${category// /_}"
    local zip_url="${BASE_URL}/${safe_category}.zip"
    local zip_file="${TEMP_BASE_DIR}/${safe_category}.zip"
    local temp_dir="${TEMP_BASE_DIR}/${safe_category}-input-pdfs"
    local output_dir="${OUTPUT_BASE_DIR}/${safe_category}"
    local pdf_count

    # Skip if already completed
    if is_completed "$category"; then
        log "Skipping already completed category: $category"
        return 0
    fi

    log "Starting download for category: $category"
    log "Downloading from: $zip_url"

    # Create temporary directory for this category
    mkdir -p "$temp_dir"
    mkdir -p "$output_dir"

    # Download the zip file with retry
    if ! download_with_retry "$zip_url" "$zip_file"; then
        log "ERROR: Failed to download after $RETRY_ATTEMPTS attempts: $category"
        return 1
    fi
    log "Successfully downloaded: $category"

    # Check if zip file is valid and not empty
    if [ ! -s "$zip_file" ]; then
        log "ERROR: Downloaded file is empty or invalid for: $category"
        rm -f "$zip_file"
        return 1
    fi

    # Extract PDFs (-j drops archive directories, -o overwrites without asking)
    if ! unzip -j -o "$zip_file" -d "$temp_dir" 2>>"$LOG_FILE"; then
        log "ERROR: Failed to extract zip for: $category"
        rm -f "$zip_file"
        return 1
    fi
    log "Successfully extracted: $category"
    rm "$zip_file"

    # Check if any PDFs were extracted
    pdf_count=$(find "$temp_dir" \( -name "*.pdf" -o -name "*.PDF" \) | wc -l)
    if ((pdf_count == 0)); then
        log "WARNING: No PDF files found in: $category"
        rm -rf "$temp_dir"
        mark_completed "$category"  # Mark as completed even if no PDFs
        return 0
    fi
    log "Found $pdf_count PDF files in: $category"

    # Run the Python script to convert PDFs to text
    if ! python main.py --input_dir "$temp_dir" --output_dir "$output_dir" 2>>"$LOG_FILE"; then
        log "ERROR: Failed to convert PDFs for: $category"
        rm -rf "$temp_dir"
        return 1
    fi
    log "Successfully converted PDFs to text for: $category"

    # Clean up the temporary PDF directory
    rm -rf "$temp_dir"
    log "Cleaned up temporary files for: $category"

    # Mark as completed
    mark_completed "$category"
 }

 # Function to manage parallel downloads
 # Walks CATEGORIES in order and runs process_category as background jobs,
 # starting at most MAX_PARALLEL_DOWNLOADS in each launch batch. Categories that
 # is_completed already reports as done are counted as processed without
 # starting a job. Finished jobs are detected by polling their PIDs with
 # `kill -0` once per second; each freed slot is refilled with the next
 # unfinished category. A progress line is rewritten in place (carriage return,
 # no newline) every time a job finishes.
 run_parallel_downloads() {
    # pids: PIDs of the background jobs currently tracked.
    # category_index: index of the next CATEGORIES entry to look at.
    # processed: categories finished or skipped so far (drives the progress line).
    local pids=()
    local category_index=0
    local processed=0
    local total=${#CATEGORIES[@]}
    
    # Start initial batch of downloads - FIXED to handle all completed categories
    while [ ${#pids[@]} -lt $MAX_PARALLEL_DOWNLOADS ] && [ $category_index -lt ${#CATEGORIES[@]} ]; do
        if ! is_completed "${CATEGORIES[$category_index]}"; then
            process_category "${CATEGORIES[$category_index]}" &
            pids+=($!)
        else
            ((processed++))
        fi
        ((category_index++))
    done
    
    # If no processes were started (all were completed), we need to handle this
    if [ ${#pids[@]} -eq 0 ] && [ $category_index -ge ${#CATEGORIES[@]} ]; then
        echo -ne "\rProgress: $processed/$total categories processed (100%)\n"
        return 0
    fi
    
    # Continue processing remaining categories
    # Loop until every category has been looked at AND no job is still tracked.
    while [ $category_index -lt ${#CATEGORIES[@]} ] || [ ${#pids[@]} -gt 0 ]; do
        # Check for finished processes
        for i in "${!pids[@]}"; do
            # kill -0 sends no signal; it only reports whether the PID still exists.
            if [ -n "${pids[$i]}" ] && ! kill -0 "${pids[$i]}" 2>/dev/null; then
                # Process finished
                ((processed++))
                echo -ne "\rProgress: $processed/$total categories processed ($(( processed * 100 / total ))%)"
                unset pids[$i]
                
                # Start a new process if there are more categories
                # Skips (and counts) completed categories until one job is started
                # in the freed slot or the list runs out.
                while [ $category_index -lt ${#CATEGORIES[@]} ]; do
                    if ! is_completed "${CATEGORIES[$category_index]}"; then
                        process_category "${CATEGORIES[$category_index]}" &
                        pids[$i]=$!
                        ((category_index++))
                        break
                    else
                        ((processed++))
                        ((category_index++))
                    fi
                done
            fi
        done
        
        # Remove empty elements from array
        # Rebuilding also re-indexes the array after the `unset` calls above.
        local new_pids=()
        for pid in "${pids[@]}"; do
            [ -n "$pid" ] && new_pids+=("$pid")
        done
        pids=("${new_pids[@]}")
        
        # If we have no running processes and still have categories to check
        if [ ${#pids[@]} -eq 0 ] && [ $category_index -lt ${#CATEGORIES[@]} ]; then
            # Try to start new processes for remaining categories
            while [ ${#pids[@]} -lt $MAX_PARALLEL_DOWNLOADS ] && [ $category_index -lt ${#CATEGORIES[@]} ]; do
                if ! is_completed "${CATEGORIES[$category_index]}"; then
                    process_category "${CATEGORIES[$category_index]}" &
                    pids+=($!)
                else
                    ((processed++))
                fi
                ((category_index++))
            done
            
            # If still no processes, all remaining must be completed
            if [ ${#pids[@]} -eq 0 ]; then
                # Count remaining completed
                while [ $category_index -lt ${#CATEGORIES[@]} ]; do
                    ((processed++))
                    ((category_index++))
                done
                break
            fi
        fi
        
        # Polling interval between liveness checks.
        sleep 1
    done
    
    echo  # New line after progress indicator
 }

 # Main execution
 # Order matters here:
 #   1. verify the required tools, so the banner names the downloader that will
 #      really be used (pget may fall back to wget);
 #   2. apply --reset;
 #   3. only then count completed categories, so the logged numbers describe
 #      the state this run actually starts from.

 # Check if required tools are installed
 if [ "$USE_PGET" = true ]; then
    if ! command -v pget &> /dev/null; then
        log "WARNING: pget is not installed. Falling back to wget."
        USE_PGET=false
    fi
 fi

 if [ "$USE_PGET" = false ]; then
    if ! command -v wget &> /dev/null; then
        log "ERROR: wget is not installed. Please install wget first."
        exit 1
    fi
 fi

 if ! command -v unzip &> /dev/null; then
    log "ERROR: unzip is not installed. Please install unzip first."
    exit 1
 fi

 if ! command -v python &> /dev/null; then
    log "ERROR: python is not installed. Please install python first."
    exit 1
 fi

 # Check if main.py exists
 if [ ! -f "main.py" ]; then
    log "ERROR: main.py not found in current directory"
    exit 1
 fi

 if [ "$USE_PGET" = true ]; then
    log "=== Starting Survivor Library bulk download and processing with pget ==="
    log "Parallel connections per file: $PARALLEL_CONNECTIONS"
 else
    log "=== Starting Survivor Library bulk download and processing with wget ==="
 fi

 log "Total categories to process: ${#CATEGORIES[@]}"
 log "Max parallel downloads: $MAX_PARALLEL_DOWNLOADS"

 # Add option to reset and start fresh
 if [ "$1" == "--reset" ]; then
    log "Resetting progress and starting fresh..."
    rm -f "$COMPLETED_FILE"
    rm -rf "$OUTPUT_BASE_DIR"
    rm -rf "$TEMP_BASE_DIR"
    mkdir -p "$OUTPUT_BASE_DIR"
    mkdir -p "$TEMP_BASE_DIR"
    touch "$COMPLETED_FILE"
 fi

 # Count already completed categories (after any reset)
 completed_count=$(wc -l < "$COMPLETED_FILE")
 remaining_count=$((${#CATEGORIES[@]} - completed_count))
 log "Already completed: $completed_count categories"
 log "Remaining to process: $remaining_count categories"

 # Run the parallel download process
 run_parallel_downloads

 # Clean up any remaining temporary files
 rm -rf "$TEMP_BASE_DIR"

 log "=== Download and processing complete ==="
 log "All text files are in: $OUTPUT_BASE_DIR"

 # Print the end-of-run summary: overall totals, then one line per category
 # that produced at least one .txt file. total_files is left set for the
 # upload prompt that follows.
 printf '%s\n' \
    "" \
    "=== SUMMARY REPORT ===" \
    "Total categories: ${#CATEGORIES[@]}" \
    "Successfully processed: $(wc -l < "$COMPLETED_FILE")" \
    "Output directory: $OUTPUT_BASE_DIR" \
    "" \
    "Category file counts:"
 total_files=0
 for category in "${CATEGORIES[@]}"; do
    safe_category=${category// /_}
    # Categories without an output directory contribute nothing.
    [ -d "${OUTPUT_BASE_DIR}/${safe_category}" ] || continue
    count=$(find "${OUTPUT_BASE_DIR}/${safe_category}" -name "*.txt" 2>/dev/null | wc -l)
    if [ $count -gt 0 ]; then
        printf "  %-40s %d text files\n" "$category:" "$count"
        total_files=$((total_files + count))
    fi
 done
 printf '%s\n' \
    "" \
    "Total text files generated: $total_files" \
    "" \
    "To see the full log, check: $LOG_FILE" \
    "To reset and start fresh, run: $0 --reset"

 # Offer to publish the generated text files to the Hugging Face Hub.
 # With HF_AUTO_UPLOAD=true and HF_TOKEN set (normally via .hf_config) the
 # upload runs unattended using HF_REPO_NAME / HF_PRIVATE / HF_USERNAME;
 # otherwise the user is prompted for each setting. The token is handed to the
 # Python uploader through its environment rather than on the command line.
 if [ "$total_files" -gt 0 ]; then
    # Check if auto-upload is configured
    if [ "$HF_AUTO_UPLOAD" == "true" ] && [ -n "$HF_TOKEN" ]; then
        log "Auto-uploading to Hugging Face as configured..."
        SHOULD_UPLOAD=true
    else
        echo
        read -p "Would you like to upload this dataset to Hugging Face? (y/n) " -n 1 -r
        echo
        if [[ $REPLY =~ ^[Yy]$ ]]; then
            SHOULD_UPLOAD=true
        else
            SHOULD_UPLOAD=false
        fi
    fi

    if [ "$SHOULD_UPLOAD" == "true" ]; then
        # Check if upload script exists (this script does not create it)
        if [ ! -f "upload_to_huggingface.py" ]; then
            log "ERROR: upload_to_huggingface.py not found in current directory"
            echo "Please ensure upload_to_huggingface.py is in the current directory"
            echo "You can download it from the artifacts provided"
            exit 1
        fi

        # Check for required Python packages. `python -m pip` installs into the
        # same interpreter that will run the uploader.
        if ! python -c "import huggingface_hub" 2>/dev/null; then
            echo "Installing required Python packages..."
            python -m pip install huggingface-hub
        fi

        # Configure upload parameters
        UPLOAD_USERNAME=""
        if [ "$HF_AUTO_UPLOAD" == "true" ] && [ -n "$HF_TOKEN" ]; then
            # Use configuration file settings
            HF_TOKEN_INPUT=$HF_TOKEN
            REPO_NAME=${HF_REPO_NAME:-survivor-library-text}
            PRIVATE_FLAG=""
            if [ "$HF_PRIVATE" == "true" ]; then
                PRIVATE_FLAG="--private"
            fi
            UPLOAD_USERNAME=$HF_USERNAME
        else
            # Interactive configuration
            echo
            echo "=== Hugging Face Upload Configuration ==="

            # Check for existing token
            if [ -n "$HF_TOKEN" ]; then
                echo "Using HF_TOKEN from environment"
                HF_TOKEN_INPUT=$HF_TOKEN
            else
                echo "Please enter your Hugging Face token"
                echo "(Get one from: https://huggingface.co/settings/tokens)"
                read -s -p "Token: " HF_TOKEN_INPUT
                echo
            fi

            read -p "Repository name (default: survivor-library-text): " REPO_NAME
            REPO_NAME=${REPO_NAME:-survivor-library-text}

            read -p "Make dataset private? (y/n): " -n 1 -r
            echo
            PRIVATE_FLAG=""
            if [[ $REPLY =~ ^[Yy]$ ]]; then
                PRIVATE_FLAG="--private"
            fi
        fi

        # Build the argument list as an array so values containing spaces or
        # glob characters reach the uploader as single arguments.
        UPLOAD_ARGS=(--text-dir "$OUTPUT_BASE_DIR" --repo-name "$REPO_NAME")
        if [ -n "$UPLOAD_USERNAME" ]; then
            UPLOAD_ARGS+=(--username "$UPLOAD_USERNAME")
        fi
        if [ -n "$PRIVATE_FLAG" ]; then
            UPLOAD_ARGS+=("$PRIVATE_FLAG")
        fi

        # Run the upload
        log "Starting upload to Hugging Face..."
        log "Repository: $REPO_NAME"
        if [ -n "$PRIVATE_FLAG" ]; then
            log "Privacy: Private"
        else
            log "Privacy: Public"
        fi

        if HF_TOKEN="$HF_TOKEN_INPUT" python upload_to_huggingface.py "${UPLOAD_ARGS[@]}"; then
            log "Successfully uploaded to Hugging Face!"
            echo
            # Dataset URLs include the owner namespace, which is only known here
            # when a username was configured.
            if [ -n "$UPLOAD_USERNAME" ]; then
                echo "🎉 Dataset available at: https://huggingface.co/datasets/$UPLOAD_USERNAME/$REPO_NAME"
            else
                echo "🎉 Dataset uploaded as '$REPO_NAME'; see the repository URL printed by the upload script above"
            fi
        else
            log "Failed to upload to Hugging Face"
        fi
    fi
 fi

 # Show configuration file tip if not using auto-upload
 if [ "$HF_AUTO_UPLOAD" != "true" ] && [ "$total_files" -gt 0 ]; then
    echo
    echo "💡 Tip: To enable automatic uploads, create a .hf_config file:"
    echo "   cp hf_config_template.sh .hf_config"
    echo "   # Edit .hf_config with your settings"
 fi
	#!/usr/bin/env python3
	"""
	Upload Survivor Library text files to Hugging Face Hub
	Based on https://gist.github.com/pszemraj/502cf50767e8c363947f5e93cb95cc07
	"""

	import os
	import json
	import argparse
	from pathlib import Path
	from typing import Dict, List, Optional
	from datetime import datetime
	import hashlib

	try:
	from huggingface_hub import HfApi, create_repo, upload_folder
	from huggingface_hub.utils import RepositoryNotFoundError
	except ImportError:
	print("Please install huggingface_hub: pip install huggingface-hub")
	exit(1)


	def generate_dataset_card(stats: Dict[str, int], total_files: int, total_size: int) -> str:
	    """Generate a README.md dataset card for Hugging Face.

	    The card is assembled line by line so that every line starts at column 0:
	    the YAML front matter must open the document with a bare ``---`` and must
	    not contain tab indentation, and the category table needs unescaped ``|``
	    column separators to render as a table.

	    Args:
	        stats: Mapping of category name to number of text files.
	        total_files: Total number of text files in the dataset.
	        total_size: Total size of the text files in bytes (shown in GB).

	    Returns:
	        The dataset card as Markdown text ending with a newline. It contains
	        the literal placeholder ``YOUR_USERNAME`` for the caller to replace.
	    """
	    # Largest categories first.
	    table_rows = [
	        f"| {cat} | {count:,} |"
	        for cat, count in sorted(stats.items(), key=lambda item: item[1], reverse=True)
	    ]

	    lines = [
	        "---",
	        "language:",
	        "- en",
	        "license: cc0-1.0",
	        "task_categories:",
	        "- text-generation",
	        "- text2text-generation",
	        "tags:",
	        "- survival",
	        "- historical",
	        "- knowledge",
	        "- library",
	        "- public-domain",
	        "pretty_name: Survivor Library Text Corpus",
	        "size_categories:",
	        "- 10K<n<100K",
	        "---",
	        "",
	        "# Survivor Library Text Corpus",
	        "",
	        "This dataset contains OCR-extracted text from the [Survivor Library](https://www.survivorlibrary.com/),",
	        "a collection of public domain books focused on practical knowledge and skills from the pre-industrial",
	        "and early industrial era.",
	        "",
	        "## Dataset Description",
	        "",
	        "The Survivor Library is a collection of books that would be useful in rebuilding civilization after",
	        "a catastrophic event. It focuses on practical, hands-on knowledge from the 1800s and early 1900s.",
	        "",
	        "### Dataset Summary",
	        "",
	        f"- Total Files: {total_files:,}",
	        f"- Total Size: {total_size / (1024**3):.2f} GB",
	        f"- Categories: {len(stats)}",
	        "- Source: [survivorlibrary.com](https://www.survivorlibrary.com/)",
	        f"- Processing Date: {datetime.now().strftime('%Y-%m-%d')}",
	        "",
	        "### Categories and File Counts",
	        "",
	        "| Category | Files |",
	        "|----------|-------|",
	        *table_rows,
	        "",
	        "## Dataset Structure",
	        "",
	        "The dataset is organized by category, with each category containing text files extracted from PDFs:",
	        "",
	        "```",
	        "text_outputs/",
	        "├── Accounting/",
	        "│ ├── book1.txt",
	        "│ ├── book2.txt",
	        "│ └── ...",
	        "├── Agriculture/",
	        "│ └── ...",
	        "└── ...",
	        "```",
	        "",
	        "## Data Processing",
	        "",
	        "1. Original PDFs downloaded from survivorlibrary.com",
	        "2. Text extracted using OCR/PDF text extraction",
	        "3. Organized by category",
	        "4. Uploaded to Hugging Face Hub",
	        "",
	        "## Usage",
	        "",
	        "```python",
	        "from datasets import load_dataset",
	        "",
	        "# Load the entire dataset",
	        'dataset = load_dataset("YOUR_USERNAME/survivor-library-text")',
	        "",
	        "# Load specific categories",
	        "accounting_texts = dataset.filter(lambda x: x['category'] == 'Accounting')",
	        "```",
	        "",
	        "## Considerations",
	        "",
	        "- These texts are historical and may contain outdated or potentially dangerous information",
	        "- Always verify information with modern sources before practical application",
	        "- Some OCR errors may be present in the extracted text",
	        "- Original formatting may not be perfectly preserved",
	        "",
	        "## License",
	        "",
	        "The original books are in the public domain. This dataset compilation is released under CC0 1.0 Universal.",
	        "",
	        "## Citation",
	        "",
	        "If you use this dataset, please cite:",
	        "",
	        "```bibtex",
	        "@misc{survivor_library_text,",
	        "title={Survivor Library Text Corpus},",
	        "author={Survivor Library},",
	        "year={2024},",
	        "publisher={Hugging Face},",
	        "url={https://huggingface.co/datasets/YOUR_USERNAME/survivor-library-text}",
	        "}",
	        "```",
	    ]
	    return "\n".join(lines) + "\n"


	def create_metadata_file(text_dir: Path) -> Dict:
	    """Create the metadata dictionary describing all text files under *text_dir*.

	    Each immediate sub-directory of *text_dir* is treated as a category and
	    every regular ``*.txt`` file directly inside it is recorded. Despite the
	    name, nothing is written to disk here; the caller serialises the result.

	    Args:
	        text_dir: Root directory that holds one sub-directory per category.

	    Returns:
	        A dictionary with the keys ``dataset_name``, ``version``, ``created``,
	        ``categories`` (per-category file count and file names), ``files``
	        (one entry per file with name, category, relative path, size and MD5),
	        ``total_files`` and ``total_size_bytes``.
	    """
	    metadata = {
	        "dataset_name": "Survivor Library Text Corpus",
	        "version": "1.0.0",
	        "created": datetime.now().isoformat(),
	        "categories": {},
	        "files": [],
	    }

	    total_size = 0

	    for category_dir in sorted(text_dir.iterdir()):
	        if not category_dir.is_dir():
	            continue

	        category_name = category_dir.name
	        category_files = []

	        for text_file in sorted(category_dir.glob("*.txt")):
	            # A directory whose name ends in .txt would make open() fail.
	            if not text_file.is_file():
	                continue

	            file_size = text_file.stat().st_size
	            total_size += file_size

	            # Calculate file hash for integrity (MD5 is used as a checksum
	            # only). Hash in chunks so a large file is never held in memory
	            # all at once.
	            digest = hashlib.md5()
	            with open(text_file, "rb") as f:
	                for chunk in iter(lambda: f.read(1024 * 1024), b""):
	                    digest.update(chunk)

	            metadata["files"].append(
	                {
	                    "filename": text_file.name,
	                    "category": category_name,
	                    "path": str(text_file.relative_to(text_dir)),
	                    "size_bytes": file_size,
	                    "md5": digest.hexdigest(),
	                }
	            )
	            category_files.append(text_file.name)

	        metadata["categories"][category_name] = {
	            "file_count": len(category_files),
	            "files": category_files,
	        }

	    metadata["total_files"] = len(metadata["files"])
	    metadata["total_size_bytes"] = total_size

	    return metadata


	def upload_to_huggingface(
	    text_dir: str,
	    repo_name: str,
	    username: Optional[str] = None,
	    private: bool = False,
	    token: Optional[str] = None,
	):
	    """Upload text files to Hugging Face Hub.

	    Creates the dataset repository ``<username>/<repo_name>`` when it does not
	    exist yet, writes a temporary ``metadata.json`` and ``README.md`` into
	    *text_dir*, uploads the whole folder, and removes the two generated files
	    again whether or not the upload succeeded.

	    Args:
	        text_dir: Directory containing one sub-directory of text files per category.
	        repo_name: Repository name without the owner namespace.
	        username: Owner namespace; looked up via ``whoami`` when omitted.
	        private: Whether a newly created repository is private.
	        token: Hugging Face API token.

	    Raises:
	        ValueError: If *text_dir* does not exist.
	        Exception: Any error raised by the upload is re-raised after being printed.
	    """
	    text_path = Path(text_dir)
	    if not text_path.exists():
	        raise ValueError(f"Directory {text_dir} does not exist")

	    # Initialize HF API
	    api = HfApi(token=token)

	    # Get username if not provided
	    if username is None:
	        user_info = api.whoami()
	        username = user_info["name"]

	    repo_id = f"{username}/{repo_name}"

	    print(f"Uploading to: {repo_id}")

	    # Create repository if it doesn't exist
	    try:
	        api.repo_info(repo_id=repo_id, repo_type="dataset")
	        print(f"Repository {repo_id} already exists")
	    except RepositoryNotFoundError:
	        print(f"Creating repository: {repo_id}")
	        create_repo(
	            repo_id=repo_id,
	            repo_type="dataset",
	            private=private,
	            token=token,
	        )

	    # Generate metadata
	    print("Generating metadata...")
	    metadata = create_metadata_file(text_path)

	    metadata_path = text_path / "metadata.json"
	    readme_path = text_path / "README.md"

	    # Everything that creates the temporary files lives inside the try so the
	    # finally clause removes them even when a later step fails.
	    try:
	        # Save metadata. UTF-8 is set explicitly: file names and the dataset
	        # card contain non-ASCII characters, and the platform default
	        # encoding may not be able to represent them.
	        with open(metadata_path, "w", encoding="utf-8") as f:
	            json.dump(metadata, f, indent=2)

	        # Calculate statistics
	        stats = {cat: info["file_count"] for cat, info in metadata["categories"].items()}
	        total_files = metadata["total_files"]
	        total_size = metadata["total_size_bytes"]

	        # Generate and save dataset card
	        print("Generating dataset card...")
	        dataset_card = generate_dataset_card(stats, total_files, total_size)
	        dataset_card = dataset_card.replace("YOUR_USERNAME", username)

	        with open(readme_path, "w", encoding="utf-8") as f:
	            f.write(dataset_card)

	        # Upload everything
	        print(f"Uploading {total_files} files ({total_size / (1024**3):.2f} GB)...")

	        try:
	            upload_folder(
	                folder_path=str(text_path),
	                repo_id=repo_id,
	                repo_type="dataset",
	                token=token,
	                commit_message=f"Upload Survivor Library text corpus - {total_files} files",
	                # Patterns are glob-style and matched against relative file
	                # paths, so a wildcard is needed to match by extension or
	                # inside sub-directories.
	                ignore_patterns=["*.pdf", "*.zip", "*__pycache__*", "*.DS_Store"],
	            )

	            print(f"\n✅ Successfully uploaded to: https://huggingface.co/datasets/{repo_id}")

	        except Exception as e:
	            print(f"\n❌ Error uploading: {str(e)}")
	            raise

	    finally:
	        # Clean up generated files
	        if metadata_path.exists():
	            metadata_path.unlink()
	        if readme_path.exists():
	            readme_path.unlink()


	def main():
	    """Parse command-line arguments and run the upload.

	    The token is taken from ``--token`` or, failing that, from the
	    ``HF_TOKEN`` environment variable. Exits with status 1 when neither
	    provides one.
	    """
	    parser = argparse.ArgumentParser(description="Upload Survivor Library texts to Hugging Face")
	    parser.add_argument("--text-dir", type=str, default="text_outputs",
	                        help="Directory containing the text files")
	    parser.add_argument("--repo-name", type=str, default="survivor-library-text",
	                        help="Name of the Hugging Face dataset repository")
	    parser.add_argument("--username", type=str, default=None,
	                        help="Hugging Face username (defaults to logged-in user)")
	    parser.add_argument("--private", action="store_true",
	                        help="Make the dataset private")
	    parser.add_argument("--token", type=str, default=None,
	                        help="Hugging Face API token (or set HF_TOKEN env var)")

	    args = parser.parse_args()

	    # Get token from environment if not provided
	    token = args.token or os.environ.get("HF_TOKEN")

	    if not token:
	        print("Please provide a Hugging Face token via --token or HF_TOKEN environment variable")
	        print("You can get a token from: https://huggingface.co/settings/tokens")
	        # SystemExit is raised directly; the exit() helper is injected by the
	        # site module and is not guaranteed to exist in every interpreter setup.
	        raise SystemExit(1)

	    upload_to_huggingface(
	        text_dir=args.text_dir,
	        repo_name=args.repo_name,
	        username=args.username,
	        private=args.private,
	        token=token,
	    )


	if __name__ == "__main__":
	    main()
	#!/usr/bin/env python
	# -- coding: utf-8 --
	"""
	Standalone Asynchronous Nanonets-OCR-s Inference Script using vLLM and PyMuPDF.

	This script processes PDF files from an input directory using the
	nanonets/Nanonets-OCR-s model served locally by vLLM via its OpenAI-compatible API.
	It renders each page, sends API requests concurrently for OCR, extracts the
	structured markdown/HTML text, and saves the combined text for each PDF into a
	corresponding .txt file in the specified output directory.

	This version uses asyncio and the AsyncOpenAI client to significantly speed up
	processing by sending multiple page OCR requests to the vLLM server concurrently.

	IMPORTANT: Requires a separate vLLM server running with the Nanonets-OCR-s model.
	Start the server BEFORE running this script, for example:

	vllm serve nanonets/Nanonets-OCR-s --max-num-seqs 256 --gpu-memory-utilization 0.9

	Dependencies (vLLM - see vLLM docs for specific CUDA versions):
	pip install ninja vllm flash-attn

	Dependencies (Script):
	pip install "openai>=1.0" PyMuPDF Pillow fire tqdm pypdf "tqdm[asyncio]" joblib

	Example Usage:
	# 1. Start the vLLM server in a separate terminal:
	# vllm serve nanonets/Nanonets-OCR-s

	# 2. Run this script:
	python nanonets_pipeline.py \
	--input_dir ./my_pdfs \
	--output_dir ./output_text \
	--model_id nanonets/Nanonets-OCR-s \
	--max_pages 100 \
	--overwrite \
	--api_base_url http://localhost:8000/v1 \
	--concurrency_limit 16
	"""

	import asyncio
	import base64
	import io
	import logging
	import os
	import re
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple

	import fire

	# REMOVED: mdformat is no longer needed as Nanonets produces structured output.
	# import mdformat
	from joblib import Parallel, delayed
	from PIL import Image
	from pypdf import PdfReader
	from pypdf.errors import PdfReadError
	from tqdm import tqdm
	from tqdm.asyncio import tqdm_asyncio

	try:
	from openai import APIConnectionError, APIStatusError, AsyncOpenAI, RateLimitError
	except ImportError:
	print("=" * 80)
	print("ERROR: openai library >= 1.0 not found.")
	print("Please install it: pip install 'openai>=1.0'")
	print("=" * 80)
	exit(1)

	try:
	import fitz # PyMuPDF
	except ImportError:
	print("=" * 80)
	print("ERROR: PyMuPDF library not found.")
	print("Please install it: pip install PyMuPDF")
	print("=" * 80)
	exit(1)


	# --- Configuration ---
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s - %(levelname)s - [%(funcName)s] %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S",
	)
	logger = logging.getLogger(__name__)

	# Reduce noise from underlying libraries
	logging.getLogger("httpx").setLevel(logging.WARNING)
	logging.getLogger("openai").setLevel(logging.WARNING)
	logging.getLogger("httpcore").setLevel(logging.WARNING)

	# --- CHANGED: Updated model ID, prompt, and default parameters for Nanonets-OCR-s ---
	DEFAULT_MODEL_ID: str = "nanonets/Nanonets-OCR-s"
	NANONETS_PROMPT: str = (
	"Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."
	)
	# Default cap for the longest side of a rendered page; used as the default
	# `target_longest_image_dim` of render_pdf_page_to_pil_fitz.
	DEFAULT_TARGET_IMAGE_DIM: int = 1024
	# Default OpenAI-compatible endpoint; matches the `--api_base_url` value shown
	# in the module docstring's usage example.
	DEFAULT_API_BASE_URL: str = "http://localhost:8000/v1"
	# NOTE(review): presumably a placeholder key for a local server that does not
	# check credentials - confirm where the client is constructed.
	DEFAULT_API_KEY: str = "EMPTY"
	# NOTE(review): presumably the default size of the request semaphore (see the
	# `--concurrency_limit` flag in the module docstring) - confirm at the call site.
	DEFAULT_CONCURRENCY_LIMIT: int = 16
	# Defaults for ocr_page_api's `max_tokens`, `temperature` and
	# `frequency_penalty` parameters.
	DEFAULT_MAX_TOKENS_PER_PAGE: int = 10000
	DEFAULT_TEMPERATURE: float = 0.0
	DEFAULT_FREQ_PENALTY: float = 0.1


	def render_pdf_page_to_pil_fitz(
	    pdf_path: Path,
	    page_num: int,
	    target_longest_image_dim: int = DEFAULT_TARGET_IMAGE_DIM,
	) -> Optional[Image.Image]:
	    """
	    Renders a single page of a PDF to a PIL Image using PyMuPDF (fitz).

	    The page is rendered at its native size unless its longest side exceeds
	    target_longest_image_dim, in which case it is scaled down so the longest
	    side matches that value. Pages are never upscaled.

	    Args:
	        pdf_path: Path to the PDF file.
	        page_num: The 1-based page number to render.
	        target_longest_image_dim: Target size for the longest dimension.

	    Returns:
	        A PIL Image object of the rendered page, or None if rendering fails.
	        All failures are logged; no exception is propagated to the caller.
	    """
	    doc: Optional[fitz.Document] = None
	    try:
	        doc = fitz.open(pdf_path)
	        if not 0 < page_num <= doc.page_count:
	            logger.error(
	                f"Invalid page number {page_num} for {pdf_path.name} "
	                f"({doc.page_count} pages)."
	            )
	            return None

	        page: fitz.Page = doc.load_page(page_num - 1)  # fitz uses 0-based index
	        page_rect: fitz.Rect = page.rect
	        width, height = page_rect.width, page_rect.height

	        if max(width, height) <= 0:
	            logger.error(
	                f"Invalid page dimensions ({width}x{height}) for "
	                f"{pdf_path.name} page {page_num}."
	            )
	            return None

	        # Downscale only: a zoom of 1.0 keeps the page's native size.
	        zoom_factor: float = 1.0
	        if max(width, height) > target_longest_image_dim:
	            zoom_factor = target_longest_image_dim / max(width, height)

	        matrix: fitz.Matrix = fitz.Matrix(zoom_factor, zoom_factor)
	        pix: fitz.Pixmap = page.get_pixmap(matrix=matrix, alpha=False)

	        if pix.width == 0 or pix.height == 0:
	            logger.error(
	                f"Rendered pixmap has zero dimension for {pdf_path.name} "
	                f"page {page_num}."
	            )
	            return None

	        img: Image.Image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	        return img

	    except Exception as e:
	        # The missing-file case is recognised by exception class name instead
	        # of `except fitz.fitz.FileNotFoundError`: that attribute path is
	        # evaluated only while an exception is being handled, and if the
	        # installed PyMuPDF has no `fitz.fitz` submodule it raises
	        # AttributeError and hides the real error. The name check also covers
	        # the builtin FileNotFoundError.
	        if type(e).__name__ == "FileNotFoundError":
	            logger.error(f"PyMuPDF could not find file: {pdf_path}")
	        else:
	            logger.error(
	                f"PyMuPDF error rendering {pdf_path.name} page {page_num}: "
	                f"{type(e).__name__} - {e}"
	            )
	        return None
	    finally:
	        # Compare against None explicitly; a truthiness test could skip the
	        # close if the document object evaluates as false (e.g. zero pages).
	        if doc is not None:
	            try:
	                doc.close()
	            except Exception as e:
	                logger.warning(f"Error closing PDF {pdf_path.name}: {e}")


	def get_pdf_page_count(pdf_path: Path) -> Optional[int]:
	    """
	    Counts the pages of a PDF with pypdf, falling back to PyMuPDF (fitz).

	    fitz is consulted in two situations: pypdf opens the file but reports
	    zero pages, or pypdf raises PdfReadError.

	    Args:
	        pdf_path: Path to the PDF file.

	    Returns:
	        The page count, 0 when pypdf reports zero pages and fitz cannot open
	        the file, or None when the count cannot be determined at all.
	    """
	    try:
	        page_total = len(PdfReader(pdf_path, strict=False).pages)
	        if page_total != 0:
	            return page_total

	        # pypdf saw no pages; ask fitz for a second opinion.
	        try:
	            with fitz.open(pdf_path) as doc:
	                page_total = doc.page_count
	        except Exception:
	            logger.warning(
	                f"pypdf reported 0 pages, fitz failed to open "
	                f"{pdf_path.name}. Assuming 0 pages."
	            )
	            return 0
	        return page_total
	    except PdfReadError as e:
	        logger.error(f"pypdf failed to read {pdf_path.name}: {e}. Trying fitz.")
	        try:
	            with fitz.open(pdf_path) as doc:
	                return doc.page_count
	        except Exception as fitz_e:
	            logger.error(
	                f"Both pypdf and fitz failed page count for {pdf_path.name}: {fitz_e}"
	            )
	            return None
	    except FileNotFoundError:
	        logger.error(f"File not found for page count: {pdf_path}")
	        return None
	    except Exception as e:
	        logger.error(f"Unexpected error getting page count for {pdf_path.name}: {e}")
	        return None


	def encode_pil_to_base64(image: Image.Image, format: str = "PNG") -> str:
	"""
	Encodes a PIL image object to a base64 string.

	Args:
	image: The PIL Image object.
	format: The image format to use (e.g., "PNG", "JPEG").

	Returns:
	The base64 encoded string representation of the image.
	"""
	buffered = io.BytesIO()
	image.save(buffered, format=format)
	img_byte = buffered.getvalue()
	img_base64 = base64.b64encode(img_byte)
	return img_base64.decode("utf-8")


	async def ocr_page_api(
	client: AsyncOpenAI,
	model_id: str,
	img_base64: str,
	page_num: int,
	pdf_name: str,
	semaphore: asyncio.Semaphore,
	temperature: float = DEFAULT_TEMPERATURE,
	max_tokens: int = DEFAULT_MAX_TOKENS_PER_PAGE,
	frequency_penalty: float = DEFAULT_FREQ_PENALTY,
	) -> str:
	"""
	Sends a single page image to the vLLM OpenAI API for OCR asynchronously.

	Uses an asyncio.Semaphore to limit the number of concurrent requests.

	Args:
	client: The initialized AsyncOpenAI client.
	model_id: The model identifier for the API call.
	img_base64: The base64 encoded string of the page image.
	page_num: The 1-based page number (for logging).
	pdf_name: The name of the PDF file (for logging).
	semaphore: The asyncio.Semaphore to control concurrency.
	temperature: Sampling temperature for the model.
	max_tokens: Maximum tokens to generate for the page.

	Returns:
	The extracted text content as a string, or an error marker string
	(e.g., "[API_CONNECTION_ERROR]") if an API error occurs.
	"""
	async with semaphore: # Acquire semaphore before making the API call
	try:
	response = await client.chat.completions.create(
	model=model_id,
	messages=[
	{
	"role": "user",
	"content": [
	{
	"type": "image_url",
	"image_url": {
	"url": f"data:image/png;base64,{img_base64}"
	},
	},
	# --- CHANGED: Use the detailed Nanonets prompt ---
	{"type": "text", "text": NANONETS_PROMPT},
	],
	}
	],
	temperature=temperature,
	max_tokens=max_tokens,
	frequency_penalty=frequency_penalty,
	)
	content = response.choices[0].message.content
	return content.strip() if content else "[API_EMPTY_RESPONSE]"
	except APIConnectionError as e:
	logger.error(
	f"API Connect Error page {page_num} ({pdf_name}): {e}. "
	f"Is server at {client.base_url} running?"
	)
	return "[API_CONNECTION_ERROR]"
	except RateLimitError as e:
	logger.warning(
	f"API Rate Limit Error page {page_num} ({pdf_name}): {e}. "
	f"Server busy or concurrency too high? Retrying may be needed."
	)
	await asyncio.sleep(2)
	return "[API_RATE_LIMIT_ERROR]"
	except APIStatusError as e:
	logger.error(
	f"API Status Error page {page_num} ({pdf_name}): "
	f"Status={e.status_code}, Response={e.response}"
	)
	return f"[API_STATUS_ERROR_{e.status_code}]"
	except Exception as e:
	logger.exception(f"Unexpected API Error page {page_num} ({pdf_name}): {e}")
	return "[API_UNEXPECTED_ERROR]"


	def render_and_encode_single_page(
	pdf_file: Path, page_num: int, target_image_dim: int, pdf_name: str
	) -> tuple:
	"""
	Renders and encodes a single PDF page in one function for parallel processing.

	Args:
	pdf_file: Path to the PDF file
	page_num: Page number to render (1-based)
	target_image_dim: Target size for longest dimension
	pdf_name: Name of PDF file (for logging)

	Returns:
	tuple: (page_num, base64_string or error_message)
	"""
	pil_image = render_pdf_page_to_pil_fitz(pdf_file, page_num, target_image_dim)
	if not pil_image:
	logger.warning(f"Failed to render page {page_num} ({pdf_name})")
	return page_num, "[PAGE_RENDER_ERROR]"

	try:
	img_base64 = encode_pil_to_base64(pil_image)
	return page_num, img_base64
	except Exception as e:
	logger.error(f"Failed to encode page {page_num} ({pdf_name}): {e}")
	return page_num, "[IMAGE_ENCODE_ERROR]"


	# --- Main Processing Logic ---


	async def process_directory(
	input_dir: str,
	output_dir: Optional[str] = None,
	model_id: str = DEFAULT_MODEL_ID,
	api_base_url: str = DEFAULT_API_BASE_URL,
	api_key: str = DEFAULT_API_KEY,
	target_image_dim: int = DEFAULT_TARGET_IMAGE_DIM,
	max_pages: Optional[int] = None,
	# --- CHANGED: Updated default temperature and max tokens for Nanonets ---
	temperature: float = DEFAULT_TEMPERATURE,
	max_tokens_per_page: int = DEFAULT_MAX_TOKENS_PER_PAGE,
	overwrite: bool = False,
	concurrency_limit: int = DEFAULT_CONCURRENCY_LIMIT,
	) -> None:
	"""
	Processes PDF files asynchronously using Nanonets-OCR-s via vLLM's OpenAI API.

	Renders pages, encodes them, sends concurrent API requests for OCR,
	combines results, and saves text files.

	Args:
	input_dir: Path to the directory containing input PDF files.
	output_dir: Path to the directory for output .txt files. If None,
	creates a directory next to input_dir.
	model_id: Model ID for the vLLM server API.
	api_base_url: Base URL of the vLLM OpenAI-compatible API endpoint.
	api_key: API key for the endpoint (usually 'EMPTY' for local vLLM).
	target_image_dim: Target size for the longest dimension of page images.
	max_pages: Max pages to process per PDF (None for all pages).
	temperature: Sampling temperature for the model (0.0 recommended for Nanonets).
	max_tokens_per_page: Max tokens the model can generate per page.
	overwrite: If True, overwrite existing output .txt files.
	concurrency_limit: Maximum number of concurrent API requests.
	"""
	input_path = Path(input_dir).resolve()
	assert (
	input_path.is_dir()
	), f"Input directory not found or is not a directory: {input_path}"

	output_path = (
	Path(output_dir).resolve()
	if output_dir is not None
	else input_path.parent / f"output-pdftotext-{input_path.name}"
	)
	output_path.mkdir(parents=True, exist_ok=True)

	logger.info(f"Input directory: {input_path}")
	logger.info(f"Output directory: {output_path}")
	logger.info(f"Model API: {model_id} at {api_base_url}")
	logger.info(f"Concurrency: {concurrency_limit}")
	logger.info(f"Target Image Dim: {target_image_dim}")
	logger.info(f"Overwrite: {overwrite}")

	client: Optional[AsyncOpenAI] = None
	try:
	client = AsyncOpenAI(api_key=api_key, base_url=api_base_url)
	logger.info(f"AsyncOpenAI client initialized for {api_base_url}")

	pdf_files: List[Path] = sorted(list(input_path.glob("*.pdf")))
	if not pdf_files:
	logger.warning(f"No PDF files found in {input_path}")
	return
	logger.info(f"Found {len(pdf_files)} PDF files.")

	semaphore = asyncio.Semaphore(concurrency_limit)

	for pdf_file in tqdm(
	pdf_files, desc="Processing PDFs", unit="pdf", mininterval=1.0
	):
	output_txt_path = output_path / (pdf_file.stem + ".md")

	if not overwrite and output_txt_path.exists():
	logger.info(f"Skipping {pdf_file.name}, output exists.")
	continue

	logger.info(f"Starting processing for {pdf_file.name}")

	page_count = get_pdf_page_count(pdf_file)
	if page_count is None:
	logger.warning(f"Skipping {pdf_file.name}, failed to get page count.")
	output_txt_path.write_text("[ERROR_READING_PDF]", encoding="utf-8")
	continue
	if page_count == 0:
	logger.warning(f"Skipping {pdf_file.name}, contains 0 pages.")
	output_txt_path.write_text("", encoding="utf-8") # Empty file
	continue

	num_pages_to_process = page_count
	if max_pages is not None and 0 < max_pages < page_count:
	logger.info(f"Limiting to first {max_pages} pages of {pdf_file.name}")
	num_pages_to_process = max_pages

	# --- Preprocessing: Render and Encode Pages ---
	page_render_encode_data: Dict[int, str] = {} # page_num -> base64 or error
	logger.debug(
	f"Rendering/encoding {num_pages_to_process} pages for {pdf_file.name} in parallel"
	)
	n_jobs = min(64, os.cpu_count() or 1)
	logger.info(f"Using {n_jobs} cores for parallel page rendering")
	parallel_results = Parallel(
	n_jobs=n_jobs, verbose=0
	)( # Set verbose=0 to avoid clutter
	delayed(render_and_encode_single_page)(
	pdf_file, page_num, target_image_dim, pdf_file.name
	)
	for page_num in range(1, num_pages_to_process + 1)
	)
	page_render_encode_data = {}
	valid_pages_for_api = 0
	for page_num, result in parallel_results:
	page_render_encode_data[page_num] = result
	if not result.startswith("["):
	valid_pages_for_api += 1

	if valid_pages_for_api == 0:
	logger.warning(
	f"No pages successfully rendered/encoded for {pdf_file.name}. "
	"Skipping API calls."
	)
	all_page_texts = {
	pn: data for pn, data in page_render_encode_data.items()
	}
	else:
	# --- Asynchronous API Calls ---
	tasks: List[Tuple[int, asyncio.Task[str]]] = []
	logger.info(
	f"Submitting {valid_pages_for_api} pages to API for {pdf_file.name}"
	)
	for page_num in range(1, num_pages_to_process + 1):
	img_data = page_render_encode_data.get(page_num)
	if img_data and not img_data.startswith("["):
	task = asyncio.create_task(
	ocr_page_api(
	client=client,
	model_id=model_id,
	img_base64=img_data,
	page_num=page_num,
	pdf_name=pdf_file.name,
	semaphore=semaphore,
	temperature=temperature,
	max_tokens=max_tokens_per_page,
	),
	name=f"OCR_{pdf_file.stem}_p{page_num}",
	)
	tasks.append((page_num, task))

	api_results: List[str] = await tqdm_asyncio.gather(
	*(task for _, task in tasks),
	desc=f" OCR Pages ({pdf_file.name[:20]})",
	unit="page",
	leave=False,
	mininterval=5.0, # Update every 5 seconds max
	)

	# --- Combine Results ---
	all_page_texts: Dict[int, str] = {}
	for pn, data in page_render_encode_data.items():
	if data.startswith("["):
	all_page_texts[pn] = data
	for i, (page_num, _) in enumerate(tasks):
	all_page_texts[page_num] = api_results[i]

	if not all_page_texts:
	logger.warning(f"No text results generated for {pdf_file.name}.")
	output_txt_path.write_text("", encoding="utf-8")
	continue

	ERROR_PATTERN = re.compile(r"^\s\[[A-Z0-9_]+\]\s$")
	ordered_texts: List[str] = [
	all_page_texts.get(pn, f"[PAGE_{pn}_MISSING_UNEXPECTEDLY]")
	for pn in range(1, num_pages_to_process + 1)
	]

	# The filtering of error messages is still useful.
	filtered_texts: List[str] = [
	text
	for text in ordered_texts
	if text.strip() and not ERROR_PATTERN.match(text.strip())
	]

	if not filtered_texts:
	logger.warning(f"All pages were filtered out for {pdf_file.name}.")
	output_txt_path.write_text("", encoding="utf-8")
	continue

	# Use form feed character (\f) as page separator. This is a good way
	# to delimit pages in the final text file.
	final_text: str = "\n\f\n".join(ordered_texts)

	try:
	output_txt_path.write_text(final_text, encoding="utf-8")
	logger.info(f"Successfully wrote output: {output_txt_path.name}")
	except Exception as e:
	logger.error(f"Failed to write output file {output_txt_path}: {e}")

	except Exception as e:
	logger.exception(f"An unexpected error occurred during processing: {e}")
	finally:
	if client:
	await client.close()
	logger.info("AsyncOpenAI client closed.")
	logger.info("Processing run finished.")


	def main(**kwargs: Any) -> None:
	"""
	Command-line entry point wrapper to run the async processing function.

	Uses fire library to handle command-line arguments. Any argument accepted
	by `process_directory` can be passed via the command line, e.g.,
	`--input_dir ./pdfs --max_pages 5`.

	Args:
	**kwargs: Arguments passed from the command line via fire.
	"""
	try:
	asyncio.run(process_directory(**kwargs))
	except KeyboardInterrupt:
	logger.info("Processing interrupted by user.")


	if __name__ == "__main__":
	fire.Fire(main)
	#!/bin/bash

	# Survivor Library Bulk Download and Process Script
	# This script downloads all zip files from survivorlibrary.com, extracts PDFs, and converts them to text

	# Configuration
	BASE_URL="https://www.survivorlibrary.com/library"
	OUTPUT_BASE_DIR="text_outputs"
	TEMP_BASE_DIR="temp_pdfs"
	LOG_FILE="survivor_library_download.log"
	COMPLETED_FILE="survivor_library_completed.txt"
	MAX_PARALLEL_DOWNLOADS=2
	PARALLEL_CONNECTIONS=2 # Number of connections per file for pget
	RETRY_ATTEMPTS=3
	RETRY_DELAY=5
	USE_PGET=true # Set to false to use wget instead

	# Load Hugging Face configuration if exists
	if [ -f ".hf_config" ]; then
	source .hf_config
	log "Loaded Hugging Face configuration from .hf_config"
	fi


	# Create necessary directories
	mkdir -p "$OUTPUT_BASE_DIR"
	mkdir -p "$TEMP_BASE_DIR"

	# Initialize completed file if it doesn't exist
	touch "$COMPLETED_FILE"

	# Log function
	log() {
	echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" \| tee -a "$LOG_FILE"
	}

	# Function to check if category is already completed
	is_completed() {
	local category="$1"
	grep -q "^${category}$" "$COMPLETED_FILE"
	}

	# Function to mark category as completed
	mark_completed() {
	local category="$1"
	echo "$category" >> "$COMPLETED_FILE"
	}

	# Function to download with retry using either pget or wget
	download_with_retry() {
	local url="$1"
	local output="$2"
	local attempts=0

	while [ $attempts -lt $RETRY_ATTEMPTS ]; do
	if [ "$USE_PGET" = true ]; then
	# Use pget with proper syntax
	if pget -p "$PARALLEL_CONNECTIONS" -o "$output" "$url" 2>>"$LOG_FILE"; then
	return 0
	fi
	else
	# Use wget as fallback
	if wget -q --show-progress -O "$output" "$url" 2>>"$LOG_FILE"; then
	return 0
	fi
	fi

	((attempts++))
	if [ $attempts -lt $RETRY_ATTEMPTS ]; then
	log "Download failed, retrying in ${RETRY_DELAY} seconds... (Attempt $attempts/$RETRY_ATTEMPTS)"
	sleep $RETRY_DELAY
	fi
	done

	return 1
	}

	# List of all categories from the Survivor Library
	CATEGORIES=(
	"Accounting"
	"Aeroplanes"
	"Airships"
	"Archery"
	"Architecture"
	"Astronomy"
	"Baking"
	"Banking"
	"Basketry"
	"Bee_Journal_American"
	"Bee_Journal_British"
	"Beekeeping"
	"Berries"
	"Boilermaker"
	"Bookbinding"
	"Books_for_Boys_and_Girls"
	"Books_for_Young_Children"
	"Botany"
	"Boy_Scout_Manuals"
	"Brewing_and_Distilling"
	"Bridges_and_Dams"
	"Butchering"
	"Canning"
	"Cheese_and_Butter"
	"Chemistry"
	"Christmas"
	"Clockmaking"
	"Coal_and_Mining"
	"Coffee_and_Tea"
	"Concrete"
	"Conduct_of_Life"
	"Construction"
	"Cooking_and_Cookbooks"
	"Cotton"
	"Cycles_Bi_Tri_Motor"
	"Dentistry"
	"Dogs"
	"Drilling"
	"Economics"
	"Embalming"
	"Encyclopedias"
	"Engineering_Drainage"
	"Engineering_Electrical"
	"Engineering_General"
	"Engineering_Hydraulics"
	"Engraving_and_Woodcuts"
	"Ethics"
	"Farming"
	"Farming_Corn"
	"Farming_Fish"
	"Farming_Potato_and_Sweet_Potato"
	"Firearms_Books"
	"Firearms_Manuals"
	"Fishing"
	"Food"
	"Forestry"
	"Forging_and_Casting"
	"Formulas"
	"Fuels"
	"Geodesy"
	"Geography"
	"Glassmaking"
	"Grapes_Wine_Raisins"
	"Great_Books"
	"Gunpowder_and_Explosives"
	"Hatmaking"
	"Heating"
	"Heavy_Industrial_Machinery"
	"Hemp_and_Flax"
	"Herbalism"
	"History_American"
	"Home_Economics"
	"Horses"
	"Journalism"
	"Knitting_Lace_Needlepoint"
	"Laundry"
	"Law"
	"Leather"
	"Leisure_Games_and_Sports"
	"Leisure_Recreation_Magazine"
	"Leisure_Whist"
	"Lithography"
	"Livestock_Cattle"
	"Livestock_Rabbits_and_Cavies"
	"Livestock_Sheep"
	"Livestock_Swine"
	"Machine_Tools"
	"Machinerys_Reference"
	"Masterpieces_of_Eloquence"
	"Mathematics"
	"Mechanical_Drawing"
	"Medical_Anesthesia"
	"Medical_Courses_US_Army"
	"Medical_Diagnostics"
	"Medical_Emergency"
	"Medical_Hypnotism"
	"Medical_Medicine_1900-1922"
	"Medical_Microscopy"
	"Medical_Nursing"
	"Medical_Obstetrics_1900-1922"
	"Medical_Surgery_1900-1922"
	"Medical_Surgery_2"
	"Medical_X_Rays"
	"Meteorology"
	"Mimeograph"
	"Miscellaneous"
	"Monasticism"
	"Morality"
	"Mushrooms"
	"Musical_Instruments"
	"NBC"
	"Navigation"
	"Opium"
	"Optometry"
	"Painting"
	"Papermaking"
	"Photography"
	"Pottery"
	"Poultry"
	"Primers"
	"Printing"
	"Radio"
	"Radio_73_Magazine"
	"Railroads"
	"Rat_Control"
	"Refrigeration"
	"Sanitation"
	"Scientific_American_Series_1"
	"Scientific_American_Series_2"
	"Sewage"
	"Sewing"
	"Shelter"
	"Shipbuilding"
	"Shoemaking"
	"Shorthand"
	"Silk_Culture"
	"Sliderules"
	"Smithing"
	"Steam_Engines"
	"Stone_and_Masonry"
	"Surveying"
	"Survival_Individual"
	"Teaching"
	"Teaching_Arithmetic"
	"Teaching_Civics"
	"Teaching_Phonics"
	"Teaching_Readers"
	"Teaching_Readers_McGuffey"
	"Telegraph_and_Telephone"
	"Thanksgiving"
	"Tobacco"
	"Toys"
	"Trapping_and_Hunting"
	"Turpentine_Glue_Solvents"
	"Veterinary"
	"Wagons_and_Coaches"
	"Weaving"
	"Welding"
	"Wind_and_Water"
	"Wood_Carpentry"
	"Wood_Carving"
	"Wood_Furniture"
	"World_Depression"
	)

	# Function to download and process a single category
	process_category() {
	local category="$1"
	local safe_category=$(echo "$category" \| tr ' ' '_')
	local zip_url="${BASE_URL}/${safe_category}.zip"
	local zip_file="${TEMP_BASE_DIR}/${safe_category}.zip"
	local temp_dir="${TEMP_BASE_DIR}/${safe_category}-input-pdfs"
	local output_dir="${OUTPUT_BASE_DIR}/${safe_category}"

	# Skip if already completed
	if is_completed "$category"; then
	log "Skipping already completed category: $category"
	return 0
	fi

	log "Starting download for category: $category"
	log "Downloading from: $zip_url"

	# Create temporary directory for this category
	mkdir -p "$temp_dir"
	mkdir -p "$output_dir"

	# Download the zip file with retry
	if download_with_retry "$zip_url" "$zip_file"; then
	log "Successfully downloaded: $category"

	# Check if zip file is valid and not empty
	if [ -s "$zip_file" ]; then
	# Extract PDFs
	if unzip -j -o "$zip_file" -d "$temp_dir" 2>>"$LOG_FILE"; then
	log "Successfully extracted: $category"
	rm "$zip_file"

	# Check if any PDFs were extracted
	pdf_count=$(find "$temp_dir" -name ".pdf" -o -name ".PDF" \| wc -l)
	if [ $pdf_count -gt 0 ]; then
	log "Found $pdf_count PDF files in: $category"

	# Run the Python script to convert PDFs to text
	if python main.py --input_dir "$temp_dir" --output_dir "$output_dir" 2>>"$LOG_FILE"; then
	log "Successfully converted PDFs to text for: $category"

	# Clean up the temporary PDF directory
	rm -rf "$temp_dir"
	log "Cleaned up temporary files for: $category"

	# Mark as completed
	mark_completed "$category"
	else
	log "ERROR: Failed to convert PDFs for: $category"
	rm -rf "$temp_dir"
	fi
	else
	log "WARNING: No PDF files found in: $category"
	rm -rf "$temp_dir"
	mark_completed "$category" # Mark as completed even if no PDFs
	fi
	else
	log "ERROR: Failed to extract zip for: $category"
	rm -f "$zip_file"
	fi
	else
	log "ERROR: Downloaded file is empty or invalid for: $category"
	rm -f "$zip_file"
	fi
	else
	log "ERROR: Failed to download after $RETRY_ATTEMPTS attempts: $category"
	fi
	}

	# Function to manage parallel downloads
	run_parallel_downloads() {
	local pids=()
	local category_index=0
	local processed=0
	local total=${#CATEGORIES[@]}

	# Start initial batch of downloads - FIXED to handle all completed categories
	while [ ${#pids[@]} -lt $MAX_PARALLEL_DOWNLOADS ] && [ $category_index -lt ${#CATEGORIES[@]} ]; do
	if ! is_completed "${CATEGORIES[$category_index]}"; then
	process_category "${CATEGORIES[$category_index]}" &
	pids+=($!)
	else
	((processed++))
	fi
	((category_index++))
	done

	# If no processes were started (all were completed), we need to handle this
	if [ ${#pids[@]} -eq 0 ] && [ $category_index -ge ${#CATEGORIES[@]} ]; then
	echo -ne "\rProgress: $processed/$total categories processed (100%)\n"
	return 0
	fi

	# Continue processing remaining categories
	while [ $category_index -lt ${#CATEGORIES[@]} ] \|\| [ ${#pids[@]} -gt 0 ]; do
	# Check for finished processes
	for i in "${!pids[@]}"; do
	if [ -n "${pids[$i]}" ] && ! kill -0 "${pids[$i]}" 2>/dev/null; then
	# Process finished
	((processed++))
	echo -ne "\rProgress: $processed/$total categories processed ($(( processed * 100 / total ))%)"
	unset pids[$i]

	# Start a new process if there are more categories
	while [ $category_index -lt ${#CATEGORIES[@]} ]; do
	if ! is_completed "${CATEGORIES[$category_index]}"; then
	process_category "${CATEGORIES[$category_index]}" &
	pids[$i]=$!
	((category_index++))
	break
	else
	((processed++))
	((category_index++))
	fi
	done
	fi
	done

	# Remove empty elements from array
	local new_pids=()
	for pid in "${pids[@]}"; do
	[ -n "$pid" ] && new_pids+=("$pid")
	done
	pids=("${new_pids[@]}")

	# If we have no running processes and still have categories to check
	if [ ${#pids[@]} -eq 0 ] && [ $category_index -lt ${#CATEGORIES[@]} ]; then
	# Try to start new processes for remaining categories
	while [ ${#pids[@]} -lt $MAX_PARALLEL_DOWNLOADS ] && [ $category_index -lt ${#CATEGORIES[@]} ]; do
	if ! is_completed "${CATEGORIES[$category_index]}"; then
	process_category "${CATEGORIES[$category_index]}" &
	pids+=($!)
	else
	((processed++))
	fi
	((category_index++))
	done

	# If still no processes, all remaining must be completed
	if [ ${#pids[@]} -eq 0 ]; then
	# Count remaining completed
	while [ $category_index -lt ${#CATEGORIES[@]} ]; do
	((processed++))
	((category_index++))
	done
	break
	fi
	fi

	sleep 1
	done

	echo # New line after progress indicator
	}

	# Main execution
	if [ "$USE_PGET" = true ]; then
	log "=== Starting Survivor Library bulk download and processing with pget ==="
	log "Parallel connections per file: $PARALLEL_CONNECTIONS"
	else
	log "=== Starting Survivor Library bulk download and processing with wget ==="
	fi

	log "Total categories to process: ${#CATEGORIES[@]}"
	log "Max parallel downloads: $MAX_PARALLEL_DOWNLOADS"

	# Count already completed categories
	completed_count=$(wc -l < "$COMPLETED_FILE")
	remaining_count=$((${#CATEGORIES[@]} - completed_count))
	log "Already completed: $completed_count categories"
	log "Remaining to process: $remaining_count categories"

	# Check if required tools are installed
	if [ "$USE_PGET" = true ]; then
	if ! command -v pget &> /dev/null; then
	log "WARNING: pget is not installed. Falling back to wget."
	USE_PGET=false
	fi
	fi

	if [ "$USE_PGET" = false ]; then
	if ! command -v wget &> /dev/null; then
	log "ERROR: wget is not installed. Please install wget first."
	exit 1
	fi
	fi

	if ! command -v unzip &> /dev/null; then
	log "ERROR: unzip is not installed. Please install unzip first."
	exit 1
	fi

	if ! command -v python &> /dev/null; then
	log "ERROR: python is not installed. Please install python first."
	exit 1
	fi

	# Check if main.py exists
	if [ ! -f "main.py" ]; then
	log "ERROR: main.py not found in current directory"
	exit 1
	fi

	# Add option to reset and start fresh
	if [ "$1" == "--reset" ]; then
	log "Resetting progress and starting fresh..."
	rm -f "$COMPLETED_FILE"
	rm -rf "$OUTPUT_BASE_DIR"
	rm -rf "$TEMP_BASE_DIR"
	mkdir -p "$OUTPUT_BASE_DIR"
	mkdir -p "$TEMP_BASE_DIR"
	touch "$COMPLETED_FILE"
	fi

	# Run the parallel download process
	run_parallel_downloads

	# Clean up any remaining temporary files
	rm -rf "$TEMP_BASE_DIR"

	log "=== Download and processing complete ==="
	log "All text files are in: $OUTPUT_BASE_DIR"

	# Generate summary report
	echo
	echo "=== SUMMARY REPORT ==="
	echo "Total categories: ${#CATEGORIES[@]}"
	echo "Successfully processed: $(wc -l < "$COMPLETED_FILE")"
	echo "Output directory: $OUTPUT_BASE_DIR"
	echo
	echo "Category file counts:"
	total_files=0
	for category in "${CATEGORIES[@]}"; do
	safe_category=$(echo "$category" \| tr ' ' '_')
	if [ -d "${OUTPUT_BASE_DIR}/${safe_category}" ]; then
	count=$(find "${OUTPUT_BASE_DIR}/${safe_category}" -name "*.txt" 2>/dev/null \| wc -l)
	if [ $count -gt 0 ]; then
	printf " %-40s %d text files\n" "$category:" "$count"
	((total_files += count))
	fi
	fi
	done
	echo
	echo "Total text files generated: $total_files"
	echo
	echo "To see the full log, check: $LOG_FILE"
	echo "To reset and start fresh, run: $0 --reset"

	# Ask about Hugging Face upload
	if [ $total_files -gt 0 ]; then
	# Check if auto-upload is configured
	if [ "$HF_AUTO_UPLOAD" == "true" ] && [ -n "$HF_TOKEN" ]; then
	log "Auto-uploading to Hugging Face as configured..."
	SHOULD_UPLOAD=true
	else
	echo
	read -p "Would you like to upload this dataset to Hugging Face? (y/n) " -n 1 -r
	echo
	if [[ $REPLY =~ ^[Yy]$ ]]; then
	SHOULD_UPLOAD=true
	else
	SHOULD_UPLOAD=false
	fi
	fi

	if [ "$SHOULD_UPLOAD" == "true" ]; then
	# Check if upload script exists
	if [ ! -f "upload_to_huggingface.py" ]; then
	log "Creating Hugging Face upload script..."
	echo "Please ensure upload_to_huggingface.py is in the current directory"
	echo "You can download it from the artifacts provided"
	exit 1
	fi

	# Check for required Python packages
	if ! python -c "import huggingface_hub" 2>/dev/null; then
	echo "Installing required Python packages..."
	pip install huggingface-hub
	fi

	# Configure upload parameters
	if [ "$HF_AUTO_UPLOAD" == "true" ] && [ -n "$HF_TOKEN" ]; then
	# Use configuration file settings
	HF_TOKEN_INPUT=$HF_TOKEN
	REPO_NAME=${HF_REPO_NAME:-survivor-library-text}
	PRIVATE_FLAG=""
	if [ "$HF_PRIVATE" == "true" ]; then
	PRIVATE_FLAG="--private"
	fi
	USERNAME_FLAG=""
	if [ -n "$HF_USERNAME" ]; then
	USERNAME_FLAG="--username $HF_USERNAME"
	fi
	else
	# Interactive configuration
	echo
	echo "=== Hugging Face Upload Configuration ==="

	# Check for existing token
	if [ -n "$HF_TOKEN" ]; then
	echo "Using HF_TOKEN from environment"
	HF_TOKEN_INPUT=$HF_TOKEN
	else
	echo "Please enter your Hugging Face token"
	echo "(Get one from: https://huggingface.co/settings/tokens)"
	read -s -p "Token: " HF_TOKEN_INPUT
	echo
	fi

	read -p "Repository name (default: survivor-library-text): " REPO_NAME
	REPO_NAME=${REPO_NAME:-survivor-library-text}

	read -p "Make dataset private? (y/n): " -n 1 -r
	echo
	PRIVATE_FLAG=""
	if [[ $REPLY =~ ^[Yy]$ ]]; then
	PRIVATE_FLAG="--private"
	fi
	USERNAME_FLAG=""
	fi

	# Run the upload
	log "Starting upload to Hugging Face..."
	log "Repository: $REPO_NAME"
	if [ -n "$PRIVATE_FLAG" ]; then
	log "Privacy: Private"
	else
	log "Privacy: Public"
	fi

	if HF_TOKEN=$HF_TOKEN_INPUT python upload_to_huggingface.py \
	--text-dir "$OUTPUT_BASE_DIR" \
	--repo-name "$REPO_NAME" \
	$USERNAME_FLAG \
	$PRIVATE_FLAG; then
	log "Successfully uploaded to Hugging Face!"
	echo
	echo "🎉 Dataset available at: https://huggingface.co/datasets/$REPO_NAME"
	else
	log "Failed to upload to Hugging Face"
	fi
	fi
	fi

	# Show configuration file tip if not using auto-upload
	if [ "$HF_AUTO_UPLOAD" != "true" ] && [ $total_files -gt 0 ]; then
	echo
	echo "💡 Tip: To enable automatic uploads, create a .hf_config file:"
	echo " cp hf_config_template.sh .hf_config"
	echo " # Edit .hf_config with your settings"
	fi