Skip to content

Instantly share code, notes, and snippets.

@Zejnilovic
Created January 19, 2026 12:35
Show Gist options
  • Select an option

  • Save Zejnilovic/75ff2a1056f08b361772c06ab7c632c6 to your computer and use it in GitHub Desktop.

Select an option

Save Zejnilovic/75ff2a1056f08b361772c06ab7c632c6 to your computer and use it in GitHub Desktop.
Generic PDF text splitter - organizes extracted PDFs into section files using configurable regex patterns
#!/usr/bin/env python3
r"""
Generic PDF Text Splitter

Splits extracted PDF text into multiple files based on chapter/section patterns.

The docstring is raw so the regex examples below can be copied verbatim
without Python treating ``\d`` / ``\s`` as (invalid) string escapes.

Usage:
    python3 pdf_splitter.py --input input.txt --output ./sections --pattern "^(\d+)\.\s+(.+)$"
    python3 pdf_splitter.py --input input.txt --output ./sections --pattern "^Chapter\s+(\d+):\s+(.+)$"
    python3 pdf_splitter.py --input input.txt --output ./sections --pattern "^#+\s+(.+)$" # Markdown headers
"""
import re
import os
import argparse
import sys
from pathlib import Path
from typing import List, Tuple, Optional
class PdfSplitter:
    """Split extracted PDF text into per-section files using a header regex.

    Every input line is stripped and matched against the header pattern. A
    matching line starts a new section; the non-matching lines that follow it
    become that section's body. Lines before the first header are discarded,
    and a header with no body lines produces no file.
    """

    def __init__(self,
                 input_file: str,
                 output_dir: str,
                 pattern: str,
                 capture_group: int = 0,
                 encoding: str = 'utf-8',
                 verbose: bool = False):
        """
        Initialize splitter

        An invalid ``pattern`` does not raise here; the error is remembered
        and reported by ``validate_inputs()`` (and therefore by ``split()``).

        Args:
            input_file: Path to extracted PDF text file
            output_dir: Directory to write section files
            pattern: Regex pattern to match section headers
            capture_group: Which regex group contains the heading
                (0 = full match, 1+ = specific group)
            encoding: File encoding (default: utf-8)
            verbose: Print detailed output
        """
        self.input_file = input_file
        self.output_dir = output_dir
        self.capture_group = capture_group
        self.encoding = encoding
        self.verbose = verbose
        self.sections_created = 0
        # Emit the "capture group not found" warning once, not once per header.
        self._warned_bad_group = False
        # Compile eagerly but defer error reporting to validate_inputs(), so
        # the validation step is actually reachable for a bad pattern.
        self._pattern_error: Optional[re.error] = None
        try:
            self.pattern = re.compile(pattern, re.MULTILINE)
        except re.error as exc:
            self.pattern = None
            self._pattern_error = exc

    def validate_inputs(self) -> bool:
        """Check that the input file exists and the header pattern compiled.

        Returns:
            True when both checks pass; otherwise prints an error and
            returns False.
        """
        if not os.path.isfile(self.input_file):
            print(f"Error: Input file not found: {self.input_file}")
            return False
        if self._pattern_error is not None:
            print(f"Error: Invalid regex pattern: {self._pattern_error}")
            return False
        return True

    def sanitize_filename(self, text: str, max_length: int = 60) -> str:
        """Convert text to a safe filename component.

        Args:
            text: Raw heading text.
            max_length: Maximum number of characters to keep.

        Returns:
            The sanitized name, or "section" if nothing usable remains.
        """
        # Replace path separators, reserved punctuation and control characters.
        safe = re.sub(r'[/:*?"<>|\\\x00-\x1f]', '_', text)
        # Remove leading/trailing spaces and dots
        safe = safe.strip('. ')
        if len(safe) > max_length:
            # Truncation can expose a new trailing space/dot; strip it again.
            safe = safe[:max_length].rstrip('. ')
        return safe or "section"

    def read_lines(self) -> List[str]:
        """Read the input file as a list of lines (line endings preserved).

        Decodes with ``self.encoding``; on a decode failure, prints a warning
        and re-reads the file as latin-1.
        """
        try:
            with open(self.input_file, 'r', encoding=self.encoding) as f:
                return f.readlines()
        except UnicodeDecodeError:
            # Report the encoding that actually failed, not a fixed "UTF-8".
            print(f"Warning: {self.encoding} decode failed, trying latin-1")
            with open(self.input_file, 'r', encoding='latin-1') as f:
                return f.readlines()

    def extract_heading(self, line: str) -> Optional[str]:
        """Return the heading captured from ``line``, or None if no match.

        The line is stripped before matching. If the configured capture group
        does not exist in the pattern, a warning is printed once and the full
        match is used instead.
        """
        match = self.pattern.match(line.strip())
        if not match:
            return None
        try:
            # group(0) is the full match, so no special case is needed for 0.
            return match.group(self.capture_group)
        except IndexError:
            if not self._warned_bad_group:
                print(f"Warning: Capture group {self.capture_group} not found in pattern")
                self._warned_bad_group = True
            return match.group(0)

    def write_section(self, heading: str, content: List[str]) -> bool:
        """Write one section to ``NNN_<sanitized heading>.txt`` in output_dir.

        The file holds the heading, an 80-character ``=`` rule, a blank line
        and then the content lines.

        Args:
            heading: Section heading; also used to build the filename.
            content: Body lines, written as-is.

        Returns:
            True if the file was written. False if heading or content is
            empty, or if writing failed (an error is printed in that case).
        """
        if not heading or not content:
            return False
        # Commit the counter only after a successful write so the final
        # tally reflects files that really exist.
        index = self.sections_created + 1
        filename = f"{index:03d}_{self.sanitize_filename(heading)}.txt"
        filepath = os.path.join(self.output_dir, filename)
        try:
            with open(filepath, 'w', encoding=self.encoding) as f:
                f.write(heading + "\n")
                f.write("=" * 80 + "\n\n")
                f.writelines(content)
        except (OSError, UnicodeError) as e:
            # UnicodeError: text read via the latin-1 fallback may not be
            # encodable in self.encoding.
            print(f"Error writing file {filepath}: {e}")
            return False
        self.sections_created = index
        if self.verbose:
            print(f" Created: {filename}")
        return True

    def split(self) -> bool:
        """
        Split PDF text into sections

        Returns:
            True if validation passed and every section was written,
            False otherwise
        """
        if not self.validate_inputs():
            return False
        os.makedirs(self.output_dir, exist_ok=True)
        if self.verbose:
            print(f"Reading: {self.input_file}")
            print(f"Output: {self.output_dir}")
            print(f"Pattern: {self.pattern.pattern}")
            print(f"Capture group: {self.capture_group}")
            print()
        lines = self.read_lines()
        if self.verbose:
            print(f"Total lines: {len(lines)}")

        current_section: List[str] = []
        current_heading: Optional[str] = None
        failures = 0
        for line in lines:
            heading = self.extract_heading(line)
            if heading:
                # A new header closes the section collected so far.
                if current_section and current_heading:
                    if not self.write_section(current_heading, current_section):
                        failures += 1
                current_heading = heading
                current_section = []
            elif current_heading is not None:
                # Lines before the first header are intentionally dropped.
                current_section.append(line)
        # Flush the last section, which has no following header to close it.
        if current_section and current_heading:
            if not self.write_section(current_heading, current_section):
                failures += 1

        print(f"\n Successfully created {self.sections_created} sections")
        print(f"Output directory: {self.output_dir}")
        if failures:
            print(f"Warning: {failures} section(s) could not be written")
        return failures == 0
def main():
    """Command-line interface.

    Parses the command-line arguments, builds a PdfSplitter from them, runs
    the split and exits with status 0 on success or 1 on failure. An invalid
    --pattern is reported as an error message rather than a traceback.
    """
    parser = argparse.ArgumentParser(
        description='Split extracted PDF text into multiple files by section',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Raw string: the regex examples contain backslashes that must reach
        # the help text unchanged (a bare "\d" in a normal string is an
        # invalid escape sequence).
        epilog=r"""
Examples:
  # Numbered sections (1. Title, 2. Title)
  %(prog)s --input pdf.txt --output ./sections --pattern "^(\d+)\.\s+(.+)$" --group 2
  # Chapter style (Chapter 1: Title)
  %(prog)s --input pdf.txt --output ./sections --pattern "^Chapter\s+(\d+):\s+(.+)$" --group 2
  # Markdown headers (# Header, ## Subheader)
  %(prog)s --input pdf.txt --output ./sections --pattern "^(#+)\s+(.+)$" --group 2
  # Simple headers (HEADER NAME)
  %(prog)s --input pdf.txt --output ./sections --pattern "^([A-Z][A-Z\s]+)$"
  # Full match (use entire match as heading)
  %(prog)s --input pdf.txt --output ./sections --pattern "^(\d+)\.\s+(.+)$" --group 0
"""
    )
    parser.add_argument('-i', '--input', required=True,
                        help='Input PDF text file (extracted from PDF)')
    parser.add_argument('-o', '--output', required=True,
                        help='Output directory for section files')
    parser.add_argument('-p', '--pattern', required=True,
                        help='Regex pattern to match section headers')
    parser.add_argument('-g', '--group', type=int, default=0,
                        help='Regex capture group to use as heading (default: 0 = full match)')
    parser.add_argument('-e', '--encoding', default='utf-8',
                        help='File encoding (default: utf-8)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Verbose output')
    args = parser.parse_args()

    # Create splitter and run
    try:
        splitter = PdfSplitter(
            input_file=args.input,
            output_dir=args.output,
            pattern=args.pattern,
            capture_group=args.group,
            encoding=args.encoding,
            verbose=args.verbose
        )
    except re.error as exc:
        # The constructor may compile the pattern; report a bad regex cleanly.
        print(f"Error: Invalid regex pattern: {exc}")
        sys.exit(1)
    success = splitter.split()
    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()

PDF Text Splitting for further processing

To process a PDF document into organized section text files, first extract the PDF to plain text with pdftotext, which converts the PDF into a single text file containing all of the content with formatting stripped away. Next, analyze the extracted text to identify the pattern used for section headers (e.g., "1. Title", "Chapter 1: Title", "# Header"). Then run the pdf_splitter_generic.py script with the matching regex pattern and capture group. For example:

PDF_FILE="my.pdf"
EXTRACTED_TEXT="extracted.txt"
OUTPUT_DIR="sections"

pdftotext "$PDF_FILE" "$EXTRACTED_TEXT"

python3 pdf_splitter_generic.py \
  --input "$EXTRACTED_TEXT" \
  --output "$OUTPUT_DIR" \
  --pattern "^(\d+)\.\s+(.+)$" \
  --group 2 \
  --verbose

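This reads through the text file line by line, strips surrounding whitespace from each line, identifies matching section headers using your regex pattern, extracts the chosen capture group as the heading, and writes each section to a numbered text file in the output directory. The result is a well-organized set of topic-specific files that are easy to process further. Note that any text before the first matching header is discarded, and a header that is immediately followed by another header (no body lines) produces no file.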

pdf_splitter_generic.py Usage

Basic Usage

python3 pdf_splitter_generic.py \
  --input /path/to/extracted.txt \
  --output ./output_sections \
  --pattern "^(\d+)\.\s+(.+)$" \
  --group 2

Command-line Arguments

Argument Short Required Description
--input -i ✅ Yes Path to extracted PDF text file
--output -o ✅ Yes Output directory for section files
--pattern -p ✅ Yes Regex pattern to match headers
--group -g ❌ No Regex capture group to use (default: 0)
--encoding -e ❌ No File encoding (default: utf-8)
--verbose -v ❌ No Verbose output

Common Patterns

Pattern 1: Numbered Sections (1., 2., 3.)

Example: "1. Introduction"

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^(\d+)\.\s+(.+)$" \
  --group 2

Regex breakdown:

  • ^ = Start of line
  • (\d+) = Group 1: One or more digits
  • \. = Literal period
  • \s+ = One or more whitespace characters
  • (.+)$ = Group 2: Rest of line (heading)

Pattern 2: Chapter Style

Example: "Chapter 1: Introduction"

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^Chapter\s+(\d+):\s+(.+)$" \
  --group 2

Pattern 3: Markdown Headers

Example: "# Main Header", "## Subheader", "### Sub-subheader"

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^(#+)\s+(.+)$" \
  --group 2

Pattern 4: UPPERCASE Headers

Example: "INTRODUCTION", "METHODOLOGY"

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^([A-Z][A-Z\s]+)$" \
  --group 1

Pattern 5: Title Case Headers

Example: "Introduction and Background"

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^([A-Z][A-Za-z\s]+)$" \
  --group 1

Pattern 6: Headers with Numbers

Example: "1.1 Introduction", "1.2 Background"

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^(\d+\.\d+)\s+(.+)$" \
  --group 2

Understanding Regex Capture Groups

--group 0 (Default)

Uses the entire matched text as the heading.

--pattern "^(\d+)\.\s+(.+)$" --group 0
# Heading: "1. Introduction" (full match)

--group 1

Uses the first capture group (first (...) in pattern).

--pattern "^(\d+)\.\s+(.+)$" --group 1
# Heading: "1" (just the number)

--group 2

Uses the second capture group (second (...) in pattern).

--pattern "^(\d+)\.\s+(.+)$" --group 2
# Heading: "Introduction" (just the title)

Examples for Different Document Types

Scientific Paper

python3 pdf_splitter_generic.py \
  --input paper.txt \
  --output ./sections \
  --pattern "^(ABSTRACT|INTRODUCTION|METHODOLOGY|RESULTS|DISCUSSION|CONCLUSION|REFERENCES)$" \
  --group 1 \
  --verbose

Textbook

python3 pdf_splitter_generic.py \
  --input textbook.txt \
  --output ./chapters \
  --pattern "^Chapter\s+(\d+):\s+(.+)$" \
  --group 0 \
  --verbose

Technical Documentation

python3 pdf_splitter_generic.py \
  --input docs.txt \
  --output ./docs \
  --pattern "^(#{1,3})\s+(.+)$" \
  --group 2 \
  --verbose

Report with Sections

python3 pdf_splitter_generic.py \
  --input report.txt \
  --output ./sections \
  --pattern "^(\d+\.\d+\s+[A-Z][A-Za-z\s]+)$" \
  --group 1 \
  --verbose

Advanced Usage

Case-Insensitive Pattern

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "(?i)^(chapter\s+\d+:.+)$" \
  --group 1

Looser Matching (Optional Numbers)

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^(\d*\.?\s*[A-Z][A-Za-z\s]+)$" \
  --group 1

Match Multiple Heading Levels

python3 pdf_splitter_generic.py \
  --input pdf.txt \
  --output ./sections \
  --pattern "^(#{1,3}\s+.+|^[A-Z]+\s*$)" \
  --group 0

Testing Your Pattern

Quick Test (Python)

import re

# Your pattern
pattern = r"^(\d+)\.\s+(.+)$"

# Test line
line = "1. Introduction"

match = re.match(pattern, line)
if match:
    print(f"Match! Groups: {match.groups()}")
    print(f"Group 0 (full): {match.group(0)}")
    print(f"Group 1: {match.group(1)}")
    print(f"Group 2: {match.group(2)}")
else:
    print("No match")

Performance

Benchmarks (45K+ lines)

Operation Time
Read file ~100ms
Process lines ~200ms
Write output ~150ms
Total ~450ms

For larger PDFs (100K+ lines), expect linear scaling.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment