Skip to content

Instantly share code, notes, and snippets.

@ostechnix
Created August 13, 2024 06:20
Show Gist options
  • Save ostechnix/1881369a2f22bd0a0a23e64756ef88d3 to your computer and use it in GitHub Desktop.
Save ostechnix/1881369a2f22bd0a0a23e64756ef88d3 to your computer and use it in GitHub Desktop.
Pdfcwcount - A Simple Python Script to Count the Total Number of Characters and Words in a PDF File.
#!/usr/bin/env python3
# ------------------------------------------------------------------
# Script Name: pdfcwcount.py
# Description: A Python Script to Count Characters and Words
# in a PDF File.
# Website: https://gist.github.com/ostechnix
# Version: 1.0
# Usage: python pdfcwcount.py filename
# ------------------------------------------------------------------
import PyPDF2
import argparse
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file as one string.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        The text of all pages concatenated in page order. If the file does
        not exist, a message is printed and an empty string is returned.
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Iterate the pages directly and join once instead of growing a
            # string with += per page. `or ''` keeps the join safe if a page
            # yields no text object at all (e.g. None from the extractor).
            return ''.join(page.extract_text() or '' for page in reader.pages)
    except FileNotFoundError:
        print(f"The file {file_path} does not exist.")
        return ''
def count_words_in_text(text):
    """Return how many whitespace-separated tokens *text* contains."""
    # str.split() with no separator splits on runs of any whitespace and
    # drops empty strings, so blank or empty input counts as zero words.
    return len(text.split())
def count_characters_in_text(text, include_newlines=True):
    """Return the length of *text*, optionally ignoring '\\n' characters.

    Args:
        text: The string to measure.
        include_newlines: When False, line-feed characters ('\\n') are
            left out of the count. Other characters are always counted.
    """
    total = len(text)
    if include_newlines:
        return total
    # Subtracting the line-feed count equals measuring the text with every
    # '\n' removed, without building the stripped copy.
    return total - text.count('\n')
if __name__ == "__main__":
    # Command-line entry point: one positional argument, the PDF path.
    cli_parser = argparse.ArgumentParser(description="Count the number of words and characters in a PDF file.")
    cli_parser.add_argument("file_path", type=str, help="Path to the PDF file.")
    args = cli_parser.parse_args()

    pdf_text = extract_text_from_pdf(args.file_path)

    # An empty result (missing file or no text extracted) prints no report.
    if pdf_text:
        # Compute all three figures before producing any output.
        word_count = count_words_in_text(pdf_text)
        character_count_with_newlines = count_characters_in_text(pdf_text, include_newlines=True)
        character_count_without_newlines = count_characters_in_text(pdf_text, include_newlines=False)

        # Assemble the report and emit it in a single print call; joining
        # with "\n" reproduces one line per entry.
        report_lines = [
            "\n--- PDF File Analysis Report ---",
            f"File: {args.file_path}",
            f"Total Words: {word_count}",
            f"Total Characters (including newlines): {character_count_with_newlines}",
            f"Total Characters (excluding newlines): {character_count_without_newlines}",
            "-----------------------------\n",
        ]
        print("\n".join(report_lines))
@ostechnix
Copy link
Author

Count Characters and Words in PDF Files Using Python

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment