Skip to content

Instantly share code, notes, and snippets.

@sdg-1
Created January 18, 2025 23:22
Show Gist options
  • Save sdg-1/330af609884e4d9216afa9ad2c1ea827 to your computer and use it in GitHub Desktop.
"""Download an arXiv PDF, extract its text, and print word-frequency stats."""
import requests
from pdfminer.high_level import extract_text
from collections import Counter
import re

# Step 1: Download and save the PDF.
# NOTE(review): the URL is the DDPG paper (arXiv 1509.02971), but the local
# filename says "nasa_flight_plan" — confirm which document was intended.
pdf_url = "https://arxiv.org/pdf/1509.02971.pdf"
response = requests.get(pdf_url, timeout=30)  # bounded wait instead of hanging forever
response.raise_for_status()  # fail loudly rather than saving an HTTP error page as a "PDF"
pdf_filename = "nasa_flight_plan.pdf"
with open(pdf_filename, "wb") as pdf_file:
    pdf_file.write(response.content)

# Step 2: Extract all text from the downloaded PDF.
extracted_text = extract_text(pdf_filename)

# Step 3: Split into rough "sections" on blank lines. pdfminer emits double
# newlines between layout blocks, so this is a heuristic, not true structure.
sections = extracted_text.split("\n\n")

# Step 4: Count word frequencies, skipping a small stop-word list.
words = re.findall(r"\b\w+\b", extracted_text.lower())
stop_words = {
    "the", "and", "of", "to", "a", "in",
    "on", "for", "is", "with", "this", "that",
}
word_frequencies = Counter(word for word in words if word not in stop_words)

# Step 5: Display results.
print("\n--- Extracted Text Summary ---")
print(f"Total Sections: {len(sections)}")
print("\n--- Top 10 Most Frequent Words ---")
for word, freq in word_frequencies.most_common(10):
    print(f"{word}: {freq}")
print("\n--- Sample Section ---")
print(sections[1] if len(sections) > 1 else "No sections found.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment