Created
January 18, 2025 23:22
-
-
Save sdg-1/330af609884e4d9216afa9ad2c1ea827 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from pdfminer.high_level import extract_text | |
from collections import Counter | |
import re | |
# Source document and the local path it is saved to.
# NOTE(review): the local file name does not obviously match the arXiv URL
# above it -- confirm the intended document before relying on the name.
pdf_url = "https://arxiv.org/pdf/1509.02971.pdf"
pdf_filename = "nasa_flight_plan.pdf"

# Very common English words left out of the frequency count.
stop_words = {
    "the",
    "and",
    "of",
    "to",
    "a",
    "in",
    "on",
    "for",
    "is",
    "with",
    "this",
    "that",
}


def download_pdf(url: str, destination: str, timeout: float = 30.0) -> None:
    """Download *url* and write the response body to *destination*.

    Args:
        url: Address of the PDF to fetch.
        destination: Local path the raw bytes are written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    # Fail here instead of saving an HTTP error page under a ".pdf" name.
    response.raise_for_status()
    with open(destination, "wb") as pdf_file:
        pdf_file.write(response.content)


def split_sections(text: str) -> list:
    """Split *text* on blank lines and drop whitespace-only chunks.

    Returns:
        The non-blank chunks, in document order, with their text unmodified.
    """
    # str.split always yields at least one item (``[""]`` for empty text) and
    # produces empty chunks for runs of newlines; neither is a real section.
    return [chunk for chunk in text.split("\n\n") if chunk.strip()]


def count_word_frequencies(text: str, ignored=None) -> Counter:
    """Count lower-cased words in *text*, skipping the words in *ignored*.

    Args:
        text: Text to tokenize.
        ignored: Container of lower-case words to leave out; defaults to the
            module-level ``stop_words`` set.

    Returns:
        A ``Counter`` mapping each remaining word to its number of occurrences.
    """
    if ignored is None:
        ignored = stop_words
    words = re.findall(r"\b\w+\b", text.lower())
    return Counter(word for word in words if word not in ignored)


def pick_sample_section(sections) -> str:
    """Return a section to show as a sample.

    The second section is preferred, as in the original script; if there is
    only one section that one is returned, and the literal
    ``"No sections found."`` is returned only when *sections* is empty.
    """
    if len(sections) > 1:
        return sections[1]
    if sections:
        return sections[0]
    return "No sections found."


def main() -> None:
    """Download the PDF, extract its text and print a short summary."""
    download_pdf(pdf_url, pdf_filename)

    # Extract text from the PDF
    extracted_text = extract_text(pdf_filename)

    # Process the text
    sections = split_sections(extracted_text)
    word_frequencies = count_word_frequencies(extracted_text)

    # Display results
    print("\n--- Extracted Text Summary ---")
    print(f"Total Sections: {len(sections)}")
    print("\n--- Top 10 Most Frequent Words ---")
    for word, freq in word_frequencies.most_common(10):
        print(f"{word}: {freq}")
    print("\n--- Sample Section ---")
    print(pick_sample_section(sections))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment