sdg-1 · January 18, 2025 23:22
diff --git a/pdf_miner_example.py b/pdf_miner_example.py
 import requests
 from pdfminer.high_level import extract_text
 from collections import Counter
 import re

 # Step 1: Download the PDF and save it to disk.
 # NOTE(review): the URL points at arxiv.org while the local filename says
 # "nasa_flight_plan"; the filename is left unchanged so the output path stays
 # the same -- confirm which document is actually intended.
 pdf_url = "https://arxiv.org/pdf/1509.02971.pdf"
 pdf_filename = "nasa_flight_plan.pdf"

 # Without a timeout, requests waits indefinitely on a stalled connection.
 response = requests.get(pdf_url, timeout=30)
 # Fail fast on 4xx/5xx so an HTTP error page is never written out as a "PDF"
 # and then handed to pdfminer.
 response.raise_for_status()

 with open(pdf_filename, "wb") as pdf_file:
      pdf_file.write(response.content)

 # Step 2: Extract the text of the saved PDF as a single string.
 extracted_text = extract_text(pdf_filename)

 # Step 3: Process the text.
 # Blank lines (double newlines) are used as the section separator.
 sections = extracted_text.split("\n\n")


 # Count word frequencies on the lower-cased text, skipping a small
 # hand-picked set of common stop words.
 words = re.findall(r"\b\w+\b", extracted_text.lower())
 stop_words = {
      "the",
      "and",
      "of",
      "to",
      "a",
      "in",
      "on",
      "for",
      "is",
      "with",
      "this",
      "that",
 }
 word_frequencies = Counter(word for word in words if word not in stop_words)

 # Step 4: Display results.
 print("\n--- Extracted Text Summary ---")
 print(f"Total Sections: {len(sections)}")


 print("\n--- Top 10 Most Frequent Words ---")
 for word, freq in word_frequencies.most_common(10):
      print(f"{word}: {freq}")

 # The second chunk is shown as the sample; fall back to a message when the
 # split produced fewer than two chunks.
 print("\n--- Sample Section ---")
 print(sections[1] if len(sections) > 1 else "No sections found.")
	import requests
	from pdfminer.high_level import extract_text
	from collections import Counter
	import re

	# Step 1: Download the PDF and save it to disk.
	# NOTE(review): the URL points at arxiv.org while the local filename says
	# "nasa_flight_plan"; the filename is left unchanged so the output path stays
	# the same -- confirm which document is actually intended.
	pdf_url = "https://arxiv.org/pdf/1509.02971.pdf"
	pdf_filename = "nasa_flight_plan.pdf"

	# Without a timeout, requests waits indefinitely on a stalled connection.
	response = requests.get(pdf_url, timeout=30)
	# Fail fast on 4xx/5xx so an HTTP error page is never written out as a "PDF"
	# and then handed to pdfminer.
	response.raise_for_status()

	# The write must be nested under the `with` header; at the same indent as
	# the header the script does not parse.
	with open(pdf_filename, "wb") as pdf_file:
	    pdf_file.write(response.content)

	# Step 2: Extract the text of the saved PDF as a single string.
	extracted_text = extract_text(pdf_filename)

	# Step 3: Process the text.
	# Blank lines (double newlines) are used as the section separator.
	sections = extracted_text.split("\n\n")


	# Count word frequencies on the lower-cased text, skipping a small
	# hand-picked set of common stop words.
	words = re.findall(r"\b\w+\b", extracted_text.lower())
	stop_words = {
	    "the",
	    "and",
	    "of",
	    "to",
	    "a",
	    "in",
	    "on",
	    "for",
	    "is",
	    "with",
	    "this",
	    "that",
	}
	word_frequencies = Counter(word for word in words if word not in stop_words)

	# Step 4: Display results.
	print("\n--- Extracted Text Summary ---")
	print(f"Total Sections: {len(sections)}")


	print("\n--- Top 10 Most Frequent Words ---")
	for word, freq in word_frequencies.most_common(10):
	    print(f"{word}: {freq}")

	# The second chunk is shown as the sample; fall back to a message when the
	# split produced fewer than two chunks.
	print("\n--- Sample Section ---")
	print(sections[1] if len(sections) > 1 else "No sections found.")