Skip to content

Instantly share code, notes, and snippets.

@hepplerj
Created October 28, 2024 13:57
Show Gist options
  • Save hepplerj/f7388b595489fc9446668e9558a29100 to your computer and use it in GitHub Desktop.
Save hepplerj/f7388b595489fc9446668e9558a29100 to your computer and use it in GitHub Desktop.
A Python script using spaCy to look for place names in State of the Union (SOTU) addresses.
# This script uses spaCy NER to find place names in the text files under data/,
# geocodes each place with Nominatim, and writes the results to geolocated_places.json.
import glob
import json
import re
import time
import spacy
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
# spaCy pipeline shared by extract_places(); loaded once at import time.
nlp = spacy.load("en_core_web_sm")
# geopy Nominatim geocoder shared by geolocate_place().
geolocator = Nominatim(user_agent="place_extractor")
def extract_places(text):
    """Identify places in text using spaCy NER.

    Runs the module-level ``nlp`` pipeline over ``text`` and keeps the text of
    every entity whose label is ``GPE`` or ``LOC``.

    Args:
        text: The document text to scan.

    Returns:
        A list of unique place-name strings, in order of first appearance.
    """
    doc = nlp(text)
    place_names = (ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"})
    # dict.fromkeys removes duplicates while keeping first-seen order; going
    # through set() gave a different order from run to run for strings, which
    # made the generated output non-reproducible.
    return list(dict.fromkeys(place_names))
def geolocate_place(place_name, max_retries=3, delay=1):
    """Geolocate place name to get latitude and longitude with retry and delay.

    Args:
        place_name: Name to look up with the module-level ``geolocator``.
        max_retries: Maximum number of geocoding attempts.
        delay: Seconds to sleep before the next attempt after a timeout.

    Returns:
        A dict with keys ``"name"``, ``"lat"`` and ``"lon"``. ``lat`` and
        ``lon`` are ``None`` when the name is blank, the geocoder returns no
        match, or every attempt times out.
    """
    not_found = {"name": place_name, "lat": None, "lon": None}

    # A blank query cannot match anything, so skip the network round trip.
    if place_name is None or (
        isinstance(place_name, str) and not place_name.strip()
    ):
        return not_found

    for attempt in range(max_retries):
        try:
            location = geolocator.geocode(place_name, timeout=10)
        except GeocoderTimedOut:
            # Only timeouts are retried; any other geocoder error still
            # propagates to the caller.
            if attempt < max_retries - 1:
                time.sleep(delay)  # Wait before retrying
            continue

        if location:
            return {
                "name": place_name,
                "lat": location.latitude,
                "lon": location.longitude,
            }
        # The service answered but found nothing; retrying would not help.
        return not_found

    # Every attempt timed out (or max_retries was 0).
    return not_found
def extract_year_and_president(filename):
    """Extract year and president's name from the filename.

    The file name, without directory and extension, is split at its first
    underscore into a date part and a name part, e.g.
    ``data/1790_Washington.txt`` -> ``(1790, "Washington")``.

    Args:
        filename: Path to a text file; ``/`` and ``\\`` are both accepted as
            directory separators.

    Returns:
        A ``(year, president_name)`` tuple. ``year`` is the first run of four
        digits in the date part as an ``int``, or ``None`` if there is none.
        ``president_name`` is the name part with runs of whitespace collapsed
        to single spaces (underscores are kept as-is); it is ``""`` when the
        file name contains no underscore.
    """
    # Accept both separators so paths produced on Windows parse the same way.
    basename = re.split(r"[\\/]", filename)[-1]
    # Strip only the final extension: names such as "Franklin_D._Roosevelt"
    # contain dots of their own and must not be cut at the first one.
    stem = basename.rsplit(".", 1)[0]
    # partition() never raises, unlike unpacking split("_", 1), when the file
    # name has no underscore.
    date_part, _, name_part = stem.partition("_")
    year_match = re.search(r"\d{4}", date_part)
    year = int(year_match.group()) if year_match else None
    president_name = " ".join(name_part.split())
    return year, president_name
def process_files(file_paths):
    """Process each file to extract and geolocate places, then create JSON structure.

    Args:
        file_paths: Iterable of paths to text files.

    Returns:
        A list with one dict per input file, in input order, with keys
        ``"file"`` (the path as given), ``"year"`` and ``"president"`` (from
        ``extract_year_and_president``), and ``"places"`` (one
        ``geolocate_place`` result per place found by ``extract_places``).
    """
    results = []
    for file_path in file_paths:
        # Read as UTF-8 explicitly; open()'s default encoding depends on the
        # platform locale and is not UTF-8 everywhere.
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        year, president = extract_year_and_president(file_path)
        geolocated_places = [
            geolocate_place(place) for place in extract_places(text)
        ]
        results.append(
            {
                "file": file_path,
                "year": year,
                "president": president,
                "places": geolocated_places,
            }
        )
    return results
# Path to text files. glob returns matches in arbitrary order, so sort them to
# keep the order of entries in the output file stable between runs.
file_paths = sorted(glob.glob("data/*.txt"))
structured_data = process_files(file_paths)
# Save JSON output
with open("geolocated_places.json", "w", encoding="utf-8") as f:
    json.dump(structured_data, f, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment