Created
October 28, 2024 13:57
-
-
Save hepplerj/f7388b595489fc9446668e9558a29100 to your computer and use it in GitHub Desktop.
A Python script that uses spaCy to find place names in State of the Union (SOTU) addresses.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The following script uses NLP to identify place names in the text files under the data/ directory and geocodes them.
import glob | |
import json | |
import re | |
import time | |
import spacy | |
from geopy.exc import GeocoderTimedOut | |
from geopy.geocoders import Nominatim | |
# Load the spaCy "en_core_web_sm" pipeline once at import time;
# extract_places() reuses it for every document.
nlp = spacy.load("en_core_web_sm")
# Shared Nominatim geocoder client used by geolocate_place(); user_agent is the
# identifier this script sends with its requests.
geolocator = Nominatim(user_agent="place_extractor")
def extract_places(text):
    """Identify places in text using spaCy NER.

    Runs the module-level ``nlp`` pipeline over ``text`` and keeps the
    entities whose label is ``GPE`` or ``LOC``.

    Args:
        text: Raw document text to scan.

    Returns:
        A list of unique entity strings, in order of first appearance.
    """
    doc = nlp(text)
    matches = (ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"})
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # list(set(...)) returned a different order from run to run (string hash
    # randomisation), which made the generated JSON non-reproducible.
    return list(dict.fromkeys(matches))
def geolocate_place(place_name, max_retries=3, delay=1):
    """Geolocate place name to get latitude and longitude with retry and delay.

    Args:
        place_name: Name to look up with the module-level ``geolocator``.
        max_retries: Maximum number of geocoding attempts. With zero or a
            negative value no request is made.
        delay: Seconds to sleep after a timed-out attempt before the next one.

    Returns:
        A dict with the keys ``"name"``, ``"lat"`` and ``"lon"``. ``"lat"``
        and ``"lon"`` are ``None`` when the name is blank, when the geocoder
        returns no match, or when every attempt timed out.
    """
    unresolved = {"name": place_name, "lat": None, "lon": None}

    # A blank query cannot match anything; skip the network round trip.
    if not place_name or not place_name.strip():
        return unresolved

    for attempt in range(max_retries):
        try:
            location = geolocator.geocode(place_name, timeout=10)
        except GeocoderTimedOut:
            # Pause only when another attempt will follow.
            if attempt < max_retries - 1:
                time.sleep(delay)
            continue
        if not location:
            return unresolved
        return {
            "name": place_name,
            "lat": location.latitude,
            "lon": location.longitude,
        }

    # Retries exhausted (or max_retries <= 0).
    return unresolved
def extract_year_and_president(filename):
    """Extract year and president's name from the filename.

    The file's base name (directories and the final extension removed) is
    read as ``<date>_<name>``: the text before the first underscore is
    searched for a four-digit year, the text after it is the name.

    Args:
        filename: Path or bare file name, e.g. ``"data/1790_Washington.txt"``.

    Returns:
        A ``(year, president_name)`` tuple. ``year`` is an ``int``, or
        ``None`` when the date part holds no run of four digits.
        ``president_name`` has its whitespace collapsed to single spaces and
        is ``""`` when the base name contains no underscore.
    """
    import os  # local import keeps this block self-contained

    # splitext drops only the final extension, so dots inside the name (such
    # as a middle initial) survive; cutting at the first "." truncated them.
    stem = os.path.splitext(os.path.basename(filename))[0]
    # partition never raises: a stem without "_" yields an empty name part
    # instead of the ValueError raised by unpacking split("_", 1).
    date_part, _, name_part = stem.partition("_")
    year_match = re.search(r"\d{4}", date_part)
    year = int(year_match.group()) if year_match else None
    president_name = " ".join(name_part.split())
    return year, president_name
def process_files(file_paths):
    """Process each file to extract and geolocate places, then create JSON structure.

    Args:
        file_paths: Iterable of paths to the text files to process.

    Returns:
        A list with one dict per input file, in input order, holding the keys
        ``"file"``, ``"year"``, ``"president"`` and ``"places"`` (the list of
        dicts returned by ``geolocate_place``).
    """
    results = []
    # Successful lookups keyed by place name, so a name that recurs across
    # files costs a single geocoder request. Unresolved names are not cached
    # and are therefore retried the next time they appear.
    resolved = {}
    for file_path in file_paths:
        # Explicit encoding so decoding does not depend on the platform
        # default. NOTE(review): assumes the texts are UTF-8 - confirm.
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        year, president = extract_year_and_president(file_path)

        geolocated_places = []
        for place in extract_places(text):
            hit = resolved.get(place)
            if hit is None:
                hit = geolocate_place(place)
                if hit["lat"] is not None:
                    resolved[place] = hit
            # Copy so records of different files never share one dict object.
            geolocated_places.append(dict(hit))

        results.append(
            {
                "file": file_path,
                "year": year,
                "president": president,
                "places": geolocated_places,
            }
        )
    return results
# Path to text files. Sorted because glob returns matches in arbitrary,
# filesystem-dependent order and the output file should be reproducible.
file_paths = sorted(glob.glob("data/*.txt"))
structured_data = process_files(file_paths)

# Save JSON output
with open("geolocated_places.json", "w", encoding="utf-8") as f:
    json.dump(structured_data, f, indent=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment