Get lists of LC vocabulary terms and URIs from id.loc.gov
"""This script traverses all narrower terms of a http://id.loc.gov/ thesaurus | |
(or all terms of a term list) starting at a given term within the tree (replace | |
seedterm in the main code block with your URI of choice) and adds the URI and | |
label to a list. Outputs in CSV and JSON as well as JSONL as patterns for use in | |
rule-based NER with the NLP tool SpaCy. | |
(More info at: https://spacy.io/usage/rule-based-matching#entityruler) | |
NOTE the 5-second rate limit courtesy to the LC servers working hard for your | |
controlled vocabulary needs (see queryTerms() function). You might get away with | |
less, but don't be a jerk about it. | |
""" | |
import requests
import csv
import json
import time
#comment this out if you don't need a JSONL patterns file for NER in spaCy and don't want to install jsonlines
import jsonlines
def queryTerm(request_url):
    """Query the id.loc.gov service for the JSON (MADS/RDF and SKOS/RDF) representation of a given URI and return the parsed JSON-LD response (a list of node objects)"""
    #go easy on the LC servers
    time.sleep(5)
    headers = {'accept': 'application/json'}
    query = requests.get(request_url, headers=headers)
    if query.status_code == 200:
        response = json.loads(query.content)
    else:
        response = None
    return response
def addTerm(request_url, termlist):
    """Recursively add thesaurus terms to a list by traversing the tree through narrower terms"""
    response = queryTerm(request_url)
    if response is not None:
        term = getTermInfo(request_url, response)
        if len(term) > 0:
            termlist.append(term)
        #if there are narrower terms, parse those
        for r in response:
            if 'http://www.w3.org/2004/02/skos/core#narrower' in r:
                narrower = r['http://www.w3.org/2004/02/skos/core#narrower']
                for n in narrower:
                    n_url = n['@id']
                    termlist = addTerm(n_url, termlist)
        #this works for term lists instead of thesauri
        for r in response:
            if 'http://www.loc.gov/mads/rdf/v1#hasMADSSchemeMember' in r:
                narrower = r['http://www.loc.gov/mads/rdf/v1#hasMADSSchemeMember']
                for n in narrower:
                    n_url = n['@id']
                    termlist = addTerm(n_url, termlist)
    return termlist
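#For reference (a simplified, hypothetical sketch, not a real response): the .json
#representations served by id.loc.gov are JSON-LD, i.e. a list of node objects keyed
#by '@id' and predicate URIs. addTerm() walks entries shaped roughly like:
#  {'@id': 'http://id.loc.gov/authorities/performanceMediums/<some-id>',
#   'http://www.w3.org/2004/02/skos/core#narrower': [{'@id': 'http://id.loc.gov/authorities/performanceMediums/<narrower-id>'}]}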
def getTermInfo(request_url, response):
    """Get the URI and label for a term. If you want to include more info about the term, you can do it here"""
    term = {}
    #iterate through objects in the response to find the one with the request_url as its id
    for r in response:
        if r['@id'] == request_url:
            if 'http://www.loc.gov/mads/rdf/v1#authoritativeLabel' in r:
                #add URI and label for the term to the master list
                term['uri'] = r['@id']
                term['prefLabel'] = None
                print(request_url)
                for p in r['http://www.loc.gov/mads/rdf/v1#authoritativeLabel']:
                    if '@language' in p:
                        #prefer the English label when labels carry language tags
                        if p['@language'] == 'en':
                            term['prefLabel'] = p['@value']
                    else:
                        term['prefLabel'] = p['@value']
                print(term['prefLabel'])
    return term
def writeCsv(termlist, filestem):
    """Write output to a CSV file"""
    filename = filestem + '.csv'
    #newline='' keeps the csv module from writing blank rows on Windows
    f = open(filename, 'w', newline='')
    fieldnames = ['uri', 'prefLabel']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for t in termlist:
        writer.writerow(t)
    f.close()
def writeJson(termlist, filestem):
    """Write output to a JSON file"""
    filename = filestem + '.json'
    f = open(filename, 'w')
    json.dump(termlist, f)
    f.close()
def writePatternsJsonl(termlist, label, folderpath=''):
    """Write output to a patterns.jsonl file to use for rule-based NER
    with spaCy"""
    patternlist = []
    for t in termlist:
        if t['prefLabel'] is not None:
            listitem = {}
            listitem['label'] = label
            patterns = []
            #one token-level pattern per word: lowercased, with periods stripped
            for s in t['prefLabel'].split(' '):
                pattern = {}
                pattern['LOWER'] = s.lower().replace('.', '')
                patterns.append(pattern)
            listitem['pattern'] = patterns
            patternlist.append(listitem)
    filename = folderpath + 'patterns.jsonl'
    f = open(filename, 'w')
    writer = jsonlines.Writer(f)
    writer.write_all(patternlist)
    writer.close()
    f.close()
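#As an illustration (hypothetical term and label): a prefLabel of "Piano (2)" with
#label='INSTRUMENT' would be written to patterns.jsonl roughly as
#  {"label": "INSTRUMENT", "pattern": [{"LOWER": "piano"}, {"LOWER": "(2)"}]}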
if __name__ == "__main__": | |
termlist = [] | |
#change the seedterm to your top-level term URI | |
seedterm = 'http://id.loc.gov/authorities/performanceMediums/mp2013015382' | |
termlist = addTerm(seedterm, termlist) | |
#dedupe list--some terms have multiple parents | |
termlist = [dict(t) for t in {tuple(d.items()) for d in termlist}] | |
#name your CSV and JSON files here (without extension) | |
folderpath = '' | |
filestem = folderpath + 'lc_performance_mediums_pianos' | |
#comment out any of the below that you don't want | |
#write to CSV | |
writeCsv(termlist, filestem) | |
#write to JSON | |
writeJson(termlist, filestem) | |
#write terms only to patterns.jsonl file for use with SpaCy | |
#change this label to whatever you want to call this custom entity type in SpaCy NER | |
label = 'INSTRUMENT' | |
    writePatternsJsonl(termlist, label, folderpath)
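Once patterns.jsonl has been written, it can be loaded into spaCy's EntityRuler along the lines of the sketch below (a minimal example assuming spaCy 2.x and an installed English model such as en_core_web_sm; the sample sentence is made up, and whether anything is tagged depends on which labels ended up in your term list):

import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")
#load the JSONL patterns produced above and run the ruler before the statistical NER
ruler = EntityRuler(nlp).from_disk("patterns.jsonl")
nlp.add_pipe(ruler, before="ner")
doc = nlp("Sonata for piano, four hands")
print([(ent.text, ent.label_) for ent in doc.ents])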