Last active
November 5, 2017 20:59
-
-
Save ajosanchez/49d18e54b22f3f408d3557145564f81a to your computer and use it in GitHub Desktop.
Extract all links that refer to people.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as bs | |
import spacy | |
def hash_text(text, digits=8):
    """Reduce ``text`` to an integer with at most ``digits`` decimal digits.

    The result is the built-in ``hash`` of ``text`` taken modulo
    ``10 ** digits``.

    NOTE(review): on Python 3 the built-in hash of ``str`` is salted per
    process unless PYTHONHASHSEED is fixed, so values may differ between
    runs -- confirm whether these ids must be stable across runs.

    Args:
        text: Any hashable object (the callers in this file pass href strings).
        digits: Number of decimal digits to keep; defaults to 8.

    Returns:
        ``hash(text) % (10 ** digits)``.
    """
    modulus = 10 ** digits
    return hash(text) % modulus
def make_edge_dict(unique_edges):
    """Map the hashed ``href`` of each link to that link's text.

    Links whose ``href`` (or ``text``) cannot be read are reported on stdout
    and skipped rather than aborting the whole batch.

    Args:
        unique_edges: Iterable of link elements exposing ``attrs`` (a mapping
            expected to contain ``'href'``) and ``text``.

    Returns:
        dict: ``{hash_text(href): text}``. If two links produce the same
        hashed href, the later one overwrites the earlier one.
    """
    edge_dict = {}
    for edge in unique_edges:
        # Only the attribute lookups are guarded: a link without an href
        # raises KeyError here. Catching specific exceptions (instead of a
        # bare ``except``) lets KeyboardInterrupt and genuine bugs surface.
        try:
            href = edge.attrs['href']
            label = edge.text
        except (KeyError, AttributeError):
            print("edge error, possible selflink")
            continue
        edge_dict[hash_text(href)] = label
    return edge_dict
def extract_edges(record):
    """Extract the links in an article whose text is tagged as a person.

    The article HTML is parsed, the text of all paragraphs under the element
    with id ``mw-content-text`` is run through the module-level ``nlp``
    pipeline, and every paragraph link whose text equals the text of a
    ``PERSON`` entity is kept as an edge.

    Args:
        record: Mapping with the keys ``'name'``, ``'id'`` and ``'html'``.

    Returns:
        tuple: ``({record['id']: record['name']}, edge_dict)`` where
        ``edge_dict`` is the result of ``make_edge_dict`` for the
        de-duplicated matching links.
    """
    person = record['name']
    person_id = record['id']
    html = bs(record['html'], 'lxml')

    content = " ".join(p.text for p in html.select('#mw-content-text p'))
    links = html.select('#mw-content-text p a')

    tagged_content = nlp(content)
    # Build the set of person names once so each link is checked in O(1)
    # instead of rebuilding a list of names for every link.
    person_names = {
        ent.text for ent in tagged_content.ents if ent.label_ == 'PERSON'
    }

    edges = [link for link in links if link.text in person_names]
    unique_edges = list(set(edges))
    return ({person_id: person}, make_edge_dict(unique_edges))
# Load the spaCy pipeline once; extract_edges reads it as the global ``nlp``.
nlp = spacy.load('en_core_web_md')

# One (node, edges) tuple per article record, in input order.
# NOTE(review): ``articles`` is not defined in this file -- it must be
# provided before this point.
adjacency_list = [extract_edges(record) for record in articles]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment