Created
March 11, 2019 17:44
-
-
Save danecjensen/1478f241394d51f8fcb1e6060003cb2d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import time | |
import pandas as pd | |
import datadotworld as ddw | |
import re | |
import string | |
from pyparsing import anyOpenTag, anyCloseTag | |
from xml.sax.saxutils import unescape as unescape | |
def unescape_xml_entities(s):
    """Unescape XML/HTML entities in *s*, including &apos;, &quot; and &nbsp;
    on top of the defaults (&amp;, &lt;, &gt;) handled by xml.sax.saxutils.unescape."""
    # NOTE(review): the pasted gist had this dict mangled by HTML entity
    # decoding ({"'": "'", """: ...} — a syntax error); this restores the
    # obvious intended entity map.
    return unescape(s, {"&apos;": "'", "&quot;": '"', "&nbsp;": " "})
# Pyparsing grammar that deletes any HTML/XML open or close tag when text is
# run through stripper.transformString(...); used below to strip markup.
stripper = (anyOpenTag | anyCloseTag).suppress()
def speaker_schedule():
    """Scrape the SXSW 2019 speaker schedule and upload it to data.world.

    Walks the alphabetical index pages (A-Z), follows each speaker's detail
    page, collects raw HTML fragments into a DataFrame, strips the markup,
    and writes the result as speaker_schedule_2019.csv.
    """
    df = pd.DataFrame(columns=['name', 'title_and_location', 'summary', 'date', 'event_time'])
    index_start = 0
    for letter in string.ascii_uppercase:
        print(letter)
        # Retry each index page up to 3 times; skip the letter entirely on
        # failure (the original fell through and parsed the failed response).
        for _ in range(3):
            main_page = requests.get(f'https://schedule.sxsw.com/2019/speakers/alpha/{letter}.html')
            if main_page.status_code == 200:
                break
        else:
            print(f'{letter}s failed to load')
            continue
        main_soup = BeautifulSoup(main_page.content, 'html.parser')
        ind_events = main_soup.find_all('div', class_='row single-event')
        for i, event in enumerate(ind_events, start=index_start):
            match = re.search(r'(/2019/speakers/\d+)', str(event))
            if match is None:  # defensive: listing row without a detail link
                continue
            url = 'https://schedule.sxsw.com' + match.group()
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            # Store raw HTML fragments; markup is stripped in bulk below.
            summary = str(soup.find('div', class_='row speaker-bio'))
            title_and_location = str(soup.find_all('div', class_='small-12 columns event-details'))
            name = str(soup.find('h4'))
            date = str(soup.find('div', class_='date'))
            event_time = str(soup.find('div', class_='time'))
            df.loc[i, :] = [name, title_and_location, summary, date, event_time]
            time.sleep(0.1)  # be polite to the server
        index_start += len(ind_events)
        print(df.shape)

    def extract_time(x):
        """Drop the wrapping <div class="time"> ... </div> tags."""
        t = re.sub(r'<div class="time">', '', x)
        return re.sub(r'</div>', '', t)

    def extract_title_name(x):
        """Strip tags, unescape entities, and trim the list brackets."""
        return unescape_xml_entities(stripper.transformString(x)).strip('[]')

    def location_split(text):
        """Return the venue portion after the last ' at ' in the title, else None."""
        # Original checked ' at ' in text.lower() but split on ' at ' — a
        # case mismatch that returned the whole string for ' At '; use a
        # consistent case-sensitive check.
        if isinstance(text, str) and ' at ' in text:
            return text.split(' at ')[-1]
        return None

    df['summary'] = df['summary'].apply(extract_title_name)
    df['date'] = df['date'].apply(lambda x: unescape_xml_entities(stripper.transformString(x)))
    df['event_time'] = df['event_time'].apply(extract_time)
    df['name'] = df['name'].apply(extract_title_name)
    df['name'] = df['name'].apply(lambda x: x.replace('Events featuring ', ''))
    df['title_and_location'] = df['title_and_location'].apply(extract_title_name)
    df['venue'] = df['title_and_location'].apply(location_split)
    with ddw.open_remote_file('sparklesquad/sxsw-schedule', 'speaker_schedule_2019.csv', mode='w') as f:
        df.to_csv(f, index=False)
def music_schedule():
    """Scrape the SXSW 2019 artist/music schedule and upload it to data.world.

    Mirrors speaker_schedule(): walks the A-Z artist index pages, fetches
    each artist's detail page, collects raw HTML fragments into a DataFrame,
    strips the markup, and writes the result as music_schedule_2019.csv.
    """
    df = pd.DataFrame(columns=['name', 'summary', 'genre', 'subgenre', 'home', 'audio',
                               'title_and_location', 'date', 'event_time'])
    index_start = 0
    for letter in string.ascii_uppercase:
        print(letter)
        # Retry each index page up to 3 times; skip the letter entirely on
        # failure (the original fell through and parsed the failed response).
        for _ in range(3):
            main_page = requests.get(f'https://schedule.sxsw.com/2019/artists/alpha/{letter}.html')
            if main_page.status_code == 200:
                break
        else:
            print(f'{letter}s failed to load')
            continue
        main_soup = BeautifulSoup(main_page.content, 'html.parser')
        ind_events = main_soup.find_all('div', class_='row single-event')
        for i, event in enumerate(ind_events, start=index_start):
            match = re.search(r'(/2019/artists/\d+)', str(event))
            if match is None:  # defensive: listing row without a detail link
                continue
            url = 'https://schedule.sxsw.com' + match.group()
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                genre, subgenre, home = soup.select('div div div p div')
                genre, subgenre, home = str(genre), str(subgenre), str(home)
            except ValueError:  # page doesn't expose exactly the 3 expected divs
                genre, subgenre, home = '', '', ''
            # Store raw HTML fragments; markup is stripped in bulk below.
            audio = str(soup.select('audio'))
            title_and_location = str(soup.find_all('div', class_='small-12 columns event-details'))
            name = str(soup.find_all('h1', class_='artist-name'))
            date = str(soup.find('div', class_='date'))
            event_time = str(soup.find('div', class_='time'))
            summary = str(soup.find('div', class_='large-8 small-12 columns'))
            df.loc[i, :] = [name, summary, genre, subgenre, home, audio,
                            title_and_location, date, event_time]
            time.sleep(0.1)  # be polite to the server
        index_start += len(ind_events)
        print(df.shape)

    def extract_genre(x):
        """Strip tags/entities and return the text after the 'Label:' prefix, if any."""
        t = unescape_xml_entities(stripper.transformString(x))
        try:
            t = t.split(':')[1]
        except IndexError:  # no 'Label:' prefix present — keep the whole string
            pass
        return t.strip()

    def extract_time(x):
        """Strip tags/entities, drop non-ASCII characters, and dash-join the words."""
        t = unescape_xml_entities(stripper.transformString(x))
        t = t.encode('ascii', 'ignore').decode('utf-8')
        return t.replace(' ', '-')

    def extract_audio(x):
        """Pull the src URL out of '[<audio src="...">...</audio>]' markup ('' if absent).

        The original used str.strip with character *sets*, which could eat
        legitimate leading/trailing URL characters; match the attribute instead.
        """
        m = re.search(r'src="([^"]*)"', x)
        return m.group(1) if m else ''

    def extract_title_name(x):
        """Strip tags, unescape entities, and trim the list brackets."""
        return unescape_xml_entities(stripper.transformString(x)).strip('[]')

    def location_split(text):
        """Return the venue portion after the last ' at ' in the title, else None."""
        # Consistent case-sensitive check + split (the original lowercased
        # only the membership test).
        if isinstance(text, str) and ' at ' in text:
            return text.split(' at ')[-1]
        return None

    df['summary'] = df['summary'].apply(lambda x: unescape_xml_entities(stripper.transformString(x)))
    df['date'] = df['date'].apply(lambda x: unescape_xml_entities(stripper.transformString(x)))
    df['genre'] = df['genre'].apply(extract_genre)
    df['subgenre'] = df['subgenre'].apply(extract_genre)
    df['home'] = df['home'].apply(extract_genre)
    df['event_time'] = df['event_time'].apply(extract_time)
    df['audio'] = df['audio'].apply(extract_audio)
    df['title_and_location'] = df['title_and_location'].apply(extract_title_name)
    df['name'] = df['name'].apply(extract_title_name)
    df['venue'] = df['title_and_location'].apply(location_split)
    with ddw.open_remote_file('sparklesquad/sxsw-schedule', 'music_schedule_2019.csv', mode='w') as f:
        df.to_csv(f, index=False)
def main():
    """Scrape and upload both SXSW 2019 schedules (speakers, then music)."""
    for job in (speaker_schedule, music_schedule):
        job()
# Run both scrapers only when executed as a script (not on import).
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment