Last active
December 5, 2020 04:24
-
-
Save shushiej/ee0bcea780adda7ccd9118fd0d3c5e5c to your computer and use it in GitHub Desktop.
Extracting JRE YouTube data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pafy | |
def convert_to_min(x):
    """Return the whole-minute length of a duration value.

    ``x`` is stringified and split on ``":"``. A three-part value is read
    as ``hours:minutes:seconds`` and yields ``hours * 60 + minutes``; any
    other shape yields its first field as an integer. The seconds field is
    never parsed, so the result is truncated rather than rounded.
    """
    parts = str(x).split(":")
    if len(parts) != 3:
        return int(parts[0])
    hours, minutes = int(parts[0]), int(parts[1])
    return hours * 60 + minutes
def get_guest(x):
    """Extract the guest name from an episode title.

    The title is stringified and searched for a separator: first ``"-"``,
    then ``"with"``. Everything after the *first* occurrence of the
    matching separator is returned with surrounding whitespace stripped,
    so names that themselves contain a hyphen (or a later "with") are kept
    whole instead of being cut at the second separator.

    If neither separator is present, ``x`` is returned unchanged (and
    therefore keeps its original type).
    """
    title = str(x)
    for separator in ("-", "with"):
        # partition() splits on the first occurrence only; `found` is empty
        # when the separator does not appear in the title.
        _, found, remainder = title.partition(separator)
        if found:
            return remainder.strip()
    return x
# Replace the placeholder with a real YouTube Data API key before running.
pafy.set_api_key("YOUR_API_KEY")

# Playlists whose items are collected below.
all_jre_playlists = [
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kt_vFRd3OyzqQ78Go4c1fyn",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuQyLE4RjEOdJ_-0epbcBb4",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvXucAFMo5Tc5p8e_mcc-5g",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtYIPnFjpI19BCz2unzWYlJ",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kvv8T6ZESpJ2nvEHT9xBhlb",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvtMA4mCQSnzGsZe8qsTdzV",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Ku0Oa3t8MQjV7D_G_PBi8g1",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuU_aJDvMPPAy_SoxXTt_ub",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtVQWWnE_V6-sypm5zUMkU6",
]

# Fetch every playlist and flatten all of their items into a single list.
all_jre_info = []
for plurl in all_jre_playlists:
    playlist = pafy.get_playlist(plurl)
    all_jre_info.extend(playlist['items'])

# Printed so the count is visible when run as a script, not only in a notebook.
print(len(all_jre_info))
# 1325 (value recorded by the original author)
yt_jre = pd.DataFrame.from_dict(all_jre_info)

# Expand each item's 'playlist_meta' dict into one column per key.
# Building the frame from the list of dicts avoids constructing a Series
# per row, which is what `.apply(pd.Series)` does.
yt_jre = pd.DataFrame(yt_jre['playlist_meta'].tolist(), index=yt_jre.index)

# 'time_created' is interpreted as seconds since the Unix epoch.
yt_jre['timestamp_created'] = pd.to_datetime(yt_jre['time_created'], unit='s')

# Split the creation timestamp into separate year / month / day columns.
created = yt_jre['timestamp_created'].dt
yt_jre['year'] = created.year
yt_jre['month'] = created.month
yt_jre['day'] = created.day

# Extract the episode number that follows '#' in the title. The pattern is a
# raw string (so `\d` is not an invalid string escape) and requires at least
# one digit, so titles without a number give NaN instead of an empty string.
yt_jre['Episode'] = yt_jre['title'].str.extract(r'#(\d+)', expand=False)
yt_jre['guest'] = yt_jre['title'].apply(get_guest)

# Convert the duration into an integer number of minutes.
yt_jre['duration_minutes'] = yt_jre['duration'].apply(convert_to_min)

# Strip the thousands separators (literal commas) and parse views as integers.
yt_jre['views_raw'] = yt_jre['views'].str.replace(",", "", regex=False).astype(int)
# Some basic stats. The values are printed so they are visible when this runs
# as a script; the trailing comments are the figures recorded by the original
# author.
print("mean views:", yt_jre['views_raw'].mean())
# 1323460.4403323263
print("mean duration (minutes):", yt_jre['duration_minutes'].mean())
# 150.0725075528701
print("mean likes:", yt_jre['likes'].mean())
# 16713.877643504533
print("mean dislikes:", yt_jre['dislikes'].mean())
# 1566.5430513595165
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment