shushiej · December 5, 2020 04:24
diff --git a/pafy_jre_yt.py b/pafy_jre_yt.py
 import pandas as pd
 import pafy

 # Method to get the total minutes from the Duration Column
 def convert_to_min(x):
    """Return the whole minutes encoded in a colon-separated duration.

    The value is stringified and split on ":". A three-field value is read
    as hours:minutes:seconds and yields ``hours * 60 + minutes``; the
    seconds field is never parsed. Any other field count yields the first
    field converted with ``int``.
    """
    fields = str(x).split(":")
    if len(fields) != 3:
        return int(fields[0])
    hours, minutes = fields[0], fields[1]
    return int(hours) * 60 + int(minutes)

 # Use string manipulation to get the Guest Name out of the Title
 def get_guest(x):
    """Extract the guest name from an episode title.

    The title is stringified and split once at the first "-"; when it has
    no "-", it is split once at the first "with". Everything after that
    separator is returned with surrounding whitespace stripped. Splitting
    only once keeps names that themselves contain the separator (for
    example "Jean-Claude ...") intact instead of cutting them short.

    Args:
        x: Episode title. Non-string values are stringified for matching.

    Returns:
        The stripped text after the first separator, or ``x`` itself
        (unconverted) when the title contains neither "-" nor "with".
    """
    title = str(x)
    # "-" takes priority over "with", matching the order of the checks.
    # NOTE: "with" is matched as a plain substring, so it can also match
    # inside a longer word.
    for separator in ("-", "with"):
        _, found, remainder = title.partition(separator)
        if found:
            return remainder.strip()
    return x

 # Hand pafy the API key (placeholder value) before any playlist lookup.
 pafy.set_api_key("YOUR_API_KEY")

 # Playlist URLs whose items are gathered below.
 all_jre_playlists = [
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kt_vFRd3OyzqQ78Go4c1fyn",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuQyLE4RjEOdJ_-0epbcBb4",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvXucAFMo5Tc5p8e_mcc-5g",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtYIPnFjpI19BCz2unzWYlJ",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kvv8T6ZESpJ2nvEHT9xBhlb",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvtMA4mCQSnzGsZe8qsTdzV",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Ku0Oa3t8MQjV7D_G_PBi8g1",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuU_aJDvMPPAy_SoxXTt_ub",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtVQWWnE_V6-sypm5zUMkU6",
 ]

 # Gather the items of every playlist into one flat list.
 all_jre_info = []
 for plurl in all_jre_playlists:
    playlist = pafy.get_playlist(plurl)
    all_jre_info.extend(playlist['items'])

 len(all_jre_info)
 # 1325

 # Build a DataFrame with one row per collected playlist item.
 yt_jre = pd.DataFrame.from_dict(all_jre_info)

 # Expand each 'playlist_meta' entry into its own columns; from here on
 # yt_jre holds only those expanded columns.
 # NOTE(review): assumes every 'playlist_meta' value is a dict-like record.
 yt_jre = yt_jre['playlist_meta'].apply(pd.Series)

 # Parse 'time_created' as a count of seconds (unit='s') into datetimes.
 yt_jre['timestamp_created'] = pd.to_datetime(yt_jre['time_created'], unit='s')

 # Split the creation timestamp into separate year, month and day columns.
 yt_jre['year'] = pd.DatetimeIndex(yt_jre['timestamp_created']).year
 yt_jre['month'] = pd.DatetimeIndex(yt_jre['timestamp_created']).month
 yt_jre['day'] = pd.DatetimeIndex(yt_jre['timestamp_created']).day

 # Extract the digits that follow '#' in the title as the episode number.
 # The pattern is a raw string so that \d reaches the regex engine verbatim
 # instead of relying on Python passing an unrecognized escape through.
 yt_jre['Episode'] = yt_jre['title'].str.extract(r'#(\d*)', expand=True)
 yt_jre['guest'] = yt_jre['title'].apply(get_guest)

 # Convert the duration into an integer number of minutes.
 yt_jre['duration_minutes'] = yt_jre['duration'].apply(convert_to_min)

 # Drop the commas (treated as literal text, not a regex) and parse as int.
 yt_jre['views_raw'] = yt_jre['views'].str.replace(",", "", regex=False)
 yt_jre['views_raw'] = yt_jre['views_raw'].astype(int)

 # Some basic stats. These are bare expressions, so the values are only
 # displayed in an interactive session; the recorded results are kept below.
 yt_jre['views_raw'].mean()
 # 1323460.4403323263
 yt_jre['duration_minutes'].mean()
 # 150.0725075528701
 yt_jre['likes'].mean()
 # 16713.877643504533
 yt_jre['dislikes'].mean()
 # 1566.5430513595165
	import pandas as pd
	import pafy

	# Method to get the total minutes from the Duration Column
	def convert_to_min(x):
		"""Return the whole minutes encoded in a colon-separated duration.

		The value is stringified and split on ":". A three-field value is
		read as hours:minutes:seconds and yields ``hours * 60 + minutes``.
		Any other field count yields the first field converted with ``int``.

		Args:
			x: Duration value; converted with ``str`` before splitting.

		Returns:
			The duration as an ``int`` number of minutes.

		Raises:
			ValueError: If a field that is parsed is not an integer literal.
		"""
		parts = str(x).split(":")
		if len(parts) == 3:
			# The seconds field is dropped, not rounded.
			return int(parts[0]) * 60 + int(parts[1])
		return int(parts[0])

	# Use string manipulation to get the Guest Name out of the Title
	def get_guest(x):
		"""Extract the guest name from an episode title.

		The title is stringified and split once at the first "-"; when it
		has no "-", it is split once at the first "with". Everything after
		that separator is returned with surrounding whitespace stripped.
		Splitting only once keeps names that themselves contain the
		separator (for example "Jean-Claude ...") intact.

		Args:
			x: Episode title. Non-string values are stringified for matching.

		Returns:
			The stripped text after the first separator, or ``x`` itself
			(unconverted) when the title contains neither "-" nor "with".
		"""
		title = str(x)
		# "-" takes priority over "with", matching the order of the checks.
		# NOTE: "with" is matched as a plain substring, so it can also match
		# inside a longer word.
		for separator in ("-", "with"):
			_, found, remainder = title.partition(separator)
			if found:
				return remainder.strip()
		return x

	# Hand pafy the API key (placeholder value) before any playlist lookup.
	pafy.set_api_key("YOUR_API_KEY")

	# Playlist URLs whose items are gathered below.
	all_jre_playlists = [
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33Kt_vFRd3OyzqQ78Go4c1fyn",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33KuQyLE4RjEOdJ_-0epbcBb4",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33KvXucAFMo5Tc5p8e_mcc-5g",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33KtYIPnFjpI19BCz2unzWYlJ",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33Kvv8T6ZESpJ2nvEHT9xBhlb",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33KvtMA4mCQSnzGsZe8qsTdzV",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33Ku0Oa3t8MQjV7D_G_PBi8g1",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33KuU_aJDvMPPAy_SoxXTt_ub",
		"https://www.youtube.com/playlist?list=PLk1Sqn_f33KtVQWWnE_V6-sypm5zUMkU6",
	]

	# Gather the items of every playlist into one flat list. The loop body
	# is indented again so the statement parses.
	all_jre_info = []
	for plurl in all_jre_playlists:
		playlist = pafy.get_playlist(plurl)
		all_jre_info.extend(playlist['items'])

	len(all_jre_info)
	# 1325

	# Build a DataFrame with one row per collected playlist item.
	yt_jre = pd.DataFrame.from_dict(all_jre_info)

	# Expand each 'playlist_meta' entry into its own columns; from here on
	# yt_jre holds only those expanded columns.
	# NOTE(review): assumes every 'playlist_meta' value is a dict-like record.
	yt_jre = yt_jre['playlist_meta'].apply(pd.Series)

	# Parse 'time_created' as a count of seconds (unit='s') into datetimes.
	yt_jre['timestamp_created'] = pd.to_datetime(yt_jre['time_created'], unit='s')

	# Split the creation timestamp into separate year, month and day columns.
	yt_jre['year'] = pd.DatetimeIndex(yt_jre['timestamp_created']).year
	yt_jre['month'] = pd.DatetimeIndex(yt_jre['timestamp_created']).month
	yt_jre['day'] = pd.DatetimeIndex(yt_jre['timestamp_created']).day

	# Extract the digits that follow '#' in the title as the episode number.
	# The pattern is a raw string so that \d reaches the regex engine verbatim
	# instead of relying on Python passing an unrecognized escape through.
	yt_jre['Episode'] = yt_jre['title'].str.extract(r'#(\d*)', expand=True)
	yt_jre['guest'] = yt_jre['title'].apply(get_guest)

	# Convert the duration into an integer number of minutes.
	yt_jre['duration_minutes'] = yt_jre['duration'].apply(convert_to_min)

	# Drop the commas (treated as literal text, not a regex) and parse as int.
	yt_jre['views_raw'] = yt_jre['views'].str.replace(",", "", regex=False)
	yt_jre['views_raw'] = yt_jre['views_raw'].astype(int)

	# Some basic stats. These are bare expressions, so the values are only
	# displayed in an interactive session; the recorded results are kept below.
	yt_jre['views_raw'].mean()
	# 1323460.4403323263
	yt_jre['duration_minutes'].mean()
	# 150.0725075528701
	yt_jre['likes'].mean()
	# 16713.877643504533
	yt_jre['dislikes'].mean()
	# 1566.5430513595165