Last active
November 13, 2023 15:22
-
-
Save davidcairuz/b187bf657e1b1174f712224c1794ce5b to your computer and use it in GitHub Desktop.
Script that scrolls through all of a YouTube playlist and calculates its total duration. I'm still learning Python, so all of your suggestions will be much appreciated.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as soup #used to beautifie the html code | |
import datetime as dt #sum the video's duration time | |
from selenium import webdriver #open webdriver for specific browser | |
from selenium.webdriver.common.keys import Keys #for necessary browser action | |
import time #used for sleep function | |
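# NOTE: the line numbers below presumably refer to the original gist layout; confirm before editing.
# Line 63 must be modified for different page languages.
# Line 35 must be modified for a different playlist URL.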
# Collected video timestamps (strings); filled in by the script code below.
times = []
def get_html():
    """Return the current page body parsed as a BeautifulSoup tree.

    Reads the module-level ``driver``: the browser is asked, via JavaScript,
    for ``document.body.innerHTML`` and the resulting markup is parsed with
    Python's built-in ``html.parser``.
    """
    body_markup = driver.execute_script("return document.body.innerHTML")
    return soup(body_markup, 'html.parser')
def _parse_video_count(label):
    """Return the leading integer of a label such as ``'1.234 videos'``.

    Only the text before the first space is considered, and every character
    in it that is not a decimal digit (``.``, ``,`` or any other thousands
    separator) is ignored.

    Raises:
        ValueError: if the label does not start with any digits.
    """
    # partition() yields the whole string when there is no space, unlike
    # slicing with the -1 that str.find() returns, which drops a character.
    leading, _, _ = label.strip().partition(' ')
    digits = ''.join(ch for ch in leading if ch.isdecimal())
    if not digits:
        raise ValueError('no video count found in %r' % label)
    return int(digits)


def end_of_page():
    """Scroll the playlist page down so the whole playlist gets loaded.

    The number of videos is read from the playlist sidebar of the page
    currently open in the module-level ``driver``. The END key is then sent
    once per 100 videos, plus one extra time, with a two-second pause after
    each key press.

    Raises:
        IndexError: if the sidebar element holding the count is missing.
        ValueError: if the count text does not start with a number.
    """
    # Imported locally because the file's top-level imports do not provide
    # ``By``; find_element(By..., ...) replaces the find_element_by_* helpers
    # that newer Selenium releases no longer ship.
    from selenium.webdriver.common.by import By

    page_soup = get_html()
    info_strings = page_soup.findAll(
        'yt-formatted-string',
        {'class': 'style-scope ytd-playlist-sidebar-primary-info-renderer'},
    )
    # The count is taken from the second matching element, as before.
    if len(info_strings) < 2:
        raise IndexError('could not locate the video count in the playlist sidebar')
    number_videos = _parse_video_count(info_strings[1].text)

    # NOTE(review): presumably the page loads about 100 entries per scroll;
    # confirm against the current YouTube behaviour.
    times_scroll_down = number_videos // 100 + 1
    for _ in range(times_scroll_down):
        elm = driver.find_element(By.TAG_NAME, 'html')
        elm.send_keys(Keys.END)
        time.sleep(2)  # give the page time to load the next batch
# Playlist to measure; change the url to the desired playlist.
url = 'https://www.youtube.com/playlist?list=PLOuZHgwKgiV_oXOKyG8iLzyMuSjxKHos0'

# ``driver`` stays a module-level name because get_html() and end_of_page()
# read it as a global.
driver = webdriver.Chrome()
try:
    driver.get(url)
    end_of_page()
    # Complete HTML, taken after scrolling down.
    page_soup = get_html()
finally:
    # Close the browser even when loading or scrolling fails, so no driver
    # process is left behind. The parsed HTML remains usable afterwards.
    driver.quit()

time_containers = page_soup.findAll('div', {'class': 'style-scope ytd-thumbnail'})

# Build the list of timestamps. The loop variable is deliberately not called
# ``time`` so that the imported ``time`` module is not shadowed.
times = []
for container in time_containers:
    # NOTE(review): the [7:] offset presumably skips leading whitespace in
    # the container text; confirm against the current page markup.
    stamp = container.text[7:].rstrip()
    if not stamp:
        continue  # container without a duration
    if stamp.count(':') == 1:
        # Shorter than one hour: add the hours field so every stamp is H:M:S.
        stamp = '00:' + stamp
    times.append(stamp)

# Sum all durations.
time_total = dt.timedelta()
for stamp in times:
    h, m, s = stamp.split(':')
    time_total += dt.timedelta(hours=int(h), minutes=int(m), seconds=int(s))

print('\nPlaylist time: ' + str(time_total))
print('\nNumber of avaiable videos: ' + str(len(times)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment