Skip to content

Instantly share code, notes, and snippets.

@sweetmoniker
Last active October 13, 2017 19:47
Show Gist options
  • Save sweetmoniker/4bd05746a6e87a760a25f452b13c54bd to your computer and use it in GitHub Desktop.
Save sweetmoniker/4bd05746a6e87a760a25f452b13c54bd to your computer and use it in GitHub Desktop.
**This code no longer works with the updated YouTube platform. See my other YouTube gist.** This YouTube scraper runs on Selenium and BeautifulSoup. Props go to user shaurz for some of the base code, but I couldn't get that code to scrape all videos. Essentially, the code uses Selenium to open a browser, navigate to a YouTube page, and load all …
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from collections import namedtuple
import csv
import time
import datetime
###This script runs on selenium with Chrome. Follow the instructions here to install the webdriver: http://selenium-python.readthedocs.io/installation.html#drivers You probably have to change your PATH. Google it.###
# URL of the YouTube channel whose uploaded videos will be scraped.
page_url = "https://www.youtube.com/user/thvbgd"
# Record type holding the metadata parsed for one video.
Video = namedtuple("Video", "video_id link title duration views age")
def parse_video_div(div):
    """Parse one ``yt-lockup-video`` div into a Video namedtuple.

    Any field that cannot be extracted keeps a safe default, so a Video is
    always returned. (The original code used a bare ``except: pass`` and then
    returned locals that might never have been bound, raising NameError.)

    Parameters:
        div: a BeautifulSoup Tag for a single video lockup.

    Returns:
        Video(video_id, link, title, duration, views, age).
    """
    # Defaults ensure the final Video(...) call is always constructible.
    video_id = ""
    link = ""
    title = ""
    duration = "00:00"
    views = 0
    age = 0
    try:
        video_id = div.get("data-context-item-id", "")
        link = "https://www.youtube.com/watch?v=" + video_id
        title = div.find("a", "yt-uix-tile-link").text
        # In case a stream comes through without a duration tag.
        time_span = div.find("span", "video-time")
        if hasattr(time_span, "contents"):
            duration = time_span.contents[0].text
        # Views and age come in the same div tag but aren't always in the
        # same order, so decide by which token appears first in the markup.
        meta = div.find("ul", "yt-lockup-meta-info")
        meta_str = str(meta)
        if meta_str.find("ago") > meta_str.find("views"):
            views = int(meta.text.split()[0].replace(',', ''))
            age = meta.text.split("views")[1]
        elif meta_str.find("ago") < meta_str.find("views"):
            views = meta.text.split("ago")[1].rstrip(" views").replace(',', '')
            parts = meta.text.split()
            age = parts[0] + " " + parts[1] + " ago"
        # else: neither marker found — keep the 0/0 defaults.
    except Exception:
        print("Something got skipped. Don't worry about it. It probably wasn't important anyway.")
    return Video(video_id, link, title, duration, views, age)
def parse_videos_page(page):
    """Collect a Video record for every video lockup found on *page*.

    Parameters:
        page: a BeautifulSoup document for a fully loaded channel page.

    Returns:
        list of Video namedtuples, one per ``yt-lockup-video`` div.
    """
    parsed = []
    for lockup in page.find_all("div", "yt-lockup-video"):
        parsed.append(parse_video_div(lockup))
    return parsed
def load_page(page_url):
    """Open *page_url* in Chrome, expand all video shelves, and return the
    fully-loaded page source as UTF-8 bytes.

    Fix over the original: the WebDriver was never quit, leaking a Chrome
    process per call; the browser is now always closed via ``finally``.

    Parameters:
        page_url: URL of the YouTube channel page.

    Returns:
        bytes: the page source, UTF-8 encoded (possibly partial if a
        non-fatal exception interrupted the loading loops).
    """
    driver = webdriver.Chrome()
    try:
        time.sleep(1)
        wait = WebDriverWait(driver, 8)
        driver.get(page_url)
        time.sleep(3)
        # "Show more" always fails the first time, so retry until it succeeds.
        show_more_success = False
        while not show_more_success:
            try:
                show_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@class='yt-uix-button yt-uix-button-size-default yt-uix-button-expander yt-uix-expander-head yt-uix-expander-collapsed-body hidden-on-legacy-browsers']")))
                show_more_button.click()
                print("Minor success. Loading more.")
                show_more_success = True
            except Exception as e:
                print(e)
                print("Error. Trying again in 2 seconds.")
                time.sleep(2)
        time.sleep(1)
        view_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@class='expanded-shelf-view-all-link yt-uix-sessionlink']")))
        view_more_button.click()
        load_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@class='yt-uix-button yt-uix-button-size-default yt-uix-button-default load-more-button yt-uix-load-more browse-items-load-more-button']")))
        # Keep clicking "load more" until it disappears or times out.
        while load_more_button:
            try:
                load_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@class='yt-uix-button yt-uix-button-size-default yt-uix-button-default load-more-button yt-uix-load-more browse-items-load-more-button']")))
                load_more_button.click()
            # This except is probably innocuous.
            except NoSuchElementException:
                # The page is fully loaded.
                print("Page fully developed")
                break
            except TimeoutException:
                # The code timed out waiting for the load button.
                print("Page fully developed")
                break
            except Exception as e:
                print(e)
                print("load more button broke the script")
                break
        return driver.page_source.encode('utf-8')
    except (KeyboardInterrupt, SystemExit):
        print("Program Stopped")
        raise
    except Exception as e:
        print(e)
        print("Some kind of exception occurred. You should probably try again.")
        # Best-effort: return whatever has loaded so far, as the original did.
        return driver.page_source.encode('utf-8')
    finally:
        # Always close the browser; the original leaked a Chrome process.
        driver.quit()
def get_videos(page_url):
    """Load *page_url*, parse it, and return the list of Video records.

    Fix over the original: BeautifulSoup was constructed without an explicit
    parser, which emits a warning and picks an environment-dependent parser;
    the stdlib ``html.parser`` is now named explicitly.

    Parameters:
        page_url: URL of the YouTube channel page.

    Returns:
        list of Video namedtuples parsed from the fully loaded page.
    """
    page = BeautifulSoup(load_page(page_url), "html.parser")
    print("Source code retrieved")
    videos = parse_videos_page(page)
    print("Videos parsed")
    return videos
def scrapeYoutubeVideos(page_url):
    """Scrape every video on a channel page and write the results to a CSV.

    Fixes over the original: the csv writer was re-created for every row
    (create it once); the redundant ``file.close()`` inside the ``with``
    block is removed; the channel-name slice is computed once.

    Parameters:
        page_url: URL of the YouTube channel page.

    Side effects:
        Writes 'file_path\\<channel>_YouTube.csv' (placeholder directory —
        adjust before running) and prints progress messages.
    """
    # Channel name is the path segment after the domain; 25 skips past
    # "https://www.youtube.com/" so find() locates the following slash.
    channel = page_url[page_url.find("/", 25) + 1:len(page_url)]
    with open('file_path\\{}_YouTube.csv'.format(channel), 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["id", "link", "title", "duration", "views", "age"])
        scrape_starttime = datetime.datetime.now()
        print("Scraping {} Youtube: {} \n Pay attention to the messages below.".format(channel, scrape_starttime))
        videos = get_videos(page_url)
        for video in videos:
            writer.writerow(video)
    print("Done! {} videos scraped in {}".format(len(videos), datetime.datetime.now() - scrape_starttime))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment