Last active
October 13, 2017 19:47
-
-
Save sweetmoniker/4bd05746a6e87a760a25f452b13c54bd to your computer and use it in GitHub Desktop.
**This code no longer works with the updated YouTube platform. See my other YouTube gist.** This YouTube scraper runs on Selenium and BeautifulSoup. Props go to user shaurz for some of the base code, but I couldn't get that code to scrape all videos. Essentially, the code uses Selenium to open a browser, navigate to a YouTube page, and load all …
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.common.exceptions import NoSuchElementException | |
from selenium.common.exceptions import TimeoutException | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.ui import WebDriverWait | |
from bs4 import BeautifulSoup | |
from collections import namedtuple | |
import csv | |
import time | |
import datetime | |
# This script drives Chrome via Selenium. Install the webdriver per
# http://selenium-python.readthedocs.io/installation.html#drivers
# (you will probably have to add it to your PATH).
page_url = "https://www.youtube.com/user/thvbgd"

# Record type for one scraped video entry.
Video = namedtuple("Video", ["video_id", "link", "title", "duration", "views", "age"])
def parse_video_div(div):
    """Parse one "yt-lockup-video" tile into a Video namedtuple.

    div: a BeautifulSoup Tag for the tile (assumes the old-layout
    YouTube channel markup — TODO confirm against the page scraped).

    Best effort: any field that cannot be parsed keeps a neutral
    default rather than aborting the scrape. (The original used a bare
    ``except: pass`` and then hit a NameError on the return line when
    parsing failed early; the defaults are now set up front.)
    """
    # Neutral defaults so the return below is always well-defined.
    video_id = ""
    title = ""
    duration = "00:00"
    views = 0
    age = 0
    try:
        video_id = div.get("data-context-item-id", "")
        title = div.find("a", "yt-uix-tile-link").text
        # A live stream can come through without a duration tag.
        time_tag = div.find("span", "video-time")
        if hasattr(time_tag, "contents"):
            duration = time_tag.contents[0].text
        # Views and age share one <ul> but aren't always in the same order.
        meta = div.find("ul", "yt-lockup-meta-info")
        meta_str = str(meta)
        if meta_str.find("ago") > meta_str.find("views"):
            # Layout: "<n> views ... ago"
            views = int(meta.text.split()[0].replace(",", ""))
            age = meta.text.split("views")[1]
        elif meta_str.find("ago") < meta_str.find("views"):
            # Layout: "... ago <n> views". The original used
            # rstrip(" views"), which strips a *character set*, not a
            # suffix — cut the literal suffix instead, and return views
            # as an int for consistency with the branch above.
            raw = meta.text.split("ago")[1].strip()
            if raw.endswith("views"):
                raw = raw[:-len("views")].strip()
            views = int(raw.replace(",", ""))
            words = meta.text.split()
            age = words[0] + " " + words[1] + " ago"
        # else: neither marker present (e.g. scheduled stream) — keep 0/0.
    except (AttributeError, IndexError, ValueError):
        print("Something got skipped. Don't worry about it. It probably wasn't important anyway.")
    link = "https://www.youtube.com/watch?v=" + video_id
    return Video(video_id, link, title, duration, views, age)
def parse_videos_page(page):
    """Collect a Video record for every video tile on a parsed page.

    page: a BeautifulSoup document of a channel's videos page.
    Returns a list of Video namedtuples (empty when no tiles exist).
    """
    results = []
    for tile in page.find_all("div", "yt-lockup-video"):
        results.append(parse_video_div(tile))
    return results
def load_page(page_url):
    """Drive Chrome through *page_url*, clicking every "show more" /
    "load more" control until all videos are present, then return the
    final page source as UTF-8-encoded bytes.

    Raises whatever ``webdriver.Chrome()`` raises when the driver is
    missing from PATH, and re-raises KeyboardInterrupt/SystemExit so the
    user can abort. The browser is always closed before returning (the
    original leaked the driver, and crashed with a NameError on the
    return line when Chrome failed to start).
    """
    # Start the browser outside the try so a missing driver fails loudly.
    driver = webdriver.Chrome()
    try:
        time.sleep(1)
        wait = WebDriverWait(driver, 8)
        driver.get(page_url)
        time.sleep(3)
        # "Show more" always fails the first time, so retry — but bounded,
        # so a dead page cannot spin forever (the original looped
        # indefinitely on repeated failures).
        for _ in range(30):
            try:
                show_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@class='yt-uix-button yt-uix-button-size-default yt-uix-button-expander yt-uix-expander-head yt-uix-expander-collapsed-body hidden-on-legacy-browsers']")))
                show_more_button.click()
                print("Minor success. Loading more.")
                break
            except Exception as e:
                print(e)
                print("Error. Trying again in 2 seconds.")
                time.sleep(2)
        time.sleep(1)
        view_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@class='expanded-shelf-view-all-link yt-uix-sessionlink']")))
        view_more_button.click()
        # Keep clicking "load more" until the button no longer appears.
        while True:
            try:
                load_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@class='yt-uix-button yt-uix-button-size-default yt-uix-button-default load-more-button yt-uix-load-more browse-items-load-more-button']")))
                load_more_button.click()
            except (NoSuchElementException, TimeoutException):
                # The page is fully loaded (or the button timed out).
                print("Page fully developed")
                break
            except Exception as e:
                print(e)
                print("load more button broke the script")
                break
        return driver.page_source.encode('utf-8')
    except (KeyboardInterrupt, SystemExit):
        print("Program Stopped")
        raise
    except Exception as e:
        print(e)
        print("Some kind of exception occurred. You should probably try again.")
        # Best effort, as in the original: return whatever loaded so far.
        return driver.page_source.encode('utf-8')
    finally:
        driver.quit()
def get_videos(page_url):
    """Load *page_url* in a browser and parse all of its video tiles.

    Returns a list of Video namedtuples.
    """
    # Name the parser explicitly: a bare BeautifulSoup(...) emits a
    # warning and uses whichever parser happens to be installed, making
    # results machine-dependent.
    page = BeautifulSoup(load_page(page_url), "html.parser")
    print("Source code retrieved")
    videos = parse_videos_page(page)
    print("Videos parsed")
    return videos
def scrapeYoutubeVideos(page_url):
    """Scrape every video on the channel at *page_url* and write them to
    '<channel>_YouTube.csv'.

    NOTE(review): 'file_path' below is a placeholder directory left by
    the original author — replace it with a real output path.
    """
    # Channel name = everything after the path slash; find("/", 25)
    # skips past the "https://www.youtube.com/" prefix.
    channel = page_url[page_url.find("/", 25) + 1:]
    with open('file_path\\{}_YouTube.csv'.format(channel), 'w', newline='', encoding='utf-8') as file:
        # One writer for the whole file (the original rebuilt it per row)
        # and no explicit close(): the with-block handles that (the
        # original redundantly called file.close() inside the with).
        writer = csv.writer(file)
        writer.writerow(["id", "link", "title", "duration", "views", "age"])
        scrape_starttime = datetime.datetime.now()
        print("Scraping {} Youtube: {} \n Pay attention to the messages below.".format(channel, scrape_starttime))
        videos = get_videos(page_url)
        writer.writerows(videos)
    print("Done! {} videos scraped in {}".format(len(videos), datetime.datetime.now() - scrape_starttime))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment