Created
May 25, 2023 04:37
-
-
Save hiromu/4ffd45dba8ce18c8334f6c340267dae6 to your computer and use it in GitHub Desktop.
A script that downloads the videos listed in a tab-separated file, using Selenium to locate each video's playlist.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv | |
import json | |
import pathlib | |
import sys | |
import time | |
import tempfile | |
import trio | |
from m3u8downloader.main import M3u8Downloader | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
def _playlist_url_from_log(driver):
    """Return the first 'playlist.m3u8' request URL in the pending performance log.

    Reads the entries currently returned by ``driver.get_log('performance')``
    and looks at ``Network.requestWillBeSent`` events only. Returns ``None``
    when none of those events targets a URL containing ``playlist.m3u8``.
    """
    for entry_json in driver.get_log('performance'):
        message = json.loads(entry_json['message'])['message']
        if message['method'] != 'Network.requestWillBeSent':
            continue
        request_url = message['params']['request']['url']
        if 'playlist.m3u8' in request_url:
            return request_url
    return None


async def accessor(list_tsv, channel):
    """Open each listed page in Chrome and forward its playlist URL.

    Args:
        list_tsv: Path of a tab-separated file whose rows are ``url<TAB>title``.
            Blank rows are skipped.
        channel: Trio send channel. For every row a ``(playlist_url, title)``
            tuple is sent, immediately followed by a ``(None, None)`` marker.

    The channel is closed when this function exits (normally or with an
    error) so that a consumer iterating over the receiving end terminates,
    and the browser is always shut down.
    """
    options = Options()
    options.binary_location = '/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta'
    options.add_argument('--disable-headless-mode')
    options.add_argument(f'--user-data-dir={pathlib.Path(__file__).parent / "profile"}')
    # Enable the performance log on the options object instead of mutating
    # the shared webdriver.DesiredCapabilities.CHROME dictionary.
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    # Closing the send side is what lets the receiver's ``async for`` finish.
    async with channel:
        driver = webdriver.Chrome(options=options)
        try:
            # newline='' is what the csv module expects from its input file.
            with open(list_tsv, newline='') as list_fp:
                for row in csv.reader(list_fp, delimiter='\t'):
                    if not row:
                        # csv.reader yields [] for blank lines.
                        continue
                    url, title = row
                    print('==> Start processing:', title)
                    driver.get(url)
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'video')))
                    driver.execute_script('Array.from(document.getElementsByTagName("video")).forEach(e => e.pause())')

                    # Poll the network log once per second until the playlist
                    # request shows up. There is no upper bound on the wait.
                    playlist_url = None
                    while playlist_url is None:
                        # Yield to Trio instead of blocking the event loop.
                        await trio.sleep(1)
                        playlist_url = _playlist_url_from_log(driver)

                    await channel.send((playlist_url, title))
                    # NOTE(review): presumably this marker makes the task wait
                    # until the consumer has taken the real item - confirm.
                    await channel.send((None, None))
        finally:
            driver.quit()
async def downloader(channel):
    """Download every playlist received from ``channel``.

    Args:
        channel: Async-iterable source of ``(url, title)`` tuples. Items whose
            title is ``None`` are ignored.

    Each remaining item is handed to ``M3u8Downloader`` with ``title + '.mp4'``
    as its second argument (presumably the output file name) and a fresh
    temporary directory that is removed once ``start()`` returns.
    """
    async for playlist_url, name in channel:
        if name is not None:
            print('==> Start downloading:', name)
            with tempfile.TemporaryDirectory() as workdir:
                job = M3u8Downloader(playlist_url, name + '.mp4', tempdir=workdir, poolsize=2)
                job.start()
async def main(list_tsv):
    """Run the page accessor and the downloader as concurrent Trio tasks.

    Args:
        list_tsv: Path of the tab-separated list file, passed to ``accessor``.

    The two tasks are connected by a memory channel with a buffer size of 1.
    """
    sender, receiver = trio.open_memory_channel(1)
    # Task function followed by its positional arguments, in start order.
    workers = (
        (accessor, list_tsv, sender),
        (downloader, receiver),
    )
    async with trio.open_nursery() as nursery:
        for worker in workers:
            nursery.start_soon(*worker)
if __name__ == '__main__':
    # The list file path is the only required argument; without it, print
    # the usage line and exit with the same status as before.
    if len(sys.argv) >= 2:
        trio.run(main, sys.argv[1])
    else:
        print(f'{sys.argv[0]} list_tsv')
        sys.exit(-1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
async-generator==1.10
attrs==23.1.0
certifi==2023.5.7
charset-normalizer==3.1.0
exceptiongroup==1.1.1
h11==0.14.0
idna==3.4
m3u8downloader==0.10.1
outcome==1.2.0
PySocks==1.7.1
requests==2.31.0
selenium==4.9.1
sniffio==1.3.0
sortedcontainers==2.4.0
trio==0.22.0
trio-websocket==0.10.2
urllib3==2.0.2
wells==1.5.0
wsproto==1.2.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment