Download VMware Explore Session Videos and Slides
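A Python script that queries the VMware video library API for VMware Explore 2023 Barcelona sessions, pages through the results, and for each session downloads the MP4 recording, poster and thumbnail images, and a per-session JSON metadata file, then matches the session against William Lam's vmware-explore-2023-session-urls repository to fetch the PDF slides. Note that the MP4 and VTT downloads are wired to the dry-run download_file_fake helper; switch those calls to download_file to actually fetch the files.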
import os
import json
import re

import requests
from tqdm import tqdm

url = 'https://www.vmware.com/bin/vmware/videolibrary/get/response'  # video library endpoint
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}

year = '2023'
location = 'Barcelona'
limit = '100'  # the API returns at most 100 results per request, so fetching is paginated
search = 'searchrequest=/videos?q=-vod_on_demand_publish:False %20%2B'
filters = 'tags:"' + location + '" %20%2Btags:"' + year + '"&%20%2Bstate:Active&limit=' + limit + '&offset='
offset = 0  # start at 0 and advance by `limit` until all pages have been fetched
slides_url = 'https://raw.githubusercontent.com/lamw/vmware-explore-2023-session-urls/master/vmware-explore-emea.md'
download_dir = '/mnt/f/vmware/sessions/'  # full path to the download dir, must end with /
def name_file(name, description, ext):
    description = description.replace(" ", "_")             # replace all spaces with underscores
    description = re.sub(r'[<>:"/\\|?*]', '', description)  # remove path-unsafe characters
    description = description.replace("’", "")              # drop curly apostrophes
    description = description.replace("\u00a0", " ")        # replace non-breaking spaces with regular spaces
    description = description.replace("&", "and")           # replace ampersands with "and"
    description = description.replace("–", "-")             # replace en dashes with hyphens
    description = re.sub(r'&[a-zA-Z]+;', '_', description)  # replace any HTML entities missed above
    description = description.strip()                       # strip whitespace at beginning and end
    # file_name = f'{name}-{description}.{ext}'
    file_name = f'{name}.{ext}'  # the sanitized description is currently unused in the name
    return file_name
def already_downloaded(file_name):
    return os.path.isfile(download_dir + file_name)
def download_file_fake(url, save_path):
    # Dry-run stand-in for download_file: prints what would be downloaded without fetching it
    print(f"Download file from: {url}")
    print(f"File downloaded: {save_path}")
def download_file(url, save_path):
    # Send a GET request with stream=True so large files are downloaded in chunks
    with requests.get(url, stream=True) as response:
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Get the total file size in bytes
            total_size = int(response.headers.get('content-length', 0))
            # Create a tqdm progress bar
            progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)
            # Open the file in binary write mode
            with open(save_path, 'wb') as f:
                # Iterate through the content in chunks and update the progress bar
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
            # Close the progress bar
            progress_bar.close()
            print(f"File downloaded: {save_path}")
        else:
            print(f"Failed to download the file. Status code: {response.status_code}")
if __name__ == '__main__':
    response = requests.post(url, headers=headers, data=search + filters + str(offset))
    data = response.json()
    total_count = int(data['count'])
    num_pages = (total_count // int(limit)) + 1
    videos = data['videos']
    complete_session_info = []
    print(f"The count is {total_count}.")
    print(f"The response contains {len(videos)} videos.")
    print(f"The number of pages to fetch is {num_pages}.")
    for page in range(1, num_pages):
        offset += int(limit)
        print(f"Fetching more data, now at {page}/{num_pages}.")
        response = requests.post(url, headers=headers, data=search + filters + str(offset))
        data = response.json()
        videos += data['videos']
        print(f"The response now contains {len(videos)} videos.")
    # print(json.dumps(videos, indent=4))

    # Fetch the markdown file with the slide links once, rather than once per video
    slides_response = requests.get(slides_url)
    md_links_to_slides = slides_response.text
    for video in videos:
        name = video.get('name', 'unknown')                # default to 'unknown' if 'name' is not present
        description = video.get('description', 'unknown')  # default to 'unknown' if 'description' is not present
        # Iterate over the sources and pick the https link for the MP4/H.264 file
        sources = video.get('sources', [])
        for source in sources:
            codec = source.get('codec', '')
            container = source.get('container', '')
            src = source.get('src', '')
            # Check if the codec is H264 and the container is MP4
            if 'H264' in codec and 'MP4' in container and src.startswith('https'):
                url = src
                ext = 'mp4'
                file_name = name_file(name, description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file_fake(url, download_dir + file_name)
                break  # we only download one
        # Iterate over the text tracks and pick the https link for the VTT file
        sources = video.get('textTracks', [])
        for source in sources:
            mimetype = source.get('mimeType', '')
            src = source.get('src', '')
            # Check the mime type; 'webvtt-DISABLED' never matches, so subtitle downloads are switched off
            if 'webvtt-DISABLED' in mimetype and src.startswith('https'):
                url = src
                ext = 'vtt'
                file_name = name_file(name, description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file_fake(url, download_dir + file_name)
                break  # we only download one
        # Iterate over the poster sources and pick the link for the jpg file
        sources = video.get('posterSources', [])
        for source in sources:
            src = source.get('src', '')
            if src.endswith('jpg'):
                url = src
                ext = 'jpg'
                file_name = name_file(name, description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file(url, download_dir + file_name)
                break  # we only download one
        # Iterate over the thumbnail sources and pick the link for the jpg file
        sources = video.get('thumbnailSources', [])
        for source in sources:
            src = source.get('src', '')
            if src.endswith('jpg'):
                url = src
                ext = 'jpg'
                file_name = name_file(name + '_thumb', description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file(url, download_dir + file_name)
                break  # we only download one
        # Create a JSON file with info per video
        customfields = video.get('customFields', [])
        tags = video.get('tags', [])
        createdat = video.get('createdAt', 'unknown')
        longdescription = video.get('longDescription', 'unknown')
        info = {
            "name": name,
            "description": description,
            "longdescription": longdescription,
            "customfields": customfields,
            "tags": tags,
            "createdat": createdat
        }
        complete_session_info.append(info)
        # complete_session_info[name["id"]] = new_item
        ext = 'json'
        file_name = name_file(name, description, ext)
        if not already_downloaded(file_name):
            with open(download_dir + file_name, "w") as info_file:
                json.dump(info, info_file, indent=4)
        # Download slides: search the markdown for a PDF link containing the session name
        pdf_url = re.search(r'\[[^\]]*\]\(([^)]*' + re.escape(name) + r'[^)]*\.pdf)\)', md_links_to_slides)
        if pdf_url:
            url = pdf_url.group(1)
            # print(f"PDF slides found at URL: {url}")
            ext = 'pdf'
            file_name = name_file(name, description, ext)
            if not already_downloaded(file_name):
                print(f"Downloading: {name}")
                download_file(url, download_dir + file_name)
        else:
            print(f"No PDF slides found for {name}.")
        # Create a txt file whose name carries the sanitized session description
        description = description.replace(" ", "_")              # replace all spaces with underscores
        description = re.sub(r'[<>:\'"/\\|?*]', '', description) # remove path-unsafe characters
        description = description.replace("’", "")               # drop curly apostrophes
        description = description.replace("\u00a0", " ")         # replace non-breaking spaces with regular spaces
        description = description.replace("&", "and")            # replace ampersands with "and"
        description = description.replace("–", "-")              # replace en dashes with hyphens
        description = re.sub(r'&[a-zA-Z]+;', '_', description)   # replace any HTML entities missed above
        description = description.strip()                        # strip whitespace at beginning and end
        description = description.replace(" ", "_")              # replace any remaining spaces with underscores
        ext = 'txt'
        file_name = name_file(name, description, ext)
        if "." in file_name:
            # Split the file name at the last "."
            base, ext = os.path.splitext(file_name)
            # Add the description as a suffix to the base
            new_base = base + "-" + description
            # Concatenate the base, suffix, and extension
            file_name = new_base + ext
        if not already_downloaded(file_name):
            with open(download_dir + file_name, "w") as txt_file:
                txt_file.write(name)
    # Create sessions.json with the collected info for all videos
    file_name = 'sessions.json'
    if not already_downloaded(file_name):
        with open(download_dir + file_name, "w") as sessions_file:
            json.dump(complete_session_info, sessions_file, indent=4)
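A minimal way to run the script, assuming it is saved as download_explore_sessions.py (the filename is illustrative) and download_dir points at an existing directory:

pip install requests tqdm
python download_explore_sessions.py

Files already present in download_dir are skipped via already_downloaded, so the script can be re-run to resume an interrupted download pass.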