Skip to content

Instantly share code, notes, and snippets.

@renini
Last active December 6, 2023 16:47
Show Gist options
  • Save renini/b469b52394e5e655858639a0b612125c to your computer and use it in GitHub Desktop.
Download VMware Explore Session Videos and Slides
import sys
import os
import json
import requests
import re
from tqdm import tqdm
url = 'https://www.vmware.com/bin/vmware/videolibrary/get/response' # videolibrary url
headers = {
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
year = '2023'
location = 'Barcelona'
limit = '100' # max limit as it seems to return 0-99 results at max as paginated
search = 'searchrequest=/videos?q=-vod_on_demand_publish:False %20%2B'
filter = 'tags:"'+ location +'" %20%2Btags:"' + year + '"&%20%2Bstate:Active&limit=' + limit + '&offset='
offset = 0 # start at 0, but iterate over it until completely fetched all
slides_url = 'https://raw.githubusercontent.com/lamw/vmware-explore-2023-session-urls/master/vmware-explore-emea.md'
download_dir = '/mnt/f/vmware/sessions/' # full path to download dir, end with /
def name_file(name, description, ext):
description = description.replace(" ", "_") # replace all spaces with underscores
description = re.sub(r'[<>:"/\\|?*]', '', description) # remove path unsafe chars
description = description.replace("&rsquo;", "") # replace &rsquo; with nothing
description = description.replace("&nbsp;", " ") # replace &nbsp; with nothing
description = description.replace("&amp;", "and") # replace &nbsp; with nothing
description = description.replace("&ndash;", "-") # replace &nbsp; with nothing
description = re.sub(r'\&[a-zA-Z]+\;', '_', description) # replace html entities like others we missed above?
description = description.strip() # strip space and being and end
# file_name = f'{name}-{description}.{ext}'
file_name = f'{name}.{ext}'
return file_name
def already_downloaded(file_name):
if os.path.isfile(download_dir + file_name):
return True
else:
return False
def download_file_fake(url, save_path):
print(f"Download file from: {url}")
print(f"File downloaded: {save_path}")
def download_file(url, save_path):
# Send a GET request to the URL with stream=True to download large files in chunks
with requests.get(url, stream=True) as response:
# Check if the request was successful (status code 200)
if response.status_code == 200:
# Get the total file size in bytes
total_size = int(response.headers.get('content-length', 0))
# Create a tqdm progress bar
progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)
# Open the file in binary write mode
with open(save_path, 'wb') as f:
# Iterate through the content in chunks and update the progress bar
for chunk in response.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
progress_bar.update(len(chunk))
# Close the progress bar
progress_bar.close()
print(f"File downloaded: {save_path}")
else:
print(f"Failed to download the file. Status code: {response.status_code}")
if __name__ == '__main__':
response = requests.post(url, headers=headers, data=search+filter+str(offset))
data = response.json()
total_count = int(data['count'])
num_pages = (total_count // int(limit)) + 1
videos = data['videos']
complete_session_info = []
print(f"The count is {total_count}.")
print(f"The response contains {len(videos)} videos.")
print(f"The num of pages to fetch {num_pages}.")
for page in range(1, num_pages):
offset += 100
print(f"Fetching more data, now at {page}/{num_pages}.")
response = requests.post(url, headers=headers, data=search+filter+str(offset))
data = json.loads(response.text)
videos += data['videos']
print(f"The response now contains {len(videos)} videos.")
# print(json.dumps(videos, indent=4))
for video in videos:
name = video.get('name', 'unknown') # Default to 'unknown' if 'name' is not present
description = video.get('description', 'unknown') # Default to 'unknown' if 'name' is not present
# iterate within sources and get the https link for the mp4/h264 file
sources = video.get('sources', [])
for source in sources:
codec = source.get('codec', '')
container = source.get('container', '')
src = source.get('src', '')
# Check if codec is H264 and container is MP4
if 'H264' in codec and 'MP4' in container and src.startswith('https'):
url = src
ext = 'mp4'
file_name = name_file(name, description, ext)
if not already_downloaded(file_name):
print(f"Downloading: {name}")
download_file_fake(url, download_dir + file_name)
break # we only download one
# iterate within texttracks and get the https link for the vtt file
sources = video.get('textTracks', [])
for source in sources:
mimetype = source.get('mimeType', '')
src = source.get('src', '')
# Check if codec is H264 and container is MP4
if 'webvtt-DISABLED' in mimetype and src.startswith('https'):
url = src
ext = 'vtt'
file_name = name_file(name, description, ext)
if not already_downloaded(file_name):
print(f"Downloading: {name}")
download_file_fake(url, download_dir + file_name)
break # we only download one
# iterate within posters and get the link for the jpg file
sources = video.get('posterSources', [])
for source in sources:
src = source.get('src', '')
if src.endswith('jpg'):
url = src
ext = 'jpg'
file_name = name_file(name, description, ext)
if not already_downloaded(file_name):
print(f"Downloading: {name}")
download_file(url, download_dir + file_name)
break # we only download one
# iterate within thumbnailSources and get the link for the jpg file
sources = video.get('thumbnailSources', [])
for source in sources:
src = source.get('src', '')
if src.endswith('jpg'):
url = src
ext = 'jpg'
file_name = name_file(name + '_thumb', description, ext)
if not already_downloaded(file_name):
print(f"Downloading: {name}")
download_file(url, download_dir + file_name)
break # we only download one
# create json with info per video
customfields = video.get('customFields', [])
tags = video.get('tags', [])
createdat = video.get('createdAt', 'unknown')
longdescription = video.get('longDescription', 'unknown')
info = {
"name": name,
"description": description,
"longdescription": longdescription,
"customfields": customfields,
"tags": tags,
"createdat": createdat
}
complete_session_info.append(info)
# complete_session_info[name["id"]] = new_item
ext = 'json'
file_name = name_file(name, description, ext)
if not already_downloaded(file_name):
with open(download_dir + file_name, "w") as info_file:
json.dump(info, info_file, indent=4)
# download slides
slides_response = requests.get(slides_url)
md_links_to_slides = slides_response.text
pdf_url = re.search(r'\[[^\]]*\]\(([^)]*' + name + '[^)]*.pdf)\)', md_links_to_slides)
if pdf_url:
url = pdf_url.group(1)
# print(f"PDF slides found at URL: {url}")
ext = 'pdf'
file_name = name_file(name, description, ext)
if not already_downloaded(file_name):
print(f"Downloading: {name}")
download_file(url, download_dir + file_name)
else:
print(f"No PDF slides found for {name}.")
# create txt file
description = description.replace(" ", "_") # replace all spaces with underscores
description = re.sub(r'[<>:\'"/\\|?*]', '', description) # remove path unsafe chars
description = description.replace("&rsquo;", "") # replace &rsquo; with nothing
description = description.replace("&nbsp;", " ") # replace &nbsp; with nothing
description = description.replace("&amp;", "and") # replace &nbsp; with nothing
description = description.replace("&ndash;", "-") # replace &nbsp; with nothing
description = re.sub(r'\&[a-zA-Z]+\;', '_', description) # replace html entities like others we missed above?
description = description.strip() # strip space and being and end
description = description.replace(" ", "_") # replace all spaces with underscores
ext = 'txt'
file_name = name_file(name, description, ext)
if "." in file_name:
# Split the file name at the last "."
base, ext = os.path.splitext(file_name)
# Add the suffix to the base
new_base = base + "-" + description
# Concatenate the base, suffix, and extension
file_name = new_base + ext
if not already_downloaded(file_name):
with open(download_dir + file_name, "w") as txt_file:
txt_file.write(name)
# create sessions.json
# create json with info per video
file_name = 'sessions.json'
if not already_downloaded(file_name):
with open(download_dir + file_name, "w") as sessions_file:
json.dump(complete_session_info, sessions_file, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment