Download VMware Explore Session Videos and Slides
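A Python script that queries the VMware video library API for VMware Explore 2023 Barcelona sessions, pages through the results, and for each session downloads the MP4 recording, poster and thumbnail images, and a per-session JSON metadata file, then matches the session against William Lam's vmware-explore-2023-session-urls repository to fetch the PDF slides. Note that the MP4 and VTT downloads are wired to the dry-run download_file_fake helper; switch those calls to download_file to actually fetch the files.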
import os
import json
import re

import requests
from tqdm import tqdm

url = 'https://www.vmware.com/bin/vmware/videolibrary/get/response'  # video library endpoint
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}

year = '2023'
location = 'Barcelona'
limit = '100'  # the API returns at most 100 results per request, so fetching is paginated
search = 'searchrequest=/videos?q=-vod_on_demand_publish:False %20%2B'
filters = 'tags:"' + location + '" %20%2Btags:"' + year + '"&%20%2Bstate:Active&limit=' + limit + '&offset='
offset = 0  # start at 0 and advance by `limit` until all pages have been fetched
slides_url = 'https://raw.githubusercontent.com/lamw/vmware-explore-2023-session-urls/master/vmware-explore-emea.md'
download_dir = '/mnt/f/vmware/sessions/'  # full path to the download dir, must end with /
def name_file(name, description, ext):
    description = description.replace(" ", "_")             # replace all spaces with underscores
    description = re.sub(r'[<>:"/\\|?*]', '', description)  # remove path-unsafe characters
    description = description.replace("’", "")              # drop curly apostrophes
    description = description.replace("\u00a0", " ")        # replace non-breaking spaces with regular spaces
    description = description.replace("&", "and")           # replace ampersands with "and"
    description = description.replace("–", "-")             # replace en dashes with hyphens
    description = re.sub(r'&[a-zA-Z]+;', '_', description)  # replace any HTML entities missed above
    description = description.strip()                       # strip whitespace at beginning and end
    # file_name = f'{name}-{description}.{ext}'
    file_name = f'{name}.{ext}'  # the sanitized description is currently unused in the name
    return file_name
def already_downloaded(file_name):
    return os.path.isfile(download_dir + file_name)
def download_file_fake(url, save_path):
    # Dry-run stand-in for download_file: prints what would be downloaded without fetching it
    print(f"Download file from: {url}")
    print(f"File downloaded: {save_path}")
def download_file(url, save_path):
    # Send a GET request with stream=True so large files are downloaded in chunks
    with requests.get(url, stream=True) as response:
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Get the total file size in bytes
            total_size = int(response.headers.get('content-length', 0))
            # Create a tqdm progress bar
            progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)
            # Open the file in binary write mode
            with open(save_path, 'wb') as f:
                # Iterate through the content in chunks and update the progress bar
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
            # Close the progress bar
            progress_bar.close()
            print(f"File downloaded: {save_path}")
        else:
            print(f"Failed to download the file. Status code: {response.status_code}")
if __name__ == '__main__':
    response = requests.post(url, headers=headers, data=search + filters + str(offset))
    data = response.json()
    total_count = int(data['count'])
    num_pages = (total_count // int(limit)) + 1
    videos = data['videos']
    complete_session_info = []
    print(f"The count is {total_count}.")
    print(f"The response contains {len(videos)} videos.")
    print(f"The number of pages to fetch is {num_pages}.")
    for page in range(1, num_pages):
        offset += int(limit)
        print(f"Fetching more data, now at {page}/{num_pages}.")
        response = requests.post(url, headers=headers, data=search + filters + str(offset))
        data = response.json()
        videos += data['videos']
        print(f"The response now contains {len(videos)} videos.")
    # print(json.dumps(videos, indent=4))

    # Fetch the markdown file with the slide links once, rather than once per video
    slides_response = requests.get(slides_url)
    md_links_to_slides = slides_response.text
    for video in videos:
        name = video.get('name', 'unknown')                # default to 'unknown' if 'name' is not present
        description = video.get('description', 'unknown')  # default to 'unknown' if 'description' is not present
        # Iterate over the sources and pick the https link for the MP4/H.264 file
        sources = video.get('sources', [])
        for source in sources:
            codec = source.get('codec', '')
            container = source.get('container', '')
            src = source.get('src', '')
            # Check if the codec is H264 and the container is MP4
            if 'H264' in codec and 'MP4' in container and src.startswith('https'):
                url = src
                ext = 'mp4'
                file_name = name_file(name, description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file_fake(url, download_dir + file_name)
                break  # we only download one
        # Iterate over the text tracks and pick the https link for the VTT file
        sources = video.get('textTracks', [])
        for source in sources:
            mimetype = source.get('mimeType', '')
            src = source.get('src', '')
            # Check the mime type; 'webvtt-DISABLED' never matches, so subtitle downloads are switched off
            if 'webvtt-DISABLED' in mimetype and src.startswith('https'):
                url = src
                ext = 'vtt'
                file_name = name_file(name, description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file_fake(url, download_dir + file_name)
                break  # we only download one
        # Iterate over the poster sources and pick the link for the jpg file
        sources = video.get('posterSources', [])
        for source in sources:
            src = source.get('src', '')
            if src.endswith('jpg'):
                url = src
                ext = 'jpg'
                file_name = name_file(name, description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file(url, download_dir + file_name)
                break  # we only download one
        # Iterate over the thumbnail sources and pick the link for the jpg file
        sources = video.get('thumbnailSources', [])
        for source in sources:
            src = source.get('src', '')
            if src.endswith('jpg'):
                url = src
                ext = 'jpg'
                file_name = name_file(name + '_thumb', description, ext)
                if not already_downloaded(file_name):
                    print(f"Downloading: {name}")
                    download_file(url, download_dir + file_name)
                break  # we only download one
        # Create a JSON file with info per video
        customfields = video.get('customFields', [])
        tags = video.get('tags', [])
        createdat = video.get('createdAt', 'unknown')
        longdescription = video.get('longDescription', 'unknown')
        info = {
            "name": name,
            "description": description,
            "longdescription": longdescription,
            "customfields": customfields,
            "tags": tags,
            "createdat": createdat
        }
        complete_session_info.append(info)
        # complete_session_info[name["id"]] = new_item
        ext = 'json'
        file_name = name_file(name, description, ext)
        if not already_downloaded(file_name):
            with open(download_dir + file_name, "w") as info_file:
                json.dump(info, info_file, indent=4)
        # Download slides: search the markdown for a PDF link containing the session name
        pdf_url = re.search(r'\[[^\]]*\]\(([^)]*' + re.escape(name) + r'[^)]*\.pdf)\)', md_links_to_slides)
        if pdf_url:
            url = pdf_url.group(1)
            # print(f"PDF slides found at URL: {url}")
            ext = 'pdf'
            file_name = name_file(name, description, ext)
            if not already_downloaded(file_name):
                print(f"Downloading: {name}")
                download_file(url, download_dir + file_name)
        else:
            print(f"No PDF slides found for {name}.")
        # Create a txt file whose name carries the sanitized session description
        description = description.replace(" ", "_")              # replace all spaces with underscores
        description = re.sub(r'[<>:\'"/\\|?*]', '', description) # remove path-unsafe characters
        description = description.replace("’", "")               # drop curly apostrophes
        description = description.replace("\u00a0", " ")         # replace non-breaking spaces with regular spaces
        description = description.replace("&", "and")            # replace ampersands with "and"
        description = description.replace("–", "-")              # replace en dashes with hyphens
        description = re.sub(r'&[a-zA-Z]+;', '_', description)   # replace any HTML entities missed above
        description = description.strip()                        # strip whitespace at beginning and end
        description = description.replace(" ", "_")              # replace any remaining spaces with underscores
        ext = 'txt'
        file_name = name_file(name, description, ext)
        if "." in file_name:
            # Split the file name at the last "."
            base, ext = os.path.splitext(file_name)
            # Add the description as a suffix to the base
            new_base = base + "-" + description
            # Concatenate the base, suffix, and extension
            file_name = new_base + ext
        if not already_downloaded(file_name):
            with open(download_dir + file_name, "w") as txt_file:
                txt_file.write(name)
    # Create sessions.json with the collected info for all videos
    file_name = 'sessions.json'
    if not already_downloaded(file_name):
        with open(download_dir + file_name, "w") as sessions_file:
            json.dump(complete_session_info, sessions_file, indent=4)
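A minimal way to run the script, assuming it is saved as download_explore_sessions.py (the filename is illustrative) and download_dir points at an existing directory:

pip install requests tqdm
python download_explore_sessions.py

Files already present in download_dir are skipped via already_downloaded, so the script can be re-run to resume an interrupted download pass.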