YuenSzeHong · April 21, 2024 10:24
diff --git a/download.py b/download.py
 import requests
 import pathlib
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor
 import time
 import re

 FILE_TYPE = {'video', 'panorama', 'res', 'album', 'photo', 'audio'}

 FACES = ['f', 'b', 'l', 'r', 'u', 'd']


 def get_soup(url:str) -> BeautifulSoup:
    """Fetch ``url`` and return its body parsed with ``html.parser``.

    ``raise_for_status`` is called on the response before parsing, so an
    error status from the server surfaces as an exception instead of being
    parsed as if it were the expected page.
    """
    page = requests.get(url)
    page.raise_for_status()
    markup = page.text
    return BeautifulSoup(markup, 'html.parser')

 def download(url:str, path:str = './', filename:str|None = None) -> None:
    """Download ``url`` and save it as ``path/filename``.

    Args:
        url: Address of the file to fetch.
        path: Directory to write into. It is not created here, so it must
            already exist.
        filename: Name of the saved file. Defaults to the last
            ``/``-separated segment of ``url``.

    Raises:
        Exception: If the server answers with any status other than 200.
    """
    if filename is None:
        filename = url.split('/')[-1]

    print(f'Downloading {filename} to {path} ...')

    # Fetch before opening the destination: opening first would leave an
    # empty file behind whenever the request fails.
    response = requests.get(url)
    if response.status_code != 200:  # possibly rate limited; retrying is left to the caller
        raise Exception(f'Failed to download {url} with status code {response.status_code}')

    with open(pathlib.Path(path) / filename, 'wb') as f:
        f.write(response.content)

 def get_panoramas(soup: BeautifulSoup) -> list[str]:
    """Return the unique panorama IDs linked from ``soup``, in first-seen order.

    Every ``<a href>`` is searched for ``panorama_<ID>_<number>/``; ``<ID>`` is
    the text between ``panorama_`` and the last ``_<number>/`` of the link.
    A panorama that is linked several times (for example once per cube face)
    is reported only once, so callers do not download it repeatedly.
    """
    pattern = re.compile(r'panorama_([^\s]+)_[0-9]+/')

    found = []
    for item in soup.find_all('a', href=True):
        match = pattern.search(item['href'])
        if match:
            found.append(match.group(1))

    # dict.fromkeys drops repeats while keeping insertion order
    return list(dict.fromkeys(found))


 def get_videos(soup: BeautifulSoup) -> list[str]:
    """Return the unique video IDs linked from ``soup``, in first-seen order.

    Two link shapes are recognised:

    * ``video_<ID>_<number>/`` - a numbered directory, and
    * ``video_<ID>.mp4`` - the video file itself, e.g.
      https://www.mfhk-vt.com/RailwayMuseum/TC/media/video_4E054099_40E5_B381_41C6_48E7BF73644B.mp4

    The second shape is the one the download URL is built from; a pattern
    that only accepts the directory form can never match such a link.
    """
    directory_link = re.compile(r'video_([^\s]+)_[0-9]+/')
    file_link = re.compile(r'video_([^\s/]+?)\.mp4')

    videos = []
    for item in soup.find_all('a', href=True):
        href = item['href']
        # The directory form is tried first so that links which already
        # matched keep yielding exactly the same ID.
        match = directory_link.search(href) or file_link.search(href)
        if match:
            videos.append(match.group(1))

    # dict.fromkeys drops repeats while keeping insertion order
    return list(dict.fromkeys(videos))


 def download_panorama(url:str, panorama:str, base_dir=None) -> None:
    """Download the six cube faces and the HD thumbnail of one panorama.

    Files are written to ``<base_dir>/<panorama>/cube/<face>.jpg`` and
    ``<base_dir>/<panorama>/hd_t.jpg``; the directories are created if needed.

    Args:
        url: Media base URL. It is concatenated directly with the file
            names, so it must end with ``/``.
        panorama: Panorama ID used in both the remote and the local paths.
        base_dir: Parent directory for the panorama folder. ``None`` means
            the current directory.

    Raises:
        Exception: Whatever the first failing download raised.
    """
    # pathlib.Path(None) raises TypeError, so give the default a real meaning.
    base_dir = pathlib.Path('.' if base_dir is None else base_dir) / panorama
    cube_path = base_dir / 'cube'

    # parents=True creates base_dir as well, before any download starts
    cube_path.mkdir(parents=True, exist_ok=True)

    with ThreadPoolExecutor(max_workers=2) as executor:
        # e.g. <url>panorama_<ID>_0/b/vr/0.jpg
        futures = [
            executor.submit(download, f'{url}panorama_{panorama}_0/{face}/vr/0.jpg', str(cube_path), f'{face}.jpg')
            for face in FACES
        ]

        # e.g. <url>panorama_<ID>_hd_t.jpg
        futures.append(executor.submit(download, f'{url}panorama_{panorama}_hd_t.jpg', str(base_dir), 'hd_t.jpg'))

        # result() re-raises an exception raised inside the worker
        for future in futures:
            future.result()


 if __name__ == '__main__':
    # Script entry point: scrape the media index, then download every
    # panorama and video found there using a small thread pool.
    url = 'https://www.mfhk-vt.com/RailwayMuseum/TC/media/'

    soup = get_soup(url)

    panoramas = get_panoramas(soup)
    videos = get_videos(soup)

    # download() does not create directories, so the video target must exist
    # before the first video job runs.
    pathlib.Path('./video').mkdir(parents=True, exist_ok=True)

    jobs = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        for panorama in panoramas:
            print(f'Downloading {panorama} ...')
            jobs[executor.submit(download_panorama, url, panorama, './panorama')] = panorama

        for video in videos:
            print(f'Downloading {video} ...')
            jobs[executor.submit(download, f'{url}video_{video}.mp4', './video', f'{video}.mp4')] = video

    # An exception raised in a worker stays hidden inside its future unless
    # it is inspected, so report every failed job once the pool has drained.
    for job, name in jobs.items():
        error = job.exception()
        if error is not None:
            print(f'Failed to download {name}: {error}')


 # https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_0/b/vr/0.jpg
 # https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_hd_t.jpg

 # panorama_2545F013_29F7_DB3C_41BD_FB73EE7B31D6_hd_t.jpg
	import requests
	import pathlib
	from bs4 import BeautifulSoup
	from concurrent.futures import ThreadPoolExecutor
	import time
	import re

	FILE_TYPE = {'video', 'panorama', 'res', 'album', 'photo', 'audio'}

	FACES = ['f', 'b', 'l', 'r', 'u', 'd']


	def get_soup(url:str) -> BeautifulSoup:
	    """Fetch ``url`` and return its body parsed with ``html.parser``.

	    ``raise_for_status`` is called on the response before parsing, so an
	    error status from the server surfaces as an exception instead of being
	    parsed as if it were the expected page.
	    """
	    response = requests.get(url)
	    response.raise_for_status()
	    return BeautifulSoup(response.text, 'html.parser')

	def download(url:str, path:str = './', filename:str|None = None) -> None:
	    """Download ``url`` and save it as ``path/filename``.

	    Args:
	        url: Address of the file to fetch.
	        path: Directory to write into. It is not created here, so it must
	            already exist.
	        filename: Name of the saved file. Defaults to the last
	            ``/``-separated segment of ``url``.

	    Raises:
	        Exception: If the server answers with any status other than 200.
	    """
	    if filename is None:
	        filename = url.split('/')[-1]

	    print(f'Downloading {filename} to {path} ...')

	    # Fetch before opening the destination: opening first would leave an
	    # empty file behind whenever the request fails.
	    response = requests.get(url)
	    if response.status_code != 200:  # possibly rate limited; retrying is left to the caller
	        raise Exception(f'Failed to download {url} with status code {response.status_code}')

	    with open(pathlib.Path(path) / filename, 'wb') as f:
	        f.write(response.content)

	def get_panoramas(soup: BeautifulSoup) -> list[str]:
	    """Return the unique panorama IDs linked from ``soup``, in first-seen order.

	    Every ``<a href>`` is searched for ``panorama_<ID>_<number>/``; ``<ID>`` is
	    the text between ``panorama_`` and the last ``_<number>/`` of the link.
	    A panorama that is linked several times (for example once per cube face)
	    is reported only once, so callers do not download it repeatedly.
	    """
	    pattern = re.compile(r'panorama_([^\s]+)_[0-9]+/')

	    found = []
	    for item in soup.find_all('a', href=True):
	        match = pattern.search(item['href'])
	        if match:
	            found.append(match.group(1))

	    # dict.fromkeys drops repeats while keeping insertion order
	    return list(dict.fromkeys(found))


	def get_videos(soup: BeautifulSoup) -> list[str]:
	    """Return the unique video IDs linked from ``soup``, in first-seen order.

	    Two link shapes are recognised:

	    * ``video_<ID>_<number>/`` - a numbered directory, and
	    * ``video_<ID>.mp4`` - the video file itself, e.g.
	      https://www.mfhk-vt.com/RailwayMuseum/TC/media/video_4E054099_40E5_B381_41C6_48E7BF73644B.mp4

	    The second shape is the one the download URL is built from; a pattern
	    that only accepts the directory form can never match such a link.
	    """
	    directory_link = re.compile(r'video_([^\s]+)_[0-9]+/')
	    file_link = re.compile(r'video_([^\s/]+?)\.mp4')

	    videos = []
	    for item in soup.find_all('a', href=True):
	        href = item['href']
	        # The directory form is tried first so that links which already
	        # matched keep yielding exactly the same ID.
	        match = directory_link.search(href) or file_link.search(href)
	        if match:
	            videos.append(match.group(1))

	    # dict.fromkeys drops repeats while keeping insertion order
	    return list(dict.fromkeys(videos))


	def download_panorama(url:str, panorama:str, base_dir=None) -> None:
	    """Download the six cube faces and the HD thumbnail of one panorama.

	    Files are written to ``<base_dir>/<panorama>/cube/<face>.jpg`` and
	    ``<base_dir>/<panorama>/hd_t.jpg``; the directories are created if needed.

	    Args:
	        url: Media base URL. It is concatenated directly with the file
	            names, so it must end with ``/``.
	        panorama: Panorama ID used in both the remote and the local paths.
	        base_dir: Parent directory for the panorama folder. ``None`` means
	            the current directory.

	    Raises:
	        Exception: Whatever the first failing download raised.
	    """
	    # pathlib.Path(None) raises TypeError, so give the default a real meaning.
	    base_dir = pathlib.Path('.' if base_dir is None else base_dir) / panorama
	    cube_path = base_dir / 'cube'

	    # parents=True creates base_dir as well, before any download starts
	    cube_path.mkdir(parents=True, exist_ok=True)

	    with ThreadPoolExecutor(max_workers=2) as executor:
	        # e.g. <url>panorama_<ID>_0/b/vr/0.jpg
	        futures = [
	            executor.submit(download, f'{url}panorama_{panorama}_0/{face}/vr/0.jpg', str(cube_path), f'{face}.jpg')
	            for face in FACES
	        ]

	        # e.g. <url>panorama_<ID>_hd_t.jpg
	        futures.append(executor.submit(download, f'{url}panorama_{panorama}_hd_t.jpg', str(base_dir), 'hd_t.jpg'))

	        # result() re-raises an exception raised inside the worker
	        for future in futures:
	            future.result()


	if __name__ == '__main__':
	    # Script entry point: scrape the media index, then download every
	    # panorama and video found there using a small thread pool.
	    url = 'https://www.mfhk-vt.com/RailwayMuseum/TC/media/'

	    soup = get_soup(url)

	    panoramas = get_panoramas(soup)
	    videos = get_videos(soup)

	    # download() does not create directories, so the video target must exist
	    # before the first video job runs.
	    pathlib.Path('./video').mkdir(parents=True, exist_ok=True)

	    jobs = {}
	    with ThreadPoolExecutor(max_workers=4) as executor:
	        for panorama in panoramas:
	            print(f'Downloading {panorama} ...')
	            jobs[executor.submit(download_panorama, url, panorama, './panorama')] = panorama

	        for video in videos:
	            print(f'Downloading {video} ...')
	            jobs[executor.submit(download, f'{url}video_{video}.mp4', './video', f'{video}.mp4')] = video

	    # An exception raised in a worker stays hidden inside its future unless
	    # it is inspected, so report every failed job once the pool has drained.
	    for job, name in jobs.items():
	        error = job.exception()
	        if error is not None:
	            print(f'Failed to download {name}: {error}')


	# https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_0/b/vr/0.jpg
	# https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_hd_t.jpg

	# panorama_2545F013_29F7_DB3C_41BD_FB73EE7B31D6_hd_t.jpg