# Source: GitHub Gist by @YuenSzeHong (created April 21, 2024 10:24)
# Gist ID: YuenSzeHong/4bd63d187f7ba1c1948b2c23c508fcb6
import requests
import pathlib
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import time
import re
# Media category names. No function in the visible part of this script reads
# this set.
FILE_TYPE = {
    'album',
    'audio',
    'panorama',
    'photo',
    'res',
    'video',
}

# One-letter cube-face names used as path segments when building panorama
# image URLs (presumably front/back/left/right/up/down -- confirm against the
# site's directory layout).
FACES = list('fblrud')
def get_soup(url:str, timeout:float = 30) -> BeautifulSoup:
    """Fetch *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The response text parsed with BeautifulSoup's ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
def download(url:str, path:str = './', filename:str|None = None, timeout:float = 30) -> None:
    """Download *url* into the directory *path*.

    The target directory must already exist; this function does not create it.

    Args:
        url: Address of the file to download.
        path: Existing directory to write into.
        filename: Name of the output file. Defaults to the last ``/``-separated
            segment of *url*.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        Exception: If the server answers with any status other than 200
            (for example when rate limited). No retry is attempted here.
    """
    if filename is None:
        filename = url.split('/')[-1]
    target = pathlib.Path(path) / filename
    print(f'Downloading {filename} to {path} ...')
    # Issue the request *before* opening the file so that a failed download
    # does not leave an empty file behind, and stream the body so large
    # videos are not held in memory all at once.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f'Failed to download {url} with status code {response.status_code}')
        with open(target, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
def get_panoramas(soup: BeautifulSoup) -> list[str]:
    """Collect the panorama IDs linked from a parsed page.

    A link counts when its ``href`` contains ``panorama_<id>_<digits>/``; the
    ``<id>`` part is returned, e.g. ``983139E2_8EAC_8E11_41C1_310F250215F9``
    for ``panorama_983139E2_8EAC_8E11_41C1_310F250215F9_0/``.

    Args:
        soup: Parsed page whose ``<a href=...>`` elements are scanned.

    Returns:
        The distinct panorama IDs in order of first appearance. An ID linked
        more than once is reported once, so a caller never downloads the same
        panorama twice.
    """
    # The ID may not contain '/', which keeps a match inside one path segment
    # instead of letting it run on to a later "_<digits>/" in the same href.
    pattern = re.compile(r'panorama_([^\s/]+)_[0-9]+/')
    found = (
        match.group(1)
        for item in soup.find_all('a', href=True)
        if (match := pattern.search(item['href']))
    )
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))
def get_videos(soup: BeautifulSoup) -> list[str]:
    """Collect the video IDs linked from a parsed page.

    Video files are linked as ``video_<id>.mp4``, for example
    https://www.mfhk-vt.com/RailwayMuseum/TC/media/video_4E054099_40E5_B381_41C6_48E7BF73644B.mp4
    The earlier pattern only accepted ``video_<id>_<digits>/`` and therefore
    never matched such links; that directory form is still accepted so hrefs
    that matched before keep matching.

    Args:
        soup: Parsed page whose ``<a href=...>`` elements are scanned.

    Returns:
        The distinct video IDs (without the ``video_`` prefix and without the
        ``.mp4`` / ``_<digits>/`` suffix) in order of first appearance.
    """
    # Lazy ID so the match stops at the first ".mp4" or "_<digits>/" suffix;
    # '/' is excluded to keep the ID inside a single path segment.
    pattern = re.compile(r'video_([^\s/]+?)(?:\.mp4|_[0-9]+/)')
    found = (
        match.group(1)
        for item in soup.find_all('a', href=True)
        if (match := pattern.search(item['href']))
    )
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))
def download_panorama(url:str, panorama:str, base_dir=None) -> None:
    """Download the six cube faces and the HD thumbnail of one panorama.

    Files are written to ``<base_dir>/<panorama>/cube/<face>.jpg`` and
    ``<base_dir>/<panorama>/hd_t.jpg``; the directories are created if needed.

    Args:
        url: Media base URL, ending in ``/``.
        panorama: Panorama ID as it appears between ``panorama_`` and the
            trailing ``_0`` in the media URLs.
        base_dir: Directory under which the per-panorama folder is created.
            ``None`` means the current directory.

    Raises:
        Exception: Re-raised from the first download that failed.
    """
    # pathlib.Path(None) raises TypeError, so give the None default a meaning.
    root = pathlib.Path('./' if base_dir is None else base_dir) / panorama
    cube_path = root / 'cube'
    # download() does not create directories, so make them before submitting.
    cube_path.mkdir(parents=True, exist_ok=True)
    with ThreadPoolExecutor(max_workers=2) as executor:
        # e.g. <url>panorama_<id>_0/b/vr/0.jpg
        futures = [
            executor.submit(
                download,
                f'{url}panorama_{panorama}_0/{face}/vr/0.jpg',
                str(cube_path),
                f'{face}.jpg',
            )
            for face in FACES
        ]
        # e.g. <url>panorama_<id>_hd_t.jpg
        futures.append(
            executor.submit(
                download,
                f'{url}panorama_{panorama}_hd_t.jpg',
                str(root),
                'hd_t.jpg',
            )
        )
        # result() re-raises any exception from the worker thread; without
        # this call a failed download would go unnoticed.
        for future in futures:
            future.result()
if __name__ == '__main__':
    # Scrape the media listing, then fetch every panorama and video it links.
    url = 'https://www.mfhk-vt.com/RailwayMuseum/TC/media/'
    soup = get_soup(url)
    panoramas = get_panoramas(soup)
    videos = get_videos(soup)
    # download() does not create its target directory, so ./video has to
    # exist before any video job runs (download_panorama creates its own).
    pathlib.Path('./video').mkdir(parents=True, exist_ok=True)
    jobs = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        for panorama in panoramas:
            print(f'Downloading {panorama} ...')
            jobs.append((panorama, executor.submit(download_panorama, url, panorama, './panorama')))
        for video in videos:
            print(f'Downloading {video} ...')
            jobs.append((video, executor.submit(download, f'{url}video_{video}.mp4', './video', f'{video}.mp4')))
    # submit() stores a worker's exception on its future; it only surfaces
    # when result() is called. Report each failure instead of dropping it,
    # and keep going so one bad item does not hide the others.
    for name, job in jobs:
        try:
            job.result()
        except Exception as exc:
            print(f'Failed to download {name}: {exc}')
    # Sample URLs:
    # https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_0/b/vr/0.jpg
    # https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_hd_t.jpg
    # panorama_2545F013_29F7_DB3C_41BD_FB73EE7B31D6_hd_t.jpg
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)