Created
April 21, 2024 10:24
-
-
Save YuenSzeHong/4bd63d187f7ba1c1948b2c23c508fcb6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pathlib | |
from bs4 import BeautifulSoup | |
from concurrent.futures import ThreadPoolExecutor | |
import time | |
import re | |
# Media-kind prefixes. Not referenced by any code visible in this file;
# presumably the prefixes used in the media directory listing -- TODO confirm.
FILE_TYPE = {
    'video',
    'panorama',
    'res',
    'album',
    'photo',
    'audio',
}

# Cube-face path segments used when building panorama tile URLs.
# NOTE(review): presumably front/back/left/right/up/down -- confirm.
FACES = [
    'f',
    'b',
    'l',
    'r',
    'u',
    'd',
]
def get_soup(url:str, timeout:float = 30) -> BeautifulSoup:
    """Fetch *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests`` waits indefinitely, so a
            stalled server would hang the whole script.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
def download(url:str, path:str = './', filename:str|None = None, timeout:float = 30) -> None:
    """Download *url* into the directory *path*.

    The body is streamed to a temporary ``<filename>.part`` file and only
    renamed to its final name once the transfer has finished, so a failed
    or interrupted download never leaves a truncated file behind.

    Args:
        url: Address of the resource to download.
        path: Target directory; created (with parents) if it is missing.
        filename: Name of the file inside *path*. Defaults to the last
            path segment of *url*.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        ValueError: If no filename was given and none can be derived from
            *url* (for example when the URL ends with a slash).
        RuntimeError: If the server answers with a status other than 200.
        requests.RequestException: On connection problems or timeout.
    """
    if filename is None:
        filename = url.split('/')[-1]
    if not filename:
        raise ValueError(f'Cannot derive a filename from {url}')

    target_dir = pathlib.Path(path)
    # exist_ok keeps this safe when several worker threads create the
    # same directory at the same time.
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / filename
    partial = target.with_name(target.name + '.part')

    print(f'Downloading {filename} to {path} ...')
    # Request first and check the status before touching the disk, and
    # stream the body so large videos are not held in memory as a whole.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:  # e.g. rate limited; caller may retry
            raise RuntimeError(f'Failed to download {url} with status code {response.status_code}')
        try:
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        except Exception:
            # Do not leave a half-written temporary file around.
            partial.unlink(missing_ok=True)
            raise
    partial.replace(target)
# Matches links such as 'panorama_983139E2_8EAC_8E11_41C1_310F250215F9_0/'
# and captures the ID between 'panorama_' and the trailing '_<number>/'.
# '/' is excluded from the ID so the capture cannot run across path segments.
_PANORAMA_HREF_RE = re.compile(r'panorama_([^\s/]+)_[0-9]+/')


def get_panoramas(soup: "BeautifulSoup") -> list[str]:
    """Collect the panorama IDs linked from a parsed page.

    Args:
        soup: Parsed page; every ``<a>`` element with an ``href`` is inspected.

    Returns:
        The distinct panorama IDs in order of first appearance. An ID that
        is linked more than once (for example ``..._0/`` and ``..._1/``) is
        returned only once, so it is not downloaded repeatedly.
    """
    # A dict keeps insertion order, which gives an ordered de-duplication.
    found: dict[str, None] = {}
    for item in soup.find_all('a', href=True):
        match = _PANORAMA_HREF_RE.search(item['href'])
        if match:
            found.setdefault(match.group(1), None)
    return list(found)
# Matches video links and captures the ID after 'video_'. Two forms are
# accepted:
#   * the file form, e.g.
#     https://www.mfhk-vt.com/RailwayMuseum/TC/media/video_4E054099_40E5_B381_41C6_48E7BF73644B.mp4
#   * the directory form 'video_<id>_<number>/' that the previous pattern
#     looked for (kept for backward compatibility).
_VIDEO_HREF_RE = re.compile(r'video_([^\s/]+?)(?:\.mp4|_[0-9]+/)')


def get_videos(soup: "BeautifulSoup") -> list[str]:
    """Collect the video IDs linked from a parsed page.

    Args:
        soup: Parsed page; every ``<a>`` element with an ``href`` is inspected.

    Returns:
        The distinct video IDs in order of first appearance. Each ID is the
        part between ``video_`` and either ``.mp4`` or a trailing
        ``_<number>/``.
    """
    # A dict keeps insertion order, which gives an ordered de-duplication.
    found: dict[str, None] = {}
    for item in soup.find_all('a', href=True):
        match = _VIDEO_HREF_RE.search(item['href'])
        if match:
            found.setdefault(match.group(1), None)
    return list(found)
def download_panorama(url:str, panorama:str, base_dir=None, max_workers:int = 2) -> None:
    """Download the cube faces and the HD thumbnail of one panorama.

    Files are written to ``<base_dir>/<panorama>/cube/<face>.jpg`` (one per
    entry in ``FACES``) and ``<base_dir>/<panorama>/hd_t.jpg``.

    Args:
        url: Base media URL; the panorama paths are appended to it directly,
            so it is expected to end with a slash.
        panorama: Panorama ID as it appears in the media file names.
        base_dir: Directory under which the panorama folder is created.
            ``None`` (the default) means the current directory.
        max_workers: Number of files of this panorama fetched in parallel.

    Raises:
        Exception: Whatever the first failing ``download`` call raised.
    """
    # pathlib.Path(None) raises TypeError, so the None default must be
    # mapped to a real directory before building the paths.
    root = pathlib.Path('./' if base_dir is None else base_dir) / panorama
    cube_path = root / 'cube'
    # Create the folders up front so the workers can write immediately.
    cube_path.mkdir(parents=True, exist_ok=True)

    # (source URL, target directory, target file name) for every file.
    # Sample URLs:
    #   .../panorama_<id>_0/b/vr/0.jpg
    #   .../panorama_<id>_hd_t.jpg
    jobs = [
        (f'{url}panorama_{panorama}_0/{face}/vr/0.jpg', str(cube_path), f'{face}.jpg')
        for face in FACES
    ]
    jobs.append((f'{url}panorama_{panorama}_hd_t.jpg', str(root), 'hd_t.jpg'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(download, *job) for job in jobs]
        # result() re-raises a worker's exception, so failures are not lost.
        for future in futures:
            future.result()
if __name__ == '__main__':
    # Entry point: read the media index once, then fetch every panorama and
    # video found there using a small thread pool.
    url = 'https://www.mfhk-vt.com/RailwayMuseum/TC/media/'
    soup = get_soup(url)
    panoramas = get_panoramas(soup)
    videos = get_videos(soup)

    # Nothing else creates the video folder, so make sure it exists before
    # any worker tries to write into it.
    pathlib.Path('./video').mkdir(parents=True, exist_ok=True)

    # Maps each submitted future to a label used when reporting failures.
    jobs = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        for panorama in panoramas:
            print(f'Downloading {panorama} ...')
            future = executor.submit(download_panorama, url, panorama, './panorama')
            jobs[future] = f'panorama {panorama}'
        for video in videos:
            print(f'Downloading {video} ...')
            future = executor.submit(download, f'{url}video_{video}.mp4', './video', f'{video}.mp4')
            jobs[future] = f'video {video}'

    # Leaving the with-block waits for all workers, so every future is done
    # here. An exception raised inside a worker is only stored on its
    # future; report it explicitly instead of dropping it silently.
    failures = 0
    for future, label in jobs.items():
        error = future.exception()
        if error is not None:
            failures += 1
            print(f'Failed to download {label}: {error}')
    if failures:
        raise SystemExit(f'{failures} of {len(jobs)} downloads failed')
# https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_0/b/vr/0.jpg | |
# https://www.mfhk-vt.com/RailwayMuseum/TC/media/panorama_983139E2_8EAC_8E11_41C1_310F250215F9_hd_t.jpg | |
# panorama_2545F013_29F7_DB3C_41BD_FB73EE7B31D6_hd_t.jpg |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment