Last active
August 19, 2023 08:26
-
-
Save aayushdutt/c7b72e70d6930ea3881af4d4494adf38 to your computer and use it in GitHub Desktop.
Bash script to scrape Feynman Lecture Recordings Playlist from https://www.feynmanlectures.caltech.edu/flptapes.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Maximum number of downloads allowed to run at the same time.
# Defaults to 9; a value already present in the environment takes precedence,
# so the limit can be tuned without editing this file.
num_concurrent_downloads=${num_concurrent_downloads:-9}
#######################################
# Download a single recording with curl, sending browser-like request
# headers (Referer of the tapes page, Chrome User-Agent, Sec-Fetch-*).
# Arguments:
#   $1 - full URL to fetch
#   $2 - path of the local file to write
# Outputs:
#   Progress messages on stdout; a failure message on stderr.
# Returns:
#   0 on success, otherwise curl's exit status.
#######################################
perform_curl() {
  local url=$1
  local filename=$2
  local status=0

  # Values are passed as printf arguments, never inside the format string:
  # URLs routinely contain percent-escapes (e.g. %20) that printf would
  # otherwise interpret as conversion directives.
  printf '\nDownloading: %s from %s\n' "$filename" "$url"

  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # server's error page under the recording's file name.
  curl "$url" \
    --fail \
    -H 'Accept: */*' \
    -H 'Accept-Language: en-GB,en-US;q=0.9,en;q=0.8' \
    -H 'Connection: keep-alive' \
    -H 'DNT: 1' \
    -H 'Referer: https://www.feynmanlectures.caltech.edu/flptapes.html' \
    -H 'Sec-Fetch-Dest: audio' \
    -H 'Sec-Fetch-Mode: no-cors' \
    -H 'Sec-Fetch-Site: same-origin' \
    -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36' \
    --compressed -o "$filename" || status=$?

  if ((status != 0)); then
    printf '\nFAILED (curl exit %d): %s\n' "$status" "$filename" >&2
    return "$status"
  fi

  printf '\nCOMPLETE: %s\n' "$filename"
}
#######################################
# Download every given recording, keeping a bounded number of transfers
# running in parallel. Requires bash 4.3+ for `wait -n`.
# Globals:
#   num_concurrent_downloads (read) - maximum simultaneous transfers;
#     9 is used when it is unset, empty, or not a positive integer
# Arguments:
#   Server-relative paths; each one is appended to
#   https://www.feynmanlectures.caltech.edu/protected and saved under its
#   base name in the current directory.
# Outputs:
#   A summary line on stdout plus whatever perform_curl prints; a failure
#   count on stderr when something went wrong.
# Returns:
#   0 if every download succeeded, 1 if at least one failed.
#######################################
download_concurrently() {
  local -r base_url='https://www.feynmanlectures.caltech.edu/protected'
  local -a urls=("$@")
  local -a pids=()
  local max_jobs=${num_concurrent_downloads:-9}
  local running=0
  local failed=0
  local url filename pid

  # A missing or malformed limit would break the throttle comparison below.
  if ! [[ "$max_jobs" =~ ^[1-9][0-9]*$ ]]; then
    max_jobs=9
  fi

  printf 'Downloading %d files\n\n' "${#urls[@]}"

  for url in "${urls[@]}"; do
    filename=$(basename -- "$url")
    perform_curl "${base_url}${url}" "$filename" &
    pids+=("$!")
    running=$((running + 1))

    # At capacity: block until any one transfer ends before starting more.
    # Its status is ignored here; every PID is checked individually below.
    if ((running >= max_jobs)); then
      wait -n
      running=$((running - 1))
    fi
  done

  # Collect the exit status of every transfer, including ones that have
  # already finished (bash remembers the status of background PIDs).
  for pid in "${pids[@]}"; do
    wait "$pid" || failed=$((failed + 1))
  done

  if ((failed > 0)); then
    printf '\n%d of %d downloads failed\n' "$failed" "${#urls[@]}" >&2
    return 1
  fi
}
#######################################
# Read the recordings JSON, extract each entry's "m4a" path with jq, and
# hand the resulting list to download_concurrently.
# Arguments:
#   $1 - path to the JSON file (an array of objects with an "m4a" field)
# Outputs:
#   Error messages on stderr; download progress from download_concurrently.
# Returns:
#   1 if the file is unreadable, jq is missing, or jq fails; otherwise the
#   status of download_concurrently.
#######################################
parse_json() {
  local json_file=$1
  local jq_output
  local -a m4a_urls=()

  if [[ ! -r "$json_file" ]]; then
    printf 'parse_json: cannot read JSON file: %s\n' "$json_file" >&2
    return 1
  fi

  if ! command -v jq >/dev/null 2>&1; then
    printf 'parse_json: jq is required but was not found\n' >&2
    return 1
  fi

  # Assign separately from `local` so a jq failure is not masked.
  # `// empty` drops entries without an m4a value instead of emitting "null".
  if ! jq_output=$(jq -r '.[] | .m4a // empty' "$json_file"); then
    printf 'parse_json: jq failed to parse %s\n' "$json_file" >&2
    return 1
  fi

  # One array element per output line: no word-splitting, no glob expansion.
  if [[ -n "$jq_output" ]]; then
    mapfile -t m4a_urls <<<"$jq_output"
  fi

  download_concurrently "${m4a_urls[@]}"
}
# NOTE: To obtain the JSON, open
# https://www.feynmanlectures.caltech.edu/flptapes.html, run
# console.log(recordings) in the browser console, and save the output to a file.
# Pass that file's path as the first argument; it defaults to source.json in
# the current directory.
json_file=${1:-source.json}
parse_json "$json_file"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@KayakerMagic Thanks a lot! I have updated the script to include your suggestion