Created
March 6, 2024 02:27
-
-
Save shangeth/3aedd9d883f94fe0c2bf2d7405628ea8 to your computer and use it in GitHub Desktop.
This Python script automates downloading and extracting .tar files from the Common Voice dataset on Hugging Face, using a Hugging Face token for authorization. It creates directories based on set types (e.g., "test"), downloads specified .tar files, extracts their contents, and cleans up by removing the .tar files post-extraction. Ideal for res…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import os | |
import tarfile | |
# Hugging Face token.
# The HF_TOKEN environment variable, when set to a non-empty value, takes
# precedence so the secret does not have to live in the source file;
# otherwise the placeholder below must be replaced by hand.
hf_token = os.environ.get("HF_TOKEN") or "<HF_TOKEN_HERE>"
headers = {"Authorization": f"Bearer {hf_token}"}

# Split to fetch and the number of .tar files to request for it.
# NOTE: `set` shadows the builtin of the same name; the name is kept because
# the rest of the script refers to it.
set = "test"  # train|test|dev
n_files = 1  # train=27|test=1|dev=1

# Directory to save and extract files.
save_dir = f"/root/shangeth/t0/mm-llm/data/CommonVoice/data/{set}"
# exist_ok=True avoids the check-then-create race of os.path.exists() +
# os.makedirs() and is a no-op when the directory is already there.
os.makedirs(save_dir, exist_ok=True)

# Base URL for the files (file names are appended directly, hence the
# trailing slash).
base_url = f"https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1/resolve/main/audio/en/{set}/"
# Function to download and extract files
def download_and_extract(file_name, timeout=(10, 60)):
    """Download one .tar archive from ``base_url`` and unpack it into ``save_dir``.

    The archive is streamed to ``save_dir/file_name``, extracted in place and
    then deleted. The local archive is removed even when the download or the
    extraction fails, so no partial file is left behind.

    Args:
        file_name: Name of the archive; appended to ``base_url`` to form the
            URL and joined to ``save_dir`` to form the local path.
        timeout: Timeout in seconds forwarded to ``requests.get``, either a
            single number or a ``(connect, read)`` tuple. Without it a
            stalled connection would block forever.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or timeouts.
        tarfile.TarError: If the downloaded file is not a readable archive or
            a member is rejected by the extraction filter.
    """
    url = f"{base_url}{file_name}"
    local_filename = os.path.join(save_dir, file_name)

    try:
        # Download the file in chunks so it never has to fit in memory.
        with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Extract the tar file.
        with tarfile.open(local_filename) as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter rejects members that would be written
                # outside save_dir (absolute paths, "..", unsafe links).
                tar.extractall(path=save_dir, filter="data")
            else:
                # Older interpreters have no extraction filters.
                tar.extractall(path=save_dir)
    finally:
        # Remove the tar file after extraction, and also after a failure.
        try:
            os.remove(local_filename)
        except FileNotFoundError:
            # The request failed before the file was created.
            pass
# Fetch every archive of the selected split; indices run from 0 to n_files - 1.
archive_names = (f"en_{set}_{index}.tar" for index in range(n_files))
for file_name in archive_names:
    print(f"Downloading and extracting {file_name}...")
    download_and_extract(file_name)

print("All files downloaded and extracted.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment