@JacobFV
Last active April 21, 2024
huggingface_to_s3

Hugging Face Repository to S3 Transfer Script

This Python script allows you to transfer files from a Hugging Face repository to an Amazon S3 bucket. It iterates over all the files in the specified repository, downloads them one at a time, and uploads them to the designated S3 bucket.
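The listing the script iterates over is a JSON array of entries, each with a `path` and a `type`. A minimal sketch of separating files from subdirectories, using a hard-coded sample response (the field names here follow the Hugging Face Hub tree API; verify them against the live API for your repository):

```python
import json

# Sample of the JSON shape returned by the Hub tree API (illustrative values).
sample = json.loads("""
[
  {"type": "file", "path": "README.md", "size": 1024},
  {"type": "directory", "path": "data"}
]
""")

files = [entry["path"] for entry in sample if entry["type"] == "file"]
dirs = [entry["path"] for entry in sample if entry["type"] == "directory"]
print(files)  # ['README.md']
print(dirs)   # ['data']
```

Files are downloaded and uploaded directly; directories are listed again and recursed into, which is exactly what `process_files` in the script below does.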

Prerequisites

Before running the script, ensure that you have the following:

  • An EC2 instance launched from the Amazon Linux AMI with io2 storage provisioned at 10,000 IOPS; make sure the instance can make outbound HTTPS requests and accept inbound SSH connections

OR

  • Python 3.x installed on your system
  • AWS account with access to S3
  • Hugging Face repository details (owner, repository name, branch)
  • S3 bucket name for storing the transferred files

Setup

  1. SSH into your instance, either by clicking Connect in the top-right corner of the EC2 console or by using a traditional SSH client

  2. Open nano and paste the script below into huggingface_to_s3.py

  3. Install the required Python packages by running the following command:

    pip install boto3 requests
    
  4. Configure your AWS credentials using one of the following methods:

    • Assign an IAM role with S3 put permissions to your ec2 instance

    OR

    • Set up environment variables: AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
    • Use an AWS credentials file (~/.aws/credentials)
    • Configure the AWS CLI using aws configure
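For the environment-variable method, a minimal sketch (the key values below are placeholders, not real credentials; substitute your own):

```shell
export AWS_ACCESS_KEY_ID="AKIAEXAMPLEKEY"        # placeholder, not a real key
export AWS_SECRET_ACCESS_KEY="exampleSecretKey"  # placeholder, not a real secret
export AWS_DEFAULT_REGION="us-east-1"            # pick the region of your bucket
```

boto3 resolves credentials in a fixed order (explicit parameters, environment variables, credentials file, then instance role), so any one of the methods above is sufficient.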

Usage

To run the script, use the following command:

python huggingface_to_s3.py --repo-owner <repo_owner> --repo-name <repo_name> --branch <branch> --s3-bucket <s3_bucket_name>

Replace the placeholders with the appropriate values:

  • <repo_owner>: The owner of the Hugging Face repository.
  • <repo_name>: The name of the Hugging Face repository.
  • <branch>: The branch of the repository to transfer files from (default: "main").
  • <s3_bucket_name>: The name of the S3 bucket to store the transferred files.
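These placeholders map directly onto the download URL the script builds for each file. A quick sketch of how the pieces combine (the owner, repo, branch, and path values here are illustrative):

```python
# Illustrative values, standing in for the command-line arguments.
repo_owner = "some-owner"
repo_name = "my-repo"
branch = "main"
file_path = "data/train.json"

# Hugging Face serves raw file content at /resolve/<branch>/<path>.
file_url = (
    f"https://huggingface.co/{repo_owner}/{repo_name}"
    f"/resolve/{branch}/{file_path}"
)
print(file_url)  # https://huggingface.co/some-owner/my-repo/resolve/main/data/train.json
```

The same `file_path` is reused as the S3 object key, so the bucket mirrors the repository's directory layout.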

The script will create the S3 bucket if it doesn't already exist.
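The create-if-missing check boils down to: try `head_bucket`, create on a 404, and re-raise anything else (botocore's `ClientError` carries the status in `response["Error"]["Code"]`). The sketch below factors that decision into a function and exercises it against a stub client, so no AWS calls are made:

```python
class FakeClientError(Exception):
    """Offline stand-in for botocore.exceptions.ClientError."""
    def __init__(self, code):
        self.response = {"Error": {"Code": code}}

def ensure_bucket(client, name, error_cls=FakeClientError):
    """Return 'exists' or 'created'; re-raise any non-404 error."""
    try:
        client.head_bucket(Bucket=name)
        return "exists"
    except error_cls as e:
        if e.response["Error"]["Code"] == "404":
            client.create_bucket(Bucket=name)
            return "created"
        raise

class FakeS3:
    """Stub client: head_bucket raises a 404 for unknown buckets."""
    def __init__(self):
        self.buckets = set()
    def head_bucket(self, Bucket):
        if Bucket not in self.buckets:
            raise FakeClientError("404")
    def create_bucket(self, Bucket):
        self.buckets.add(Bucket)

s3 = FakeS3()
print(ensure_bucket(s3, "demo-bucket"))  # created
print(ensure_bucket(s3, "demo-bucket"))  # exists
```

With the real client, pass `boto3.client("s3")` and `botocore.exceptions.ClientError` instead of the fakes. Note that in regions other than us-east-1, `create_bucket` also needs a `CreateBucketConfiguration` with a `LocationConstraint`.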

Advice

  • Ensure that you have the necessary permissions to access the Hugging Face repository and the S3 bucket.
  • Be cautious when transferring large repositories: the transfer can take considerable time and consume significant network bandwidth.
  • Monitor the script's output for any error messages or warnings during the transfer process.
  • Regularly review and clean up the S3 bucket to avoid unnecessary storage costs.
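On the bandwidth point: the script loads each file fully into memory before uploading, which is fine for small files but risky for multi-gigabyte ones. A chunked read loop bounds memory use; sketched here against an in-memory buffer (with requests, the equivalent is `response.iter_content(chunk_size=...)` on a request made with `stream=True`, and boto3's `upload_fileobj` accepts a streaming file object directly):

```python
import io

def read_in_chunks(fileobj, chunk_size=8192):
    """Yield fixed-size chunks until the stream is exhausted."""
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            break
        yield chunk

# Simulate a 20 KiB download with an in-memory buffer.
source = io.BytesIO(b"x" * 20480)
chunks = list(read_in_chunks(source, chunk_size=8192))
print([len(c) for c in chunks])  # [8192, 8192, 4096]
```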

Troubleshooting

  • If you encounter any issues related to AWS credentials or permissions, double-check your AWS configuration and ensure that you have the required permissions to access S3.
  • If the script fails to retrieve files from the Hugging Face repository, verify that the repository details (owner, name, branch) are correct and that you have the necessary access rights.
  • If you experience network-related issues, check your internet connection and ensure that you can access the Hugging Face API and AWS S3 endpoints.

For more detailed information and advanced usage, please refer to the script's source code and the documentation of the respective libraries (boto3 and requests):

#!/usr/bin/env python3
import argparse

import boto3
import requests
from botocore.exceptions import ClientError

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Transfer files from a Hugging Face repository to S3")
parser.add_argument("--repo-owner", required=True, help="Owner of the Hugging Face repository")
parser.add_argument("--repo-name", required=True, help="Name of the Hugging Face repository")
parser.add_argument("--branch", default="main", help="Branch of the Hugging Face repository (default: main)")
parser.add_argument("--s3-bucket", required=True, help="Name of the S3 bucket")
args = parser.parse_args()

# Hugging Face repository details
repo_owner = args.repo_owner
repo_name = args.repo_name
branch = args.branch

# S3 bucket details
s3_bucket_name = args.s3_bucket

# Create an S3 client
s3_client = boto3.client("s3")

# Create the S3 bucket if it doesn't exist
try:
    s3_client.head_bucket(Bucket=s3_bucket_name)
    print(f"S3 bucket '{s3_bucket_name}' already exists")
except ClientError as e:
    if e.response["Error"]["Code"] == "404":
        s3_client.create_bucket(Bucket=s3_bucket_name)
        print(f"S3 bucket '{s3_bucket_name}' created successfully")
    else:
        raise

# Hugging Face Hub tree API endpoint (lists the files on a branch)
api_url = f"https://huggingface.co/api/models/{repo_owner}/{repo_name}/tree/{branch}"

def download_file(file_path):
    """Download one file's raw content from the Hub; return None on failure."""
    file_url = f"https://huggingface.co/{repo_owner}/{repo_name}/resolve/{branch}/{file_path}"
    response = requests.get(file_url)
    if response.status_code == 200:
        return response.content
    print(f"Failed to download file: {file_path}")
    return None

def upload_to_s3(file_path, file_content):
    """Upload file content to S3 under the same key as its repository path."""
    try:
        s3_client.put_object(Body=file_content, Bucket=s3_bucket_name, Key=file_path)
        print(f"Uploaded file to S3: {file_path}")
    except Exception as e:
        print(f"Failed to upload file to S3: {file_path}")
        print(f"Error: {e}")

def process_files(files):
    """Transfer files and recurse into directories."""
    for file in files:
        file_path = file["path"]
        if file["type"] == "file":
            file_content = download_file(file_path)
            if file_content is not None:
                upload_to_s3(file_path, file_content)
        elif file["type"] == "directory":
            response = requests.get(f"{api_url}/{file_path}")
            if response.status_code == 200:
                process_files(response.json())
            else:
                print(f"Failed to retrieve subfolder: {file_path}")

def main():
    response = requests.get(api_url)
    if response.status_code == 200:
        process_files(response.json())
    else:
        print("Failed to retrieve repository files")

if __name__ == "__main__":
    main()
JacobFV commented Apr 21, 2024:

e.g., python huggingface_to_s3.py --repo-owner HuggingFaceFW --repo-name finewe --s3-bucket my-finewe-dataset