Last active
April 24, 2020 15:30
-
-
Save adsieg/11d4526493584de7a90a359851af86c2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## 0. PySpark environment, including necessary JAR files for accessing S3 from Spark
import os  # fix: os was used below without ever being imported

# SECURITY NOTE(review): never hardcode real AWS credentials in source code.
# Prefer an AWS profile, instance role, or environment injected by a secrets
# manager. The placeholder values are kept here only to match the tutorial.
os.environ['AWS_ACCESS_KEY_ID'] = 'xxx_access_key'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'xxx_secret'
# PySpark workers must use the same interpreter as the driver.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python2'
# Pull in the Hadoop S3A connector, the AWS SDK, and the spark-csv reader
# at spark-submit time (versions must match the cluster's Hadoop/Spark).
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.hadoop:hadoop-aws:2.7.1,'
    'com.amazonaws:aws-java-sdk-pom:1.10.34,'
    'com.databricks:spark-csv_2.11:1.3.0 pyspark-shell'
)
# 1. Connect to S3: create both a high-level resource handle and a
# low-level client (the client exposes the raw list/get API calls used below).
s3_resource = boto3.resource('s3')
s3 = boto3.client("s3")
# 2. List all buckets: print the name of every bucket these credentials own.
for bucket_info in s3.list_buckets()['Buckets']:
    print(bucket_info['Name'])
# 3. List all objects in a bucket: print every object key in the listing.
# NOTE(review): list_objects_v2 returns at most 1000 keys per call — larger
# buckets would need pagination; confirm against the bucket size.
listing = s3.list_objects_v2(Bucket='asiegmusictopspotifysongs')
for entry in listing['Contents']:
    print(entry['Key'])
# 4. Read an object from S3 into our local environment.
from io import BytesIO, StringIO

import pandas as pd

bucket_name = 'asiegmusictopspotifysongs'
file_name = 'top50contry.csv'

# get_object returns a dict of response metadata; the payload itself is the
# streaming 'Body' member.
obj = s3.get_object(Bucket=bucket_name, Key=file_name)
print(obj)
# 5. Turn the bucket's object metadata (key, size, last-modified, ...) into a
# pandas DataFrame — one row per object in the listing.
response = s3.list_objects_v2(Bucket=bucket_name)
objects_meta = response['Contents']
df = pd.DataFrame.from_dict(objects_meta)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.