Last active: February 15, 2019, 19:28
Save ozydingo/f88581400e0ec010887ecf3b5c0d3137 to your computer and use it in GitHub Desktop.
SageMaker getting started tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[1]: | |
# import libraries | |
import boto3, re, sys, math, json, os, sagemaker, urllib.request | |
from sagemaker import get_execution_role | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from IPython.display import Image | |
from IPython.display import display | |
from time import gmtime, strftime | |
from sagemaker.predictor import csv_serializer | |
# Define IAM role
# The execution role grants this notebook/SageMaker access to AWS resources.
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'

# Region -> ECR image URI: each region hosts its own managed XGBoost container.
containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
}  # each region has its XGBoost container

# Detect the region this instance runs in and pick the matching container.
my_region = boto3.session.Session().region_name  # set the region of the instance
print("Success - the MySageMakerInstance is in the " + my_region
      + " region. You will use the " + containers[my_region]
      + " container for your SageMaker endpoint.")
# In[2]:
# Create s3 resources
# Create the bucket that will hold training data and model artifacts.
bucket_name = 'andrew-brain-power-sagemaker-demo'  # <--- change this variable to a unique name for your bucket
s3 = boto3.resource('s3')
try:
    # us-east-1 is the default location and must not receive an explicit
    # LocationConstraint; every other region requires one.
    if my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print('S3 bucket created successfully')
except Exception as e:
    # Best-effort for the tutorial: report the failure and continue
    # (e.g. the bucket may already exist).
    print('S3 error: ', e)
# In[3]:
# Download data
# Fetch the prepared tutorial dataset, then load it into a DataFrame.
try:
    urllib.request.urlretrieve("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ', e)

try:
    # First CSV column is the row index.
    model_data = pd.read_csv('./bank_clean.csv', index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ', e)
# In[12]:
# Split data into test, train sets
# Shuffle reproducibly (fixed seed), then take the first 70% as training data.
shuffled = model_data.sample(frac=1, random_state=1729)
cut = int(0.7 * len(model_data))
train_data, test_data = np.split(shuffled, [cut])
print(train_data.shape, test_data.shape)
print(train_data.describe())
# In[13]:
# Format and upload data to s3
# Put the label column (y_yes) first and drop the one-hot complement (y_no);
# the CSV is written without header or index, then staged under <prefix>/train.
features = train_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([train_data['y_yes'], features], axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
# Channel definition pointing the training job at the uploaded CSV.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')
# In[14]:
# Create sagemaker session
# Configure the XGBoost estimator: container image, IAM role, one m4.xlarge
# training instance, and the S3 location for the resulting model artifacts.
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',  # binary classifier, probability output
    num_round=100,
)

# In[15]:
# fit the model
# Launches the training job against the 'train' channel; blocks until done.
xgb.fit({'train': s3_input_train})
# In[16]:
# deploy a prediction server
# Stand up a real-time inference endpoint backed by the trained model.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)
# In[17]:
# Prepare test data, predict
# Fix: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
# 1.0 -- to_numpy() is the documented replacement with identical output.
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).to_numpy()  # load the data into an array
xgb_predictor.content_type = 'text/csv'  # set the data type for an inference
xgb_predictor.serializer = csv_serializer  # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')  # predict!
# The endpoint returns one comma-separated string of probabilities.
# predictions[1:] skips the leading '0' of the first value (".03" still
# parses as 0.03) -- kept for behavior compatibility with the original.
# Fix: np.fromstring(..., sep=',') is deprecated; parse explicitly instead.
predictions_array = np.array(predictions[1:].split(','), dtype=np.float64)
print(predictions_array.shape)
# In[32]:
# Validate results
# Cross-tabulate observed labels against rounded predicted probabilities to
# build a 2x2 confusion matrix, then print a formatted summary.
cm = pd.crosstab(
    index=test_data['y_yes'],
    columns=np.round(predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)
tn = cm.iloc[0, 0]
fp = cm.iloc[0, 1]
fn = cm.iloc[1, 0]
tp = cm.iloc[1, 1]
p = (tp + tn) / (tp + tn + fp + fn) * 100  # overall accuracy, percent
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
# NOTE(review): percentages below are normalized per predicted column.
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn / (tn + fn) * 100, tn, fp / (tp + fp) * 100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn / (tn + fn) * 100, fn, tp / (tp + fp) * 100, tp))
# In[ ]:
### Clean up resources:
# Tear down the inference endpoint and empty the bucket so the demo stops
# incurring charges.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.