Last active
April 13, 2023 23:03
-
-
Save jbenninghoff/8df42fa99914dbcad63aec79044ced79 to your computer and use it in GitHub Desktop.
Launch EMR, MR job, then terminate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# jbenninghoff@ 2023-Mar-24
# Script to run XML extraction job from cron
# Alternatively use Step Functions instead of cron:
# https://docs.aws.amazon.com/en_us/step-functions/latest/dg/sample-emr-job.html
# Or use AWS Data Pipeline:
# https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-manage-recurring.html

# Strict mode: stop on the first failing command and on a failure in any
# pipeline stage. nounset is intentionally left disabled, as in the original.
#set -o nounset; set -o errexit; set -o pipefail
set -o errexit
set -o pipefail

# AWS CLI profile used by every aws command in this script
export AWS_PROFILE=jobennin+workday

# Required variables to run this script
subnetid=subnet-057d3621ea39f57e9 # Private subnet with NAT
#subnetid=subnet-089b35dcfb410ccf1 # Must exist in your account's VPC
#subnetid=subnet-07d569c2a149af766 # Must exist in your account's VPC
key_name=jobennin+workday+usw2
conf_bucket=jb-workday-artifacts
log_bucket=jb-workday-logs
data_bucket=jobennin-emr-data
secconf=usw2-tls # Existing EMR Security Configuration
# Push the run-job-big.sh script (from the current directory) to the
# artifacts bucket so the EMR step can fetch it from S3.
aws s3 cp run-job-big.sh "s3://$conf_bucket/run-job-big.sh"
# Check for input in S3: a failing listing of the input prefix is treated as
# "input missing" and aborts the run.
if ! aws s3 ls "s3://$data_bucket/hp-mapr/input/" > /dev/null; then
  echo Input data set not in S3 >&2
  exit 1
fi
# Check for existing output in S3: if the output prefix can be listed, refuse
# to run and print the command that removes it.
if aws s3 ls "s3://$data_bucket/hp-mapr/output/" > /dev/null; then
  echo Remove output folder in S3 >&2
  echo "Run: aws s3 rm s3://$data_bucket/hp-mapr/output/ --recursive" >&2
  exit 1
fi
# Check for required instance profile and associated roles.

# Succeeds when the EMR_EC2_DefaultRole instance profile shows up in the IAM
# listing. The listing is captured first instead of piped into `grep -q`:
# under `set -o pipefail`, grep exiting on its first match can make the
# pipeline report failure even though the profile exists.
has_emr_instance_profile() {
  local profiles
  profiles=$(aws iam list-instance-profiles --output json 2>&1) || return 1
  grep -q 'instance-profile/EMR_EC2_DefaultRole' <<< "$profiles"
}

if ! has_emr_instance_profile; then
  # Create the default EMR roles; the CLI output (stdout and stderr) is kept
  # in a local file for later inspection.
  aws emr create-default-roles >& jb-EMR-XMLx-default-roles.json
  # Instance profile sometimes takes time to appear: poll every 5 seconds,
  # giving up after 12 attempts instead of failing on a single early check.
  profile_wait_attempts=0
  until has_emr_instance_profile; do
    profile_wait_attempts=$((profile_wait_attempts + 1))
    if (( profile_wait_attempts >= 12 )); then
      echo "Instance profile EMR_EC2_DefaultRole not found after create-default-roles" >&2
      exit 1
    fi
    sleep 5
  done
fi
#######################################
# Instance Group config for EMR cluster.
# Arguments: $1 - number of TASK (worker) instances; defaults to 1
# Outputs:   JSON array of TASK, CORE and MASTER instance groups on stdout
# Returns:   0 on success, 1 if $1 is not a non-negative integer
#######################################
igroupConf() {
  local task_count=${1:-1}
  # The count is spliced into JSON unquoted, so reject anything non-numeric
  # rather than emit a malformed document.
  if [[ ! "$task_count" =~ ^[0-9]+$ ]]; then
    echo "igroupConf: TASK instance count must be a non-negative integer, got '$task_count'" >&2
    return 1
  fi
  cat << EOF1
[
{"InstanceCount":${task_count},
"InstanceGroupType":"TASK","InstanceType":"m6g.8xlarge","Name":"Worker nodes"
},
{"InstanceCount":3,
"InstanceGroupType":"CORE","InstanceType":"m6g.8xlarge","Name":"Core nodes"
},
{"InstanceCount":1,
"InstanceGroupType":"MASTER","InstanceType":"m6g.4xlarge","Name":"Master nodes"
}
]
EOF1
}
#######################################
# Add EMR configuration file settings.
# Arguments: none
# Outputs:   JSON array of EMR configuration classifications on stdout
#######################################
emrConfig() {
  # Graceful timeout, set at 3hrs, required for managed scaling
  # 3hrs could be set lower (saves more money) but needs testing to be safe
  # and sure
  # Quoted delimiter: the JSON is static, so no expansion is wanted.
  cat << 'EOF1'
[
{
"Classification":"emrfs-site",
"Properties":{
"fs.s3.maxConnections": "10000"
}
},
{
"Classification":"yarn-site",
"Properties":{
"yarn.resourcemanager.nodemanager-graceful-decommission-timeout-secs": "10800"
}
}
]
EOF1
}
#######################################
# Managed scaling policy for the cluster.
# Use with: --managed-scaling-policy "$(managedScaling)"
# Allows cluster to scale down as Maps are done, creates upper bound as well.
# Arguments: none
# Outputs:   JSON object with the ComputeLimits settings on stdout
#######################################
managedScaling() {
  # Quoted delimiter: the JSON is static, so no expansion is wanted.
  cat << 'EOF'
{
"ComputeLimits": {
"MinimumCapacityUnits": 5,
"MaximumCapacityUnits": 84,
"MaximumCoreCapacityUnits": 3,
"MaximumOnDemandCapacityUnits": 84,
"UnitType": "Instances"
}
}
EOF
}
#######################################
# Run XML Extraction script uploaded to S3.
# Globals:   conf_bucket (read), AWS_DEFAULT_REGION (read)
# Arguments: none
# Outputs:   JSON array with a single CUSTOM_JAR step on stdout
# Returns:   0 on success, 1 if no AWS region can be determined
#######################################
xmlExtract() {
  # The script-runner jar lives in a region-named bucket. When
  # AWS_DEFAULT_REGION is unset, fall back to the region configured for the
  # AWS CLI rather than silently emitting "s3://.elasticmapreduce/...".
  local region=${AWS_DEFAULT_REGION:-}
  if [[ -z "$region" ]]; then
    region=$(aws configure get region 2> /dev/null) || region=""
  fi
  if [[ -z "$region" ]]; then
    echo "xmlExtract: no AWS region; set AWS_DEFAULT_REGION or configure one for the AWS CLI" >&2
    return 1
  fi
  cat << EOF3
[
{
"Name": "XML_EXTRACT",
"Args": ["s3://$conf_bucket/run-job-big.sh"],
"Jar": "s3://$region.elasticmapreduce/libs/script-runner/script-runner.jar",
"ActionOnFailure": "CONTINUE",
"Type": "CUSTOM_JAR"
}
]
EOF3
  # Alternative step arguments kept for reference:
  # "Args": ["s3://$conf_bucket/tls_emr_svcs.sh", "${1:-trino}"],
}
# Build every JSON argument up front. A plain assignment from a command
# substitution propagates the generator's exit status, so under errexit a
# failing generator stops the script instead of handing the CLI an empty
# string.
instance_groups_json=$(igroupConf "${1:-5}") # $1: TASK node count, default 5
scaling_policy_json=$(managedScaling)
configurations_json=$(emrConfig)
steps_json=$(xmlExtract)

# Launch the cluster, run the XML extraction step, then auto-terminate.
aws emr create-cluster --name "jb-HP-XML-big" \
  --release-label emr-6.10.0 \
  --applications Name=Hadoop Name=Ganglia \
  --ec2-attributes "KeyName=$key_name,SubnetId=$subnetid" \
  --instance-groups "$instance_groups_json" \
  --managed-scaling-policy "$scaling_policy_json" \
  --ebs-root-volume-size 100 \
  --configurations "$configurations_json" \
  --security-configuration "$secconf" \
  --steps "$steps_json" \
  --auto-terminate \
  --enable-debugging \
  --log-uri "s3://$log_bucket/" \
  --use-default-roles
# Optional create-cluster flags, kept for reference. The quoted heredoc fed to
# the no-op ':' builtin serves as a block comment: nothing in it is expanded
# or executed.
: << '--BLOCK-COMMENT--'
--auto-terminate \
--configurations "$(emrConfig)" \
--managed-scaling-policy "$(managedScaling)"
--enable-debugging \
--log-uri "s3://$log_bucket/" \
--BLOCK-COMMENT--
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment