Created
September 6, 2017 17:03
-
-
Save mrchristine/6de92c7d08fae0ab1082bd5ebf0d37c6 to your computer and use it in GitHub Desktop.
spark-submit transient run example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Submit a transient spark-submit job to Databricks for each jar on the
# command line: upload the jar to DBFS, then create a one-off run via the
# Jobs API (runs/submit) and print the run's status.
#
# Usage: script [-h] [-s spark_version] jar [jar ...]
#   -h  list the supported Spark versions and exit
#   -s  Databricks runtime version (default: 3.2.x-scala2.11)
#
# NOTE(review): curl -n reads credentials from ~/.netrc — assumed configured.
set -euo pipefail

usage="Add jars to the input arguments to specify the spark job. -h list the supported spark versions"
RUNTIME_VERSION="3.2.x-scala2.11"
NODE_TYPE="r3.xlarge"
# Hoisted loop-invariant constants; API base avoids repeating the host URL.
readonly API_BASE="https://myenv.cloud.databricks.com/api/2.0"
readonly DBFS_DIR="/home/myuser/jars/"

while getopts ':hs:' option; do
  case "$option" in
    h) echo "$usage"
       curl -s -n "${API_BASE}/clusters/spark-versions" | jq .
       exit
       ;;
    s) RUNTIME_VERSION=$OPTARG
       ;;
    :) printf "option -%s requires an argument\n" "$OPTARG" >&2
       echo "$usage" >&2
       exit 1
       ;;
    \?) printf "illegal option: -%s\n" "$OPTARG" >&2
       echo "$usage" >&2
       exit 1
       ;;
  esac
done
shift $((OPTIND - 1))

# Fail loudly when no jars are given instead of silently doing nothing.
if (( $# == 0 )); then
  echo "$usage" >&2
  exit 1
fi

## 2 parts to use spark-submit within Databricks running locally.
for jar in "$@"; do
  fname=$(basename -- "$jar")
  printf 'Path: %s\n' "$jar"
  printf 'Filename: %s\n' "$fname"
  # 1. Upload library using DBFS to a specific directory.
  #    No extra "/" here: DBFS_DIR already ends with one, and this path must
  #    match the dbfs: path referenced in the submit payload below exactly
  #    (the original inserted a "/" on upload only, producing ".../jars//x.jar").
  curl -n \
    -F contents=@"${jar}" -F path="${DBFS_DIR}${fname}" -F overwrite="true" \
    "${API_BASE}/dbfs/put"
  echo "Spark Version: $RUNTIME_VERSION"
  echo "DBFS Jar Path: dbfs:${DBFS_DIR}${fname}"
  spark_submit_args=$(cat << EOF
{
  "run_name": "Miklos Spark Submit Run Now Job",
  "new_cluster" : {
    "spark_version": "$RUNTIME_VERSION",
    "node_type_id": "$NODE_TYPE",
    "num_workers": 1 },
  "email_notifications":
    {"on_start": [],"on_success": [],"on_failure": []},
  "timeout_seconds": 3600,
  "max_retries": 1,
  "spark_submit_task":
    {"parameters": [ "--conf", "spark.driver.maxResultSize=5g",
      "--class","org.apache.spark.examples.SparkPi",
      "dbfs:${DBFS_DIR}${fname}", "10"]}}
EOF
)
  # 2. Use the DBFS path you uploaded to in part 1
  job_run=$(curl -X POST -s -n -H 'Content-Type:application/json' \
    -d "$spark_submit_args" "${API_BASE}/jobs/runs/submit")
  # The submit response body is {"run_id": N}, which is exactly the request
  # body runs/get expects — so it is passed through verbatim.
  run_status=$(curl -X GET -s -n -H 'Content-Type:application/json' \
    -d "$job_run" "${API_BASE}/jobs/runs/get")
  echo "Run job id: $job_run"
  echo "$run_status" | jq .
done
printf '\nCompleted!\n'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment