Last active
February 13, 2019 12:35
-
-
Save avigail-oron/aab91769d5805bfb9a8e9c31f01dfcb8 to your computer and use it in GitHub Desktop.
spark on yarn
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build Spark with YARN support, without bundled Hadoop dependencies.
# (-Phadoop-provided prevents classpath clashes when distributing the
# Spark assembly jar over the YARN cluster.)
build/mvn -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests clean package

# Alternative: create a binary Spark distribution tarball with the same profiles.
# NOTE(review): the original author reports this did not work for them — verify before relying on it.
# ./make-distribution.sh --name "some-name" --tgz -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests
# Set up the same user on the Spark client machine as on the cluster,
# so jobs are submitted under a user YARN recognizes.
sudo addgroup hadoop
sudo adduser --ingroup hadoop hduser
sudo usermod -aG sudo hduser   # give hduser sudo rights
su hduser                      # switch to the new user for the remaining steps
# Spark needs the YARN/Hadoop client configuration to run on YARN.
# Copy the conf dir (etc/hadoop) from the hadoop/yarn cluster, then point
# YARN_CONF_DIR at the local copy — absolute path, down to the last dir level.
# Adjust the path below to wherever you placed your copy:
export YARN_CONF_DIR="/usr/local/hadoop/etc/hadoop"

# Spark relies on Hadoop jars even on the client. Since Spark was built with
# -Phadoop-provided (to avoid collisions on the executors), the local Hadoop
# artifacts must be supplied via SPARK_DIST_CLASSPATH.
# The '*' wildcards are JVM classpath wildcards and must stay literal
# (globbing does not apply inside a variable assignment).
hadoop_home="/usr/local/hadoop"   # adjust to your Hadoop install location
export SPARK_DIST_CLASSPATH="${hadoop_home}/etc/hadoop:${hadoop_home}/share/hadoop/common/lib/*:${hadoop_home}/share/hadoop/common/*:${hadoop_home}/share/hadoop/hdfs:${hadoop_home}/share/hadoop/hdfs/lib/*:${hadoop_home}/share/hadoop/hdfs/*:${hadoop_home}/share/hadoop/mapreduce/lib/*:${hadoop_home}/share/hadoop/mapreduce/*:${hadoop_home}/share/hadoop/yarn/lib/*:${hadoop_home}/share/hadoop/yarn/*"
# Make sure the ResourceManager, NameNodes and DataNodes are running
# (run `jps` on the cluster machines to check).

# Run the SparkPi example on the local machine, using 2 CPU threads.
# 'local[2]' is quoted so the brackets are never glob-expanded by the shell.
./bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master 'local[2]' \
  --deploy-mode client \
  --executor-memory 2G \
  --num-executors 2 \
  examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100

# Run the same example on the YARN cluster described by the hadoop config
# files that YARN_CONF_DIR points to:
./bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master yarn \
  --deploy-mode client \
  --executor-memory 2G \
  --num-executors 2 \
  examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment