Last active
February 13, 2019 12:35
-
-
Save avigail-oron/aab91769d5805bfb9a8e9c31f01dfcb8 to your computer and use it in GitHub Desktop.
spark on yarn
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build Spark with YARN support, without bundled Hadoop dependencies.
# (-Phadoop-provided prevents classpath clashes when distributing the
# Spark assembly jar over the YARN cluster.)
build/mvn -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests clean package

# Alternative: create a binary Spark distribution tarball with the same profiles.
# NOTE(review): the original author reports this did not work for them — verify before relying on it.
# ./make-distribution.sh --name "some-name" --tgz -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests
# Set up the same user on the Spark client machine as on the cluster,
# so jobs are submitted under a user YARN recognizes.
sudo addgroup hadoop
sudo adduser --ingroup hadoop hduser
sudo usermod -aG sudo hduser   # give hduser sudo rights
su hduser                      # switch to the new user for the remaining steps
# Spark needs the YARN/Hadoop client configuration to run on YARN.
# Copy the conf dir (etc/hadoop) from the hadoop/yarn cluster, then point
# YARN_CONF_DIR at the local copy — absolute path, down to the last dir level.
# Adjust the path below to wherever you placed your copy:
export YARN_CONF_DIR="/usr/local/hadoop/etc/hadoop"

# Spark relies on Hadoop jars even on the client. Since Spark was built with
# -Phadoop-provided (to avoid collisions on the executors), the local Hadoop
# artifacts must be supplied via SPARK_DIST_CLASSPATH.
# The '*' wildcards are JVM classpath wildcards and must stay literal
# (globbing does not apply inside a variable assignment).
hadoop_home="/usr/local/hadoop"   # adjust to your Hadoop install location
export SPARK_DIST_CLASSPATH="${hadoop_home}/etc/hadoop:${hadoop_home}/share/hadoop/common/lib/*:${hadoop_home}/share/hadoop/common/*:${hadoop_home}/share/hadoop/hdfs:${hadoop_home}/share/hadoop/hdfs/lib/*:${hadoop_home}/share/hadoop/hdfs/*:${hadoop_home}/share/hadoop/mapreduce/lib/*:${hadoop_home}/share/hadoop/mapreduce/*:${hadoop_home}/share/hadoop/yarn/lib/*:${hadoop_home}/share/hadoop/yarn/*"
# Make sure the ResourceManager, NameNodes and DataNodes are running
# (run `jps` on the cluster machines to check).

# Run the SparkPi example on the local machine, using 2 CPU threads.
# 'local[2]' is quoted so the brackets are never glob-expanded by the shell.
./bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master 'local[2]' \
  --deploy-mode client \
  --executor-memory 2G \
  --num-executors 2 \
  examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100

# Run the same example on the YARN cluster described by the hadoop config
# files that YARN_CONF_DIR points to:
./bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master yarn \
  --deploy-mode client \
  --executor-memory 2G \
  --num-executors 2 \
  examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment