Setup Hadoop3 cluster
OS: ubuntu 16.04
//To run on each node:
//====================
//Prerequisites
//-------------
sudo apt install default-jdk
//verify java is installed
java -version
//create a dedicated user for hadoop
sudo addgroup hadoop
sudo adduser --ingroup hadoop hduser
sudo usermod -aG sudo hduser
//configure ssh, used by hadoop for inter-cluster communication
//generate an SSH key for the hduser user:
su - hduser
ssh-keygen -t rsa -P ""
//enable passwordless ssh via the generated key
cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
//test the connection
ssh localhost
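//if ssh localhost still prompts for a password, the .ssh permissions are a likely cause; tightening them usually helps:
chmod 700 $HOME/.ssh
chmod 600 $HOME/.ssh/authorized_keys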
//Install Hadoop
//--------------
//Download a hadoop binary distro from here: http://www.apache.org/dyn/closer.cgi/hadoop/core
//Or copy your own built distro
//In either case, place the extracted folder under /usr/local as /usr/local/hadoop and make hduser its owner:
sudo chown -R hduser:hadoop /usr/local/hadoop
//edit /usr/local/hadoop/etc/hadoop/hadoop-env.sh and set JAVA_HOME to point to /usr
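//For example, the whole install step might look like this (the 3.1.1 version and mirror URL are just placeholders; use whatever release you downloaded or built):
cd /usr/local
sudo wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
sudo tar -xzf hadoop-3.1.1.tar.gz
sudo mv hadoop-3.1.1 hadoop
sudo chown -R hduser:hadoop /usr/local/hadoop
echo 'export JAVA_HOME=/usr' >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh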
//HDFS
//-----
sudo mkdir -p /app/hadoop/tmp
sudo chown hduser:hadoop /app/hadoop/tmp
cd /usr/local/hadoop
//edit etc/hadoop/core-site.xml and add inside the <configuration></configuration> section:
<property>
  <name>hadoop.tmp.dir</name>
  <value>/app/hadoop/tmp</value>
  <description>A base for other temporary directories.</description>
</property>
<property>
  <name>fs.default.name</name>
  <value>hdfs://localhost:54310</value>
  <description>The name of the default file system. A URI whose
  scheme and authority determine the FileSystem implementation. The
  uri's scheme determines the config property (fs.SCHEME.impl) naming
  the FileSystem implementation class. The uri's authority is used to
  determine the host, port, etc. for a filesystem.</description>
</property>
//Add to the same place in etc/hadoop/mapred-site.xml:
<property>
  <name>mapred.job.tracker</name>
  <value>localhost:54311</value>
  <description>The host and port that the MapReduce job tracker runs
  at. If "local", then jobs are run in-process as a single map
  and reduce task.
  </description>
</property>
//And in etc/hadoop/hdfs-site.xml:
<property>
  <name>dfs.replication</name>
  <value>1</value>
  <description>Default block replication.
  The actual number of replications can be specified when the file is created.
  The default is used if replication is not specified in create time.
  </description>
</property>
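//(optional) a quick sanity check that the values were picked up, e.g.:
bin/hdfs getconf -confKey fs.default.name
bin/hdfs getconf -confKey dfs.replication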
//format the HDFS namenode (run as hduser):
/usr/local/hadoop/bin/hadoop namenode -format
//start HDFS:
sbin/start-dfs.sh
//start yarn:
sbin/start-yarn.sh
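//the web UIs should come up as well; in Hadoop 3 the NameNode UI defaults to port 9870 and the ResourceManager UI to port 8088:
//  http://localhost:9870
//  http://localhost:8088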
//run the jps command to see whether the hadoop java processes are up & running.
//you should have the following on each node:
NameNode
SecondaryNameNode
NodeManager
DataNode
ResourceManager
//Run a MapReduce job to test this node
//--------------------------------------
//download a book to use as input:
sudo mkdir /tmp/gutenberg
cd /tmp/gutenberg
sudo wget http://www.gutenberg.org/files/5000/5000-8.txt
//copy the text file to HDFS:
cd /usr/local/hadoop
bin/hadoop fs -mkdir -p /user/hduser/gutenberg
bin/hadoop fs -put /tmp/gutenberg /user/hduser/
//run the WordCount example job. replace the version number if needed:
bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.0-SNAPSHOT.jar wordcount /user/hduser/gutenberg /user/hduser/gutenberg-output
//view the result file in HDFS:
bin/hadoop fs -cat /user/hduser/gutenberg-output/part-r-00000
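//you can also list the output directory, or pull the results back to the local filesystem (any local destination path works):
bin/hadoop fs -ls /user/hduser/gutenberg-output
bin/hadoop fs -get /user/hduser/gutenberg-output/part-r-00000 /tmp/wordcount-result.txt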
//Join the 2 nodes to function as a cluster
//------------------------------------------
//stop all hadoop processes on all nodes
sbin/stop-all.sh
//check with jps that all hadoop processes are down
//decide which machine will be the master and which the slave
//in my case:
//master is 192.168.122.187
//slave is 192.168.122.178
//allow the master to ssh to the slave without a password.
//To do so, run the following command on the master (replace with your slave IP):
ssh-copy-id -i $HOME/.ssh/id_rsa.pub [email protected]
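//a quick check that the key was accepted (should print the slave's hostname without prompting for a password):
ssh [email protected] hostname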
//on all nodes in the cluster, edit the following files:
etc/hadoop/core-site.xml
//change the value of fs.default.name to point at the master's IP
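//for example, with the master IP used below:
<property>
  <name>fs.default.name</name>
  <value>hdfs://192.168.122.187:54310</value>
</property>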
etc/hadoop/yarn-site.xml
//make sure the configuration block includes this section (replace with your master IP):
<configuration>
  <!-- Site specific YARN configuration properties -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>192.168.122.187</value>
  </property>
</configuration>
etc/hadoop/mapred-site.xml
//make sure the configuration block looks as follows, replacing the value with your master IP:
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>192.168.122.187:54311</value>
    <description>The host and port that the MapReduce job tracker runs
    at. If "local", then jobs are run in-process as a single map
    and reduce task.
    </description>
  </property>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
//edit the workers file and list all workers, one per line (the master is listed as well, since it also runs a worker)
//in my case:
192.168.122.178
192.168.122.187
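//a quick way to write that file, assuming the same layout under /usr/local/hadoop (adjust the IPs to your own nodes):
printf '192.168.122.178\n192.168.122.187\n' > /usr/local/hadoop/etc/hadoop/workers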
//now format the HDFS file system:
//run on the master only:
bin/hadoop namenode -format
//start all hadoop processes
//launch on the master:
sbin/start-dfs.sh
sbin/start-yarn.sh
//verify using jps that all processes are launched across the cluster as expected.
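//a couple of commands that can help confirm both nodes actually joined the cluster:
bin/hdfs dfsadmin -report
bin/yarn node -list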
//Resources:
//----------
Old guide on setting up a single-node cluster: http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-single-node-cluster/
Old guide on transforming several single-node clusters into a multi-node cluster: http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-multi-node-cluster/
Newer article on single-node cluster setup: https://www.digitalocean.com/community/tutorials/how-to-install-hadoop-in-stand-alone-mode-on-ubuntu-16-04
Official Apache guide on single-node setup: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html
Official Apache guide on multi-node setup: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/ClusterSetup.html