Script to install Spark on Debian 12
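A quick way to grab and run the wizard below (a rough sketch; the install_spark.sh filename and raw URL are assumptions, so copy the raw link of the first file in this gist instead):

# Filename and raw URL are placeholders; use the "Raw" button on the wizard file to get the real link
wget -O install_spark.sh "https://gist.githubusercontent.com/airtonzanon/d0f9ee584ed1e4367d6b7feccf3a6237/raw/install_spark.sh"
chmod +x install_spark.sh
./install_spark.sh   # the script then asks for the Scala/Spark versions, the role, and the master IP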
#!/bin/bash
echo -e "\n #####################"
echo "Install Spark Script"
echo "Done with massive help from Mistral"
echo "This wizard has not been fully tested yet; I'll give it a proper run later"
echo -e "Put together from a bunch of searching around, so it might not be the best way of installing or using Spark \n"
# Function to check if the user is root
is_root() {
    if [ "$(id -u)" -eq 0 ]; then
        return 0
    else
        return 1
    fi
}
# Function to set the environment variable
set_env_variable() {
    echo "export SPARK_ROLE=$1" >> ~/.bashrc
    echo "export SPARK_MASTER_IP=$2" >> ~/.bashrc
}
# Ask for the version of Scala and Spark
read -p "Enter the version of Scala (default: 2.13.4): " scala_version
scala_version=${scala_version:-2.13.4}
read -p "Enter the version of Spark (default: 3.5.3): " spark_version
spark_version=${spark_version:-3.5.3}
# Ask if it's a master or worker machine and for the IP address of the master
read -p "Is this a master or worker machine? (master/worker): " role
read -p "Enter the IP address of the master: " master_ip
set_env_variable "$role" "$master_ip"
# Check if the user is root
if is_root; then
    sudo_cmd=""
else
    sudo_cmd="sudo"
fi
# Update and install dependencies
$sudo_cmd apt update
$sudo_cmd apt upgrade -y
$sudo_cmd apt install curl wget git default-jdk software-properties-common python3 python3-pip python3-venv vim -y
# Download and install Scala
wget https://www.scala-lang.org/files/archive/scala-$scala_version.deb
$sudo_cmd dpkg -i scala*.deb
# Download and install Spark
wget https://dlcdn.apache.org/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.tgz
tar -zxvf spark-$spark_version-bin-hadoop3.tgz
$sudo_cmd mv spark-$spark_version-bin-hadoop3 /opt/spark
$sudo_cmd cp /opt/spark/conf/spark-env.sh.template /opt/spark/conf/spark-env.sh
# Configure Spark environment (tee -a so the append also works when /opt/spark is root-owned)
echo "export JAVA_HOME=/usr/lib/jvm/default-java" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_MEMORY=5G" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_CORES=2" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
echo "export SPARK_DAEMON_MEMORY=1G" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_CORES=2" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_MEMORY=5G" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
echo "export SPARK_DRIVER_MEMORY=1G" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
echo "SPARK_MASTER_HOST=$master_ip" | $sudo_cmd tee -a /opt/spark/conf/spark-env.sh
# Set Spark environment variables (escape \$PATH so the literal variable is written, not the current value)
echo "export SPARK_HOME=/opt/spark" >> ~/.profile
echo "export PATH=\$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.profile
echo "export SPARK_HOME=/opt/spark" >> ~/.bashrc
echo "export PATH=\$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.bashrc
# Download the make-spark script
$sudo_cmd wget https://gist.githubusercontent.com/airtonzanon/d0f9ee584ed1e4367d6b7feccf3a6237/raw/a4eefe14813428e6b1c015a96a9d029ae66d10e9/run.sh -O /opt/spark/bin/make-spark
$sudo_cmd chmod +x /opt/spark/bin/make-spark
echo "You might need to run 'source ~/.bashrc' and 'source ~/.profile'"
echo "You can use make-spark to start or stop Spark services with the following commands:"
echo "To start Spark services: make-spark start"
echo "To stop Spark services: make-spark stop"
#!/bin/bash
echo -e "\n #####################"
echo "Install Spark Script"
echo -e "Put together from a bunch of searching around, so it might not be the best way of installing or using Spark \n"
apt update
apt upgrade -y
apt install curl wget git default-jdk software-properties-common python3 python3-pip python3-venv vim -y
# python3 -m venv .venv
# source .venv/bin/activate
wget https://www.scala-lang.org/files/archive/scala-2.13.4.deb
dpkg -i scala*.deb
# install maven dependencies
# mvn dependency:copy-dependencies
wget https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
tar -zxvf spark-3.5.3-bin-hadoop3.tgz
mv spark-3.5.3-bin-hadoop3 /opt/spark
cp /opt/spark/conf/spark-env.sh.template /opt/spark/conf/spark-env.sh
echo "export JAVA_HOME=/usr/lib/jvm/default-java" >> /opt/spark/conf/spark-env.sh | |
echo "export SPARK_WORKER_MEMORY=5G" >> /opt/spark/conf/spark-env.sh | |
echo "export SPARK_WORKER_CORES=2" >> /opt/spark/conf/spark-env.sh | |
echo "export SPARK_DAEMON_MEMORY=1G" >> /opt/spark/conf/spark-env.sh | |
echo "export SPARK_EXECUTOR_CORES=2" >> /opt/spark/conf/spark-env.sh | |
echo "export SPARK_EXECUTOR_MEMORY=5G" >> /opt/spark/conf/spark-env.sh | |
echo "export SPARK_DRIVER_MEMORY=1G" >> /opt/spark/conf/spark-env.sh | |
echo "SPARK_MASTER_HOST=YOUR_MASTER_IP_HERE" >> /opt/spark/conf/spark-env.sh | |
echo "export SPARK_HOME=/opt/spark" >> ~/.profile | |
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.profile | |
source ~/.profile | |
echo "export SPARK_HOME=/opt/spark" >> ~/.bashrc | |
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.bashrc | |
source ~/.bashrc | |
echo "To start primary: /opt/spark/sbin/start-master.sh" | |
echo "To start worker: /opt/spark/sbin/start-worker.sh spark://PRIMARY_IP:7077" |
#!/bin/bash
echo -e "\n #####################"
echo "Script to start and stop Spark"
echo "Done with massive help from Mistral"
echo -e "There might be better ways of doing this, but this is how I'm doing it for now :P \n"
# Function to check if the environment variable is set
check_env_variable() {
    if grep -q "export SPARK_ROLE=" ~/.bashrc && grep -q "export SPARK_MASTER_IP=" ~/.bashrc; then
        return 0
    else
        return 1
    fi
}
# Function to set the environment variable
set_env_variable() {
    echo "export SPARK_ROLE=$1" >> ~/.bashrc
    echo "export SPARK_MASTER_IP=$2" >> ~/.bashrc
}
# Function to find and run the start Hadoop services script
start_hadoop_services() {
    if [ -x /opt/hadoop/sbin/start-dfs.sh ] && [ -x /opt/hadoop/sbin/start-yarn.sh ]; then
        /opt/hadoop/sbin/start-dfs.sh
        /opt/hadoop/sbin/start-yarn.sh
    elif [ -x /usr/local/hadoop/sbin/start-dfs.sh ] && [ -x /usr/local/hadoop/sbin/start-yarn.sh ]; then
        /usr/local/hadoop/sbin/start-dfs.sh
        /usr/local/hadoop/sbin/start-yarn.sh
    else
        echo "Hadoop services start scripts not found or not executable. Ignoring."
    fi
}
# Function to find and run the stop Hadoop services script
stop_hadoop_services() {
    if [ -x /opt/hadoop/sbin/stop-dfs.sh ] && [ -x /opt/hadoop/sbin/stop-yarn.sh ]; then
        /opt/hadoop/sbin/stop-dfs.sh
        /opt/hadoop/sbin/stop-yarn.sh
    elif [ -x /usr/local/hadoop/sbin/stop-dfs.sh ] && [ -x /usr/local/hadoop/sbin/stop-yarn.sh ]; then
        /usr/local/hadoop/sbin/stop-dfs.sh
        /usr/local/hadoop/sbin/stop-yarn.sh
    else
        echo "Hadoop services stop scripts not found or not executable. Ignoring."
    fi
}
# Function to find and run the start Spark services script
start_spark_services() {
    if [ "$SPARK_ROLE" == "master" ]; then
        start_hadoop_services
        if [ -x /opt/spark/sbin/start-master.sh ] && [ -x /opt/spark/sbin/start-worker.sh ]; then
            /opt/spark/sbin/start-master.sh
            /opt/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        elif [ -x /usr/local/spark/sbin/start-master.sh ] && [ -x /usr/local/spark/sbin/start-worker.sh ]; then
            /usr/local/spark/sbin/start-master.sh
            /usr/local/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        else
            echo "Spark services start scripts not found or not executable. Ignoring."
        fi
    elif [ "$SPARK_ROLE" == "worker" ]; then
        if [ -x /opt/spark/sbin/start-worker.sh ]; then
            /opt/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        elif [ -x /usr/local/spark/sbin/start-worker.sh ]; then
            /usr/local/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        else
            echo "Spark worker start script not found or not executable. Ignoring."
        fi
    else
        echo "Invalid SPARK_ROLE value in .bashrc. Please set it to 'master' or 'worker'."
        exit 1
    fi
}
# Function to find and run the stop Spark services script
stop_spark_services() {
    if [ "$SPARK_ROLE" == "master" ]; then
        stop_hadoop_services
        if [ -x /opt/spark/sbin/stop-master.sh ] && [ -x /opt/spark/sbin/stop-worker.sh ]; then
            /opt/spark/sbin/stop-master.sh
            /opt/spark/sbin/stop-worker.sh
        elif [ -x /usr/local/spark/sbin/stop-master.sh ] && [ -x /usr/local/spark/sbin/stop-worker.sh ]; then
            /usr/local/spark/sbin/stop-master.sh
            /usr/local/spark/sbin/stop-worker.sh
        else
            echo "Spark services stop scripts not found or not executable. Ignoring."
        fi
    elif [ "$SPARK_ROLE" == "worker" ]; then
        if [ -x /opt/spark/sbin/stop-worker.sh ]; then
            /opt/spark/sbin/stop-worker.sh
        elif [ -x /usr/local/spark/sbin/stop-worker.sh ]; then
            /usr/local/spark/sbin/stop-worker.sh
        else
            echo "Spark worker stop script not found or not executable. Ignoring."
        fi
    else
        echo "Invalid SPARK_ROLE value in .bashrc. Please set it to 'master' or 'worker'."
        exit 1
    fi
}
# Check if the environment variables are already set
if ! check_env_variable; then
    # Ask the user to specify the role
    read -p "Is this a master or worker machine? (master/worker): " role
    read -p "Enter the IP address of the master: " master_ip
    set_env_variable "$role" "$master_ip"
    echo "Environment variables have been set. Please run 'source ~/.bashrc' and then rerun this script with the desired action (start/stop)."
    exit 1
fi
# Proceed with the desired action
if [ "$1" == "start" ]; then
    start_spark_services
elif [ "$1" == "stop" ]; then
    stop_spark_services
else
    echo "Invalid argument. Please use 'start' or 'stop'."
    exit 1
fi
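Day-to-day usage of the wrapper, once SPARK_ROLE and SPARK_MASTER_IP are in ~/.bashrc, is just start/stop; jps (installed with default-jdk) is a handy way to confirm the daemons are actually running:

make-spark start   # starts Hadoop (if present) and the Spark master/worker for this machine's role
jps                # should list a Master and/or Worker Java process
make-spark stop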
export JAVA_HOME=/usr/lib/jvm/default-java
# Worker configuration
export SPARK_WORKER_MEMORY=14G  # Increased from 10G
export SPARK_WORKER_CORES=2
export SPARK_WORKER_INSTANCES=2
# Memory settings
export SPARK_EXECUTOR_MEMORY=14G  # Match this with your SparkSession config
export SPARK_DRIVER_MEMORY=4G  # Increased from 2G
export SPARK_DAEMON_MEMORY=2G
# GC settings
export SPARK_EXECUTOR_OPTS="-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:G1HeapRegionSize=16M -XX:+HeapDumpOnOutOfMemoryError"
export SPARK_DRIVER_OPTS="-XX:+UseG1GC -XX:G1HeapRegionSize=16M"
# Off-heap memory settings
export SPARK_EXECUTOR_OPTS="$SPARK_EXECUTOR_OPTS -XX:MaxDirectMemorySize=1G"
export SPARK_MASTER_HOST=
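Since the SPARK_EXECUTOR_MEMORY comment above says to match the SparkSession config, the same values can also be pinned on the submit side; a sketch with the equivalent spark-submit flags (MASTER_IP and your_app.py are placeholders):

/opt/spark/bin/spark-submit \
  --master spark://MASTER_IP:7077 \
  --conf spark.executor.memory=14g \
  --conf spark.executor.cores=2 \
  --conf spark.driver.memory=4g \
  your_app.py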