@airtonzanon
Last active May 12, 2025 11:40
Script to install Spark on Debian 12
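A quick usage sketch (the filename install-spark.sh is hypothetical, just a name for the first script below): run it with bash and answer the prompts for the Scala/Spark versions, this machine's role, and the master's IP.
# Hypothetical invocation of the installer below
bash install-spark.sh
# Reload the files it appended to before using the spark commands
source ~/.profile && source ~/.bashrc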
#!/bin/bash
echo "\n #####################"
echo "Install Spark Script"
echo "Done with massive help of Mistral"
echo "This is a wizard that it was not tested 100% yet, gonna try it later"
echo "Done with a bunch of search around, might not be the best way of installing or using spark \n"
# Function to check if the user is root
is_root() {
    if [ "$(id -u)" -eq 0 ]; then
        return 0
    else
        return 1
    fi
}
# Function to set the environment variable
set_env_variable() {
    echo "export SPARK_ROLE=$1" >> ~/.bashrc
    echo "export SPARK_MASTER_IP=$2" >> ~/.bashrc
}
# Ask for the version of Scala and Spark
read -p "Enter the version of Scala (default: 2.13.4): " scala_version
scala_version=${scala_version:-2.13.4}
read -p "Enter the version of Spark (default: 3.5.3): " spark_version
spark_version=${spark_version:-3.5.3}
# Ask if it's a master or worker machine and for the IP address of the master
read -p "Is this a master or worker machine? (master/worker): " role
read -p "Enter the IP address of the master: " master_ip
set_env_variable "$role" "$master_ip"
# Check if the user is root
if is_root; then
    sudo_cmd=""
else
    sudo_cmd="sudo"
fi
# Update and install dependencies
$sudo_cmd apt update
$sudo_cmd apt upgrade -y
$sudo_cmd apt install curl wget git default-jdk software-properties-common python3 python3-pip python3-venv vim -y
# Download and install Scala
wget https://www.scala-lang.org/files/archive/scala-$scala_version.deb
$sudo_cmd dpkg -i scala*.deb
# Download and install Spark
wget https://dlcdn.apache.org/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.tgz
tar -zxvf spark-$spark_version-bin-hadoop3.tgz
$sudo_cmd mv spark-$spark_version-bin-hadoop3 /opt/spark
$sudo_cmd cp /opt/spark/conf/spark-env.sh.template /opt/spark/conf/spark-env.sh
# Configure Spark environment
echo "export JAVA_HOME=/usr/lib/jvm/default-java" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DAEMON_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DRIVER_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "SPARK_MASTER_HOST=$SPARK_MASTER_IP" >> /opt/spark/conf/spark-env.sh
# Set Spark environment variables
echo "export SPARK_HOME=/opt/spark" >> ~/.profile
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.profile
echo "export SPARK_HOME=/opt/spark" >> ~/.bashrc
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.bashrc
# Download the make-spark script
wget https://gist.githubusercontent.com/airtonzanon/d0f9ee584ed1e4367d6b7feccf3a6237/raw/a4eefe14813428e6b1c015a96a9d029ae66d10e9/run.sh -O /opt/spark/bin/make-spark
$sudo_cmd chmod +x /opt/spark/bin/make-spark
echo "You might need to run 'source ~/.bashrc' and 'source ~/.profile'"
echo "You can use it to start or stop Spark services with the following commands:"
echo "To start Spark services: make-spark start"
echo "To stop Spark services: make-spark stop"
#!/bin/bash
echo "\n #####################"
echo "Install Spark Script"
echo "Done with a bunch of search around, might not be the best way of installing or using spark \n"
apt update
apt upgrade -y
apt install curl wget git default-jdk software-properties-common python3 python3-pip python3-venv vim -y
# python3 -m venv .venv
# source .venv/bin/activate
wget https://www.scala-lang.org/files/archive/scala-2.13.4.deb
dpkg -i scala*.deb
# install maven dependencies
# mvn dependency:copy-dependencies
wget https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
tar -zxvf spark-3.5.3-bin-hadoop3.tgz
mv spark-3.5.3-bin-hadoop3 /opt/spark
cp /opt/spark/conf/spark-env.sh.template /opt/spark/conf/spark-env.sh
echo "export JAVA_HOME=/usr/lib/jvm/default-java" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DAEMON_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DRIVER_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "SPARK_MASTER_HOST=YOUR_MASTER_IP_HERE" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_HOME=/opt/spark" >> ~/.profile
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.profile
source ~/.profile
echo "export SPARK_HOME=/opt/spark" >> ~/.bashrc
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.bashrc
source ~/.bashrc
echo "To start primary: /opt/spark/sbin/start-master.sh"
echo "To start worker: /opt/spark/sbin/start-worker.sh spark://PRIMARY_IP:7077"
#!/bin/bash
echo "\n #####################"
echo "Script to run and stop Spark"
echo "Done with massive help of Mistral"
echo "There might be better ways of doing this, but this is how I'm doing now :P \n"
# Function to check if the environment variable is set
check_env_variable() {
    if grep -q "export SPARK_ROLE=" ~/.bashrc && grep -q "export SPARK_MASTER_IP=" ~/.bashrc; then
        return 0
    else
        return 1
    fi
}
# Function to set the environment variable
set_env_variable() {
    echo "export SPARK_ROLE=$1" >> ~/.bashrc
    echo "export SPARK_MASTER_IP=$2" >> ~/.bashrc
}
# Function to find and run the start Hadoop services script
start_hadoop_services() {
    if [ -x /opt/hadoop/sbin/start-dfs.sh ] && [ -x /opt/hadoop/sbin/start-yarn.sh ]; then
        /opt/hadoop/sbin/start-dfs.sh
        /opt/hadoop/sbin/start-yarn.sh
    elif [ -x /usr/local/hadoop/sbin/start-dfs.sh ] && [ -x /usr/local/hadoop/sbin/start-yarn.sh ]; then
        /usr/local/hadoop/sbin/start-dfs.sh
        /usr/local/hadoop/sbin/start-yarn.sh
    else
        echo "Hadoop services start scripts not found or not executable. Ignoring."
    fi
}
# Function to find and run the stop Hadoop services script
stop_hadoop_services() {
    if [ -x /opt/hadoop/sbin/stop-dfs.sh ] && [ -x /opt/hadoop/sbin/stop-yarn.sh ]; then
        /opt/hadoop/sbin/stop-dfs.sh
        /opt/hadoop/sbin/stop-yarn.sh
    elif [ -x /usr/local/hadoop/sbin/stop-dfs.sh ] && [ -x /usr/local/hadoop/sbin/stop-yarn.sh ]; then
        /usr/local/hadoop/sbin/stop-dfs.sh
        /usr/local/hadoop/sbin/stop-yarn.sh
    else
        echo "Hadoop services stop scripts not found or not executable. Ignoring."
    fi
}
# Function to find and run the start Spark services script
start_spark_services() {
    if [ "$SPARK_ROLE" == "master" ]; then
        start_hadoop_services
        if [ -x /opt/spark/sbin/start-master.sh ] && [ -x /opt/spark/sbin/start-worker.sh ]; then
            /opt/spark/sbin/start-master.sh
            /opt/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        elif [ -x /usr/local/spark/sbin/start-master.sh ] && [ -x /usr/local/spark/sbin/start-worker.sh ]; then
            /usr/local/spark/sbin/start-master.sh
            /usr/local/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        else
            echo "Spark services start scripts not found or not executable. Ignoring."
        fi
    elif [ "$SPARK_ROLE" == "worker" ]; then
        if [ -x /opt/spark/sbin/start-worker.sh ]; then
            /opt/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        elif [ -x /usr/local/spark/sbin/start-worker.sh ]; then
            /usr/local/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        else
            echo "Spark worker start script not found or not executable. Ignoring."
        fi
    else
        echo "Invalid SPARK_ROLE value in .bashrc. Please set it to 'master' or 'worker'."
        exit 1
    fi
}
# Function to find and run the stop Spark services script
stop_spark_services() {
    if [ "$SPARK_ROLE" == "master" ]; then
        stop_hadoop_services
        if [ -x /opt/spark/sbin/stop-master.sh ] && [ -x /opt/spark/sbin/stop-worker.sh ]; then
            /opt/spark/sbin/stop-master.sh
            /opt/spark/sbin/stop-worker.sh
        elif [ -x /usr/local/spark/sbin/stop-master.sh ] && [ -x /usr/local/spark/sbin/stop-worker.sh ]; then
            /usr/local/spark/sbin/stop-master.sh
            /usr/local/spark/sbin/stop-worker.sh
        else
            echo "Spark services stop scripts not found or not executable. Ignoring."
        fi
    elif [ "$SPARK_ROLE" == "worker" ]; then
        if [ -x /opt/spark/sbin/stop-worker.sh ]; then
            /opt/spark/sbin/stop-worker.sh
        elif [ -x /usr/local/spark/sbin/stop-worker.sh ]; then
            /usr/local/spark/sbin/stop-worker.sh
        else
            echo "Spark worker stop script not found or not executable. Ignoring."
        fi
    else
        echo "Invalid SPARK_ROLE value in .bashrc. Please set it to 'master' or 'worker'."
        exit 1
    fi
}
# Check if the environment variables are already set
if ! check_env_variable; then
    # Ask the user to specify the role
    read -p "Is this a master or worker machine? (master/worker): " role
    read -p "Enter the IP address of the master: " master_ip
    set_env_variable "$role" "$master_ip"
    echo "Environment variables have been set. Please run 'source ~/.bashrc' and then rerun this script with the desired action (start/stop)."
    exit 1
fi
# Proceed with the desired action
if [ "$1" == "start" ]; then
start_spark_services
elif [ "$1" == "stop" ]; then
stop_spark_services
else
echo "Invalid argument. Please use 'start' or 'stop'."
exit 1
fi
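Assuming the script above is installed as /opt/spark/bin/make-spark (the path the installer uses) and that directory is on PATH, day-to-day usage is just:
# Start or stop everything this node is responsible for
make-spark start
jps              # optional: confirm the Master/Worker (and Hadoop) daemons came up
make-spark stop
The exports below read like a spark-env.sh tuned for a larger worker node (14G executors, two worker instances); they can replace the defaults the install script writes to /opt/spark/conf/spark-env.sh.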
export JAVA_HOME=/usr/lib/jvm/default-java
# Worker configuration
export SPARK_WORKER_MEMORY=14G # Increased from 10G
export SPARK_WORKER_CORES=2
export SPARK_WORKER_INSTANCES=2
# Memory settings
export SPARK_EXECUTOR_MEMORY=14G # Match this with your SparkSession config
export SPARK_DRIVER_MEMORY=4G # Increased from 2G
export SPARK_DAEMON_MEMORY=2G
# GC settings
export SPARK_EXECUTOR_OPTS="-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:G1HeapRegionSize=16M -XX:+HeapDumpOnOutOfMemoryError"
export SPARK_DRIVER_OPTS="-XX:+UseG1GC -XX:G1HeapRegionSize=16M"
# Off-heap memory settings
export SPARK_EXECUTOR_OPTS="$SPARK_EXECUTOR_OPTS -XX:MaxDirectMemorySize=1G"
export SPARK_MASTER_HOST= # left empty here; set it to this master's IP or hostname
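The note on SPARK_EXECUTOR_MEMORY says to match it with the SparkSession config; one way to keep the application side in sync is to pass the same values at submit time (the flags are standard Spark configuration keys, your_app.py is a hypothetical application, and the values simply mirror the env file above).
# Submit-time settings mirroring the spark-env.sh above
/opt/spark/bin/spark-submit \
  --master spark://$SPARK_MASTER_IP:7077 \
  --conf spark.executor.memory=14g \
  --conf spark.driver.memory=4g \
  --conf spark.executor.extraJavaOptions="-XX:+UseG1GC -XX:G1HeapRegionSize=16M -XX:MaxDirectMemorySize=1G" \
  your_app.py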