@airtonzanon
Last active May 12, 2025 11:40
Script to install Spark on Debian 12
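A quick usage sketch (the filename install-spark.sh is hypothetical, just a name for the first script below): run it with bash and answer the prompts for the Scala/Spark versions, this machine's role, and the master's IP.
# Hypothetical invocation of the installer below
bash install-spark.sh
# Reload the files it appended to before using the spark commands
source ~/.profile && source ~/.bashrc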
#!/bin/bash
echo "\n #####################"
echo "Install Spark Script"
echo "Done with massive help of Mistral"
echo "This is a wizard that it was not tested 100% yet, gonna try it later"
echo "Done with a bunch of search around, might not be the best way of installing or using spark \n"
# Function to check if the user is root
is_root() {
    if [ "$(id -u)" -eq 0 ]; then
        return 0
    else
        return 1
    fi
}
# Function to set the environment variable
set_env_variable() {
    echo "export SPARK_ROLE=$1" >> ~/.bashrc
    echo "export SPARK_MASTER_IP=$2" >> ~/.bashrc
}
# Ask for the version of Scala and Spark
read -p "Enter the version of Scala (default: 2.13.4): " scala_version
scala_version=${scala_version:-2.13.4}
read -p "Enter the version of Spark (default: 3.5.3): " spark_version
spark_version=${spark_version:-3.5.3}
# Ask if it's a master or worker machine and for the IP address of the master
read -p "Is this a master or worker machine? (master/worker): " role
read -p "Enter the IP address of the master: " master_ip
set_env_variable "$role" "$master_ip"
# Check if the user is root
if is_root; then
    sudo_cmd=""
else
    sudo_cmd="sudo"
fi
# Update and install dependencies
$sudo_cmd apt update
$sudo_cmd apt upgrade -y
$sudo_cmd apt install curl wget git default-jdk software-properties-common python3 python3-pip python3-venv vim -y
# Download and install Scala
wget https://www.scala-lang.org/files/archive/scala-$scala_version.deb
$sudo_cmd dpkg -i scala*.deb
# Download and install Spark
wget https://dlcdn.apache.org/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.tgz
tar -zxvf spark-$spark_version-bin-hadoop3.tgz
$sudo_cmd mv spark-$spark_version-bin-hadoop3 /opt/spark
$sudo_cmd cp /opt/spark/conf/spark-env.sh.template /opt/spark/conf/spark-env.sh
# Configure Spark environment
echo "export JAVA_HOME=/usr/lib/jvm/default-java" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DAEMON_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DRIVER_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "SPARK_MASTER_HOST=$SPARK_MASTER_IP" >> /opt/spark/conf/spark-env.sh
# Set Spark environment variables
echo "export SPARK_HOME=/opt/spark" >> ~/.profile
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.profile
echo "export SPARK_HOME=/opt/spark" >> ~/.bashrc
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.bashrc
# Download the make-spark script
wget https://gist.githubusercontent.com/airtonzanon/d0f9ee584ed1e4367d6b7feccf3a6237/raw/a4eefe14813428e6b1c015a96a9d029ae66d10e9/run.sh -O /opt/spark/bin/make-spark
$sudo_cmd chmod +x /opt/spark/bin/make-spark
echo "You might need to run 'source ~/.bashrc' and 'source ~/.profile'"
echo "You can use it to start or stop Spark services with the following commands:"
echo "To start Spark services: make-spark start"
echo "To stop Spark services: make-spark stop"
#!/bin/bash
echo "\n #####################"
echo "Install Spark Script"
echo "Done with a bunch of search around, might not be the best way of installing or using spark \n"
apt update
apt upgrade -y
apt install curl wget git default-jdk software-properties-common python3 python3-pip python3-venv vim -y
# python3 -m venv .venv
# source .venv/bin/activate
wget https://www.scala-lang.org/files/archive/scala-2.13.4.deb
dpkg -i scala*.deb
# install maven dependencies
# mvn dependency:copy-dependencies
wget https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
tar -zxvf spark-3.5.3-bin-hadoop3.tgz
mv spark-3.5.3-bin-hadoop3 /opt/spark
cp /opt/spark/conf/spark-env.sh.template /opt/spark/conf/spark-env.sh
echo "export JAVA_HOME=/usr/lib/jvm/default-java" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_WORKER_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DAEMON_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_CORES=2" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_EXECUTOR_MEMORY=5G" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_DRIVER_MEMORY=1G" >> /opt/spark/conf/spark-env.sh
echo "SPARK_MASTER_HOST=YOUR_MASTER_IP_HERE" >> /opt/spark/conf/spark-env.sh
echo "export SPARK_HOME=/opt/spark" >> ~/.profile
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.profile
source ~/.profile
echo "export SPARK_HOME=/opt/spark" >> ~/.bashrc
echo "export PATH=$PATH:/opt/spark/bin:/opt/spark/sbin" >> ~/.bashrc
source ~/.bashrc
echo "To start primary: /opt/spark/sbin/start-master.sh"
echo "To start worker: /opt/spark/sbin/start-worker.sh spark://PRIMARY_IP:7077"
#!/bin/bash
echo "\n #####################"
echo "Script to run and stop Spark"
echo "Done with massive help of Mistral"
echo "There might be better ways of doing this, but this is how I'm doing now :P \n"
# Function to check if the environment variable is set
check_env_variable() {
    if grep -q "export SPARK_ROLE=" ~/.bashrc && grep -q "export SPARK_MASTER_IP=" ~/.bashrc; then
        return 0
    else
        return 1
    fi
}
# Function to set the environment variable
set_env_variable() {
    echo "export SPARK_ROLE=$1" >> ~/.bashrc
    echo "export SPARK_MASTER_IP=$2" >> ~/.bashrc
}
# Function to find and run the start Hadoop services script
start_hadoop_services() {
    if [ -x /opt/hadoop/sbin/start-dfs.sh ] && [ -x /opt/hadoop/sbin/start-yarn.sh ]; then
        /opt/hadoop/sbin/start-dfs.sh
        /opt/hadoop/sbin/start-yarn.sh
    elif [ -x /usr/local/hadoop/sbin/start-dfs.sh ] && [ -x /usr/local/hadoop/sbin/start-yarn.sh ]; then
        /usr/local/hadoop/sbin/start-dfs.sh
        /usr/local/hadoop/sbin/start-yarn.sh
    else
        echo "Hadoop services start scripts not found or not executable. Ignoring."
    fi
}
# Function to find and run the stop Hadoop services script
stop_hadoop_services() {
    if [ -x /opt/hadoop/sbin/stop-dfs.sh ] && [ -x /opt/hadoop/sbin/stop-yarn.sh ]; then
        /opt/hadoop/sbin/stop-dfs.sh
        /opt/hadoop/sbin/stop-yarn.sh
    elif [ -x /usr/local/hadoop/sbin/stop-dfs.sh ] && [ -x /usr/local/hadoop/sbin/stop-yarn.sh ]; then
        /usr/local/hadoop/sbin/stop-dfs.sh
        /usr/local/hadoop/sbin/stop-yarn.sh
    else
        echo "Hadoop services stop scripts not found or not executable. Ignoring."
    fi
}
# Function to find and run the start Spark services script
start_spark_services() {
    if [ "$SPARK_ROLE" == "master" ]; then
        start_hadoop_services
        if [ -x /opt/spark/sbin/start-master.sh ] && [ -x /opt/spark/sbin/start-worker.sh ]; then
            /opt/spark/sbin/start-master.sh
            /opt/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        elif [ -x /usr/local/spark/sbin/start-master.sh ] && [ -x /usr/local/spark/sbin/start-worker.sh ]; then
            /usr/local/spark/sbin/start-master.sh
            /usr/local/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        else
            echo "Spark services start scripts not found or not executable. Ignoring."
        fi
    elif [ "$SPARK_ROLE" == "worker" ]; then
        if [ -x /opt/spark/sbin/start-worker.sh ]; then
            /opt/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        elif [ -x /usr/local/spark/sbin/start-worker.sh ]; then
            /usr/local/spark/sbin/start-worker.sh spark://$SPARK_MASTER_IP:7077
        else
            echo "Spark worker start script not found or not executable. Ignoring."
        fi
    else
        echo "Invalid SPARK_ROLE value in .bashrc. Please set it to 'master' or 'worker'."
        exit 1
    fi
}
# Function to find and run the stop Spark services script
stop_spark_services() {
    if [ "$SPARK_ROLE" == "master" ]; then
        stop_hadoop_services
        if [ -x /opt/spark/sbin/stop-master.sh ] && [ -x /opt/spark/sbin/stop-worker.sh ]; then
            /opt/spark/sbin/stop-master.sh
            /opt/spark/sbin/stop-worker.sh
        elif [ -x /usr/local/spark/sbin/stop-master.sh ] && [ -x /usr/local/spark/sbin/stop-worker.sh ]; then
            /usr/local/spark/sbin/stop-master.sh
            /usr/local/spark/sbin/stop-worker.sh
        else
            echo "Spark services stop scripts not found or not executable. Ignoring."
        fi
    elif [ "$SPARK_ROLE" == "worker" ]; then
        if [ -x /opt/spark/sbin/stop-worker.sh ]; then
            /opt/spark/sbin/stop-worker.sh
        elif [ -x /usr/local/spark/sbin/stop-worker.sh ]; then
            /usr/local/spark/sbin/stop-worker.sh
        else
            echo "Spark worker stop script not found or not executable. Ignoring."
        fi
    else
        echo "Invalid SPARK_ROLE value in .bashrc. Please set it to 'master' or 'worker'."
        exit 1
    fi
}
# Check if the environment variables are already set
if ! check_env_variable; then
    # Ask the user to specify the role
    read -p "Is this a master or worker machine? (master/worker): " role
    read -p "Enter the IP address of the master: " master_ip
    set_env_variable "$role" "$master_ip"
    echo "Environment variables have been set. Please run 'source ~/.bashrc' and then rerun this script with the desired action (start/stop)."
    exit 1
fi
# Proceed with the desired action
if [ "$1" == "start" ]; then
start_spark_services
elif [ "$1" == "stop" ]; then
stop_spark_services
else
echo "Invalid argument. Please use 'start' or 'stop'."
exit 1
fi
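Assuming the script above is installed as /opt/spark/bin/make-spark (the path the installer uses) and that directory is on PATH, day-to-day usage is just:
# Start or stop everything this node is responsible for
make-spark start
jps              # optional: confirm the Master/Worker (and Hadoop) daemons came up
make-spark stop
The exports below read like a spark-env.sh tuned for a larger worker node (14G executors, two worker instances); they can replace the defaults the install script writes to /opt/spark/conf/spark-env.sh.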
export JAVA_HOME=/usr/lib/jvm/default-java
# Worker configuration
export SPARK_WORKER_MEMORY=14G # Increased from 10G
export SPARK_WORKER_CORES=2
export SPARK_WORKER_INSTANCES=2
# Memory settings
export SPARK_EXECUTOR_MEMORY=14G # Match this with your SparkSession config
export SPARK_DRIVER_MEMORY=4G # Increased from 2G
export SPARK_DAEMON_MEMORY=2G
# GC settings
export SPARK_EXECUTOR_OPTS="-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:G1HeapRegionSize=16M -XX:+HeapDumpOnOutOfMemoryError"
export SPARK_DRIVER_OPTS="-XX:+UseG1GC -XX:G1HeapRegionSize=16M"
# Off-heap memory settings
export SPARK_EXECUTOR_OPTS="$SPARK_EXECUTOR_OPTS -XX:MaxDirectMemorySize=1G"
export SPARK_MASTER_HOST= # left empty here; set it to this master's IP or hostname
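The note on SPARK_EXECUTOR_MEMORY says to match it with the SparkSession config; one way to keep the application side in sync is to pass the same values at submit time (the flags are standard Spark configuration keys, your_app.py is a hypothetical application, and the values simply mirror the env file above).
# Submit-time settings mirroring the spark-env.sh above
/opt/spark/bin/spark-submit \
  --master spark://$SPARK_MASTER_IP:7077 \
  --conf spark.executor.memory=14g \
  --conf spark.driver.memory=4g \
  --conf spark.executor.extraJavaOptions="-XX:+UseG1GC -XX:G1HeapRegionSize=16M -XX:MaxDirectMemorySize=1G" \
  your_app.py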