Created
May 21, 2024 08:08
-
-
Save wmvanvliet/f833a76aa9efec3f21232f3cbd760d03 to your computer and use it in GitHub Desktop.
Script to request a node through SLURM and create an interactive TMUX session on it
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Batch script submitted through sbatch: starts a detached tmux session on the
# node running the job and then keeps the job alive so a user can attach to
# the session.

# Start tmux in detached mode. The session is named after the SLURM job id so
# that it can be addressed as "slurm<jobid>" from outside the job.
# If tmux cannot be started there is nothing to attach to, so end the job
# right away instead of sleeping for hours without a session.
tmux new -d -s "slurm$SLURM_JOB_ID" || exit

# Make job wait for user to connect
sleep 8h
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# This will submit a batch script that starts tmux on a node.
# Then ssh is used to connect to the node and attach the tmux.
#
# All command line arguments are passed on to sbatch unchanged.

# Directory containing this script (symlinks resolved).
MYDIR="$(dirname -- "$(readlink -f -- "${BASH_SOURCE[0]}")")"

# Batch Script that starts TMUX
BS="$MYDIR/_interactive"

# Default partition (Change this to suit your site!)
# The user can specify another partition explicitly, sbatch will take
# the last one on the command line.
DEFAULTPART=interactive

# Submit the job and get the job id.
# --parsable makes sbatch print only "jobid[;cluster]", so the exit status
# checked below is really the one of sbatch and no text scraping is needed.
# "$@" is quoted so that arguments containing whitespace reach sbatch intact.
JOB=$(sbatch --parsable --output=/dev/null --error=/dev/null \
  -p "$DEFAULTPART" "$@" "$BS")
sbatch_status=$?
if [ "$sbatch_status" -ne 0 ]; then
  exit "$sbatch_status"
fi

# Drop the optional ";cluster" suffix and make sure a job id is left.
JOB=${JOB%%;*}
if [[ ! "$JOB" =~ ^[0-9]+$ ]]; then
  echo "Could not determine the job id from the sbatch output ($JOB)" >&2
  exit 1
fi

# Make sure the job is always canceled.
# The cancel runs exactly once, from the EXIT trap; the signal traps only
# turn the signal into a regular exit (which then fires the EXIT trap).
cancel_job() {
  scancel --quiet "$JOB"
}
trap cancel_job EXIT
trap 'exit 130' SIGINT
trap 'exit 143' SIGTERM

echo "Waiting for JOBID $JOB to start"
while true; do
  sleep 1

  # Check job status (compact state code, e.g. R, PD, CF)
  STATUS=$(squeue -j "$JOB" -t all -h -o %t)
  case "$STATUS" in
    R)
      # Job is running, leave the while loop
      break
      ;;
    PD | CF)
      # Still pending or configuring: keep waiting
      ;;
    *)
      echo "Job is not Running or Pending ($STATUS). Aborting" >&2
      # The EXIT trap cancels the job.
      exit 1
      ;;
  esac
  echo -n "."
done
# Terminate the line of progress dots.
echo

# Determine the first node in the job (the BatchHost field of scontrol).
job_info=$(scontrol show job "$JOB")
if [[ "$job_info" =~ BatchHost=([^[:space:]]+) ]]; then
  NODE=${BASH_REMATCH[1]}
else
  echo "Could not determine the batch host of job $JOB" >&2
  exit 1
fi

# SSH to the node and attach tmux.
# Short pause before connecting (kept from the original script; presumably
# gives the batch script time to create the tmux session).
sleep 1
ssh -Y -t "$NODE" tmux a -t "slurm$JOB"

# The trap will now cancel the job before exiting.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment