Maxwell : Automatic Restart after failure

Jobs might get preempted, or job commands might fail for various reasons. If you know that the code should work and suspect hardware problems instead, you can have your job automatically restarted after a failure, excluding the currently used hosts (for example). A sample script:

#!/bin/bash
#SBATCH --partition=maxcpu
#SBATCH --time=1-00:00:00
#SBATCH --nodes=4
#SBATCH --job-name=restart-sample
#SBATCH --output=restart-sample.out
#SBATCH --error=restart-sample.err
#SBATCH --constraint='[75F3|V4&E5-2698|Gold-6240|7402|7642]'
# constraints guarantee that all nodes are reasonably similar. only needed for multi-node jobs
unset LD_PRELOAD 

#
# MPI jobs typically run faster when using only physical cores
# using only physical cores also reduces stress on the cluster file systems
#
np=$(( $(nproc) / 2 ))  # only use physical cores
# for multi-node jobs, particularly on newer hardware, you'll need the following flags
mpiflags="--mca pml ucx"

#
#  set up the MPI environment. This of course needs to be modified according to your needs
#  starting without modules loaded is always a good idea
#
source /etc/profile.d/modules.sh
module purge
module load mpi/openmpi-x86_64

#
# not really needed; it just adds some job information to Slurm's output
#
sjobinfo

#
#  just gather some information about the job, which is (partially) used in term_handler
#
scontext=$(scontrol show job $SLURM_JOB_ID)
restarts=$(echo "$scontext" | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
outfile=$(echo "$scontext"  | grep 'StdOut='       | cut -d= -f2)
errfile=$(echo "$scontext"  | grep 'StdErr='       | cut -d= -f2)
timelimit=$(echo "$scontext" | grep -o 'TimeLimit=.*' | awk '{print $1}' | cut -d= -f2)
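
# optional addition (not required for the restart logic): print the gathered values
# so they end up in the job output for later reference
echo "JobID=$SLURM_JOB_ID Restarts=$restarts TimeLimit=$timelimit"
echo "StdOut=$outfile StdErr=$errfile"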

#
# when enabling automatic restart of jobs, you don't want endless restarts, so impose a limit (used below), e.g.
#
max_restarts=3

#
# the function term_handler takes care of job restarts
# it's invoked when a job gets terminated (i.e. preempted, killed or timed out) - optionally
# it's invoked when a job step fails - optionally
# Note: after a restart, the job will pend for ~3 minutes with reason BeginTime. That's a good time to cancel the job for good if needed
#
term_handler()
{
    echo "restarting job at $(date) - Restart: $restarts "
    spostinfo # see below

    if [[ $restarts -lt $max_restarts ]]; then
       # copy the logfiles; the originals will be overwritten by the next run
       cp -v "$outfile" "$outfile.$restarts"
       cp -v "$errfile" "$errfile.$restarts"

       # requeue the job and put it on hold. It's not possible to change the partition otherwise
       scontrol requeuehold $SLURM_JOB_ID

       # exclude nodes used in this job just in case
       scontrol update JobID=$SLURM_JOB_ID ExcNodeList=$SLURM_NODELIST

       # release the job. It will wait in the queue for 3 minutes before the next run can start
       scontrol release $SLURM_JOB_ID
    fi
}

# 
#  when a job gets killed, preempted or runs into a timeout, the job script gets a TERM signal
#  trapping the signal will invoke term_handler, so the job gets restarted
#  just remove the trap if you don't want automatic restarts
# 
trap 'term_handler' TERM
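# note: bash runs the trap handler only after the current foreground command (mpirun) has returned,
# and the final SIGKILL that Slurm sends after the KillWait grace period cannot be trapped,
# so term_handler should finish quickly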

#
#  run your code
#
mpirun -N $np $mpiflags my-mpi-code
# a non-zero exit code means something went wrong. invoke term_handler to restart the job
[[ $? -gt 0 ]] &&  term_handler

# do another run; note that it also starts when the first run failed but could no longer be requeued
mpirun -N $np $mpiflags my-mpi-code
[[ $? -gt 0 ]] &&  term_handler

#
# not really needed; it just adds some (post-)job information to Slurm's output. also called in term_handler above
#
spostinfo
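
The script can be submitted like any other batch job; the file name below is just an example, and my-mpi-code of course stands for your actual application:

sbatch restart-sample.sh
# after a requeue, the restart counter and the excluded nodes show up in scontrol's output
scontrol show job <jobid> | grep -E 'Restarts=|ExcNodeList='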