Source: https://code.ill.fr/relaxse/relaxse-code

Installation:

  • MPI+OMP version: /software/relaxse/2022.10/bin/
  • OMP only version: /software/relaxse/2022.10-omp/bin/

Setup:

module load maxwell gcc/9.3 openmpi/3.1.6
./configure --omp --mpi --gnu --build-type production
cd build
make -j 8 VERBOSE=1
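
The OMP-only installation under /software/relaxse/2022.10-omp/ was presumably configured the same way but without the --mpi switch; a sketch of the assumed build (an assumption, not taken from the actual build logs):

module load maxwell gcc/9.3 openmpi/3.1.6
./configure --omp --gnu --build-type production
cd build
make -j 8 VERBOSE=1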

Summary

  • Single-node jobs almost always fail or take forever, except when running the OMP-only binary
  • Jobs on 2 nodes also take forever (the 2-node test ran into the time limit)
  • Using N MPI ranks requires much more memory than using N OMP threads
  • Jobs on nodes with Intel Gold-6140/Gold-6240 are much slower than jobs on nodes with EPYC 7543/75F3
  • Using 8 or more nodes brings little additional runtime benefit
  • Using more OMP threads than physical cores per node slows jobs down
  • The temporary files produced by relaxse are huge and can easily reach a few hundred GB; the home directory is certainly not suited as working directory (see the sketch after this list)
  • relaxse has a "mem reduce" option, which slightly reduces memory usage (~20%, see the test runs below) at the cost of increased runtime and I/O
  • OMP options like OMP_PLACES=cores OMP_SCHEDULE="DYNAMIC,1" don't seem to have an impact
  • SASS_MEM doesn't seem to be used at all; there is also no reference to SASS_MEM in the code
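
Because the scratch files are so large, the working directory should live on BeeGFS or GPFS rather than under $HOME. A minimal sketch, assuming a BeeGFS user directory of the form /beegfs/desy/user/$USER (the exact mount point is an assumption, adjust to your setup):

# hypothetical scratch location on BeeGFS -- never use the home directory
export WorkDir=/beegfs/desy/user/$USER/relaxse/$SLURM_JOB_ID
mkdir -p $WorkDir
# keep an eye on the temporary files, they can reach a few hundred GB
du -sh $WorkDir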

Best option

  • use 1 MPI rank per node
  • use as many OMP threads as there are physical cores: $(( $(nproc) / 2 )) (a more explicit variant is sketched after this list)
  • use EPYC 75F3 or EPYC 7543 nodes
  • use 4-6 nodes per job
  • use BeeGFS or, better, GPFS as working directory
  • see the sample script below
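
The $(( $(nproc) / 2 )) formula assumes SMT/hyper-threading is enabled with two hardware threads per core, as the sample script below does. A hedged alternative that counts physical cores directly from lscpu (not part of the original setup):

# unique (core,socket) pairs = physical cores, independent of the SMT setting
physcores=$(lscpu -p=Core,Socket | grep -v '^#' | sort -u | wc -l)
export OMP_NUM_THREADS=$physcores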


Test runs

Several test runs are not listed because they failed, ran into the time limit, exceeded the memory limit, etc. The plots show the CPU and memory consumption of the master node of each job. To watch your own jobs: https://max-portal.desy.de/webjobs/ (DESY network only). For completed jobs, use j<jobid> in the Username field.

CPU         Memory  #cores  #nodes  #mpi/node  #omp/node  memreduce  runtime   state  jobid     comment
EPYC 7643   1TB     128     1       24         1          yes        65:25:35  ok     13143778
EPYC 7643   1TB     128     1       24         4          yes        25:59:00  ok     13144152
EPYC 7643   1TB     128     3       1          64         no         15:14:52  ok     13149586
EPYC 7643   1TB     128     1       1          64         no         -         time   13154408
Gold-6240   768G    36      4       1          36         no         29:01:01  ok     13151534  WD: GPFS
Gold-6240   768G    36      4       1          36         no         29:01:39  ok     13162462  WD: BeeGFS
Gold-6240   768G    36      6       1          36         no         18:56:40  ok     13162464
Gold-6240   768G    36      8       1          36         no         14:52:48  ok     13162463
EPYC 75F3   512G    64      1       1          64         no         44:00:00  time   13157512  mpi+omp binary
EPYC 75F3   512G    64      1       1          64         no         62:00:00  ok     13165664  omp binary
EPYC 75F3   512G    64      2       1          64         no         44:00:00  time   13157511
EPYC 75F3   512G    64      3       1          64         no         16:11:12  ok     13157510
EPYC 7543   1TB     64      3       1          64         no         15:29:05  ok     13149708
EPYC 75F3   512G    64      4       1          64         no         12:45:28  ok     13154402
EPYC 75F3   512G    64      6       1          64         no         09:34:38  ok     13157773
EPYC 75F3   512G    64      6       1          64         yes        10:23:11  ok     13262741
EPYC 75F3   512G    64      6       1          32         yes        15:48:41  ok     13273419
EPYC 75F3   512G    64      6       1          128        yes        >13h      slow   13262950
EPYC 75F3   512G    64      8       1          64         no         08:37:10  ok     13159577

Sample script:

#!/bin/bash
#SBATCH --job-name=relaxse
#SBATCH --nodes=6
#SBATCH --constraint='[75F3|7543]'
#SBATCH --time=1-00:00:00       
#SBATCH --partition=maxcpu,allcpu
#SBATCH --output=%x.o%j       
#SBATCH --error=%x.o%j        
unset LD_PRELOAD
source /etc/profile.d/modules.sh

module purge
module load maxwell gcc/9.3 openmpi/3.1.6

export BINDIR=/software/relaxse/2022.10/bin
export binary=$BINDIR/relaxse          # assumption: adjust to the actual executable name in $BINDIR
#
#  mpi setup
#
export UCX_LOG_LEVEL=error
export OMPI_MCA_pml=ucx
export OMPI_MCA_opal_warn_on_missing_libcuda=0
export OMPI_MCA_mpi_cuda_support=0
#
#  environment
#
export  JOBID=$SLURM_JOB_ID
export  OMP_STACKSIZE=2G                       
export  CURDIR=$PWD
export  SAVEDIR=$CURDIR/save
export  INPDIR=$CURDIR/input
export  Project=relaxse-75f3
export  RestartDIR=$SAVEDIR
export  RestartFile=relaxse.restart
export  calc=sass
#
#  collect some generic job-info
#
/software/tools/bin/sjobinfo

echo "---------------- Job Info ----------------"
for e in JOBID Project calc CURDIR BINDIR INPDIR SAVEDIR RestartDIR RestartFile OMP_STACKSIZE OMP_PLACES OMP_SCHEDULE; do
    printf "%-20s: %s \n" $e ${!e}
done
#
#  function to setup input
#
function populate_workdir {
    case $calc in
	sass)
	    cp $INPDIR/$Project.sass.in          $WorkDir/INPUT
	    cp $INPDIR/$Project.sass.ref0        $WorkDir/$Project.ref0
	    cp $INPDIR/$Project.TraOne           $WorkDir/$Project.TraOne 
	    cp $INPDIR/$Project.TraInt           $WorkDir/$Project.TraInt
	    ;;
	restart)
	    cp $INPDIR/$Project.sass.in          $WorkDir/INPUT
	    cp $INPDIR/$Project.sass.ref0        $WorkDir/$Project.ref0
	    cp $SAVEDIR/$Project.TraOne          $WorkDir/$Project.TraOne 
	    cp $SAVEDIR/$Project.TraInt          $WorkDir/$Project.TraInt
	    cp $RestartDIR/$RestartFile          $WorkDir/$Project.restart
	    ;;
    esac
}
#
#  sass_mem doesn't seem to be used, set to 0
#
export SASS_MEM=$(( 0*1024 )) # 1024=1g
printf "%-20s: %s \n" SASS_MEM $SASS_MEM

#
#  memreduce=yes reduces memory consumption but creates gigantic temporary files
#
memreduce=no

#
#  number of OMP threads (physical cores per node) and MPI ranks per node
#
threads=$(( $(nproc) / 2 ))
np=1
export OMP_NUM_THREADS=$threads

#
#  prepare workdir
#
cd $CURDIR
export  WorkDir=$CURDIR/$JOBID.$np.$threads
mkdir -p   $WorkDir
printf "%-20s: %s \n" WorkDir $WorkDir
populate_workdir
cd $WorkDir

# 
#  patch input file
#
perl -pi -e "s|sizebatch =.*|sizebatch = ${threads}|" INPUT
if [[ $memreduce == "yes" ]]; then
    perl -pi -e "s|mem_reduction = .false.,|mem_reduction = .true.,|" INPUT
fi

mpirun -N $np $binary   >& $CURDIR/$Project.sass.$JOBID.log

#
#  collect some generic info
#
/software/tools/bin/spostinfo
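
Assuming the script above is saved as relaxse.sh (the filename is arbitrary), it is submitted and monitored in the usual SLURM way:

sbatch relaxse.sh
squeue -u $USER                # job state
tail -f relaxse.o<jobid>       # SLURM output file (--output=%x.o%j)

The actual relaxse output ends up in $Project.sass.<jobid>.log in the submit directory.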

Attachments:

j13157510.png (image/png)
j13157773.png (image/png)
j13162464.png (image/png)
j13157511.png (image/png)
j13154408.png (image/png)
j13144152.png (image/png)
j13165664.png (image/png)
j13262741.png (image/png)
j13157512.png (image/png)
j13262950.png (image/png)
j13273419.png (image/png)