To get a quick idea of the NAMD performance of the different compute node types, a few test runs based on the apoa1 benchmark (https://www.ks.uiuc.edu/Research/namd/utilities/) were done. The setup is not tuned at all, and the submission jobs are quite naive (and probably not well suited), so the numbers are not very consistent; better tailor your own benchmarks to your computational problem. The verbs-smp version seems a good choice. On AMD, use all physical cores (without hyperthreading); on Intel, using all cores (including hyperthreading) seems preferable. For this particular benchmark, the use of GPUs does not add much.
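If you just want to reproduce a single apoa1 run interactively before scripting anything, a minimal single-node sketch could look like the following. The tarball and NAMD 2.14 verbs-smp paths are the ones used in the job scripts below; the core-count heuristic and the interactive charmrun ++local launch are assumptions on my part, adjust them to your node.
unset LD_PRELOAD
mkdir -p apoa1-test && cd apoa1-test
# benchmark input and NAMD build as referenced in the job scripts below
tar xf /beegfs/desy/user/schluenz/namd/apoa1.tar.gz --strip-components=1
export PATH=/software/namd/NAMD_2.14_Linux-x86_64-verbs-smp:$PATH
# single node: all physical cores (nproc counts hyperthreads, so halve it)
NP=$(( $(nproc) / 2 ))
charmrun ++local ++p $NP ++ppn $NP $(which namd2) apoa1.namd > apoa1.out
grep WallClock apoa1.out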
CPU Job Templates
To iterate over the different CPU types available in the all partition, I used the following job-submission scriptlet (an unrolled example for a single CPU type is shown after the listing):
#!/bin/bash
unset LD_PRELOAD
# iterate over number of nodes
for n in 1 ; do
# iterate over cpu constraints
for c in 'EPYC&7402' 'EPYC&7642' Gold-6126 Gold-6140 Gold-6226 Gold-6230 Gold-6240 Silver-4114 'V4&E5-2640' 'V4&E5-2698' ; do
C=$(echo $c | sed 's|&|-|g')
DDIR="${C}-${n}"
mkdir -p "$DDIR"
perl -p -e "s|NUM_NODES|$n|g" namd_template.sh | perl -p -e "s|WD|$DDIR|g" | perl -p -e "s|CONSTRAINT|$C|g" > $DDIR/namd_$DDIR.sh
sbatch -p all -C "$c" $DDIR/namd_$DDIR.sh
done
done
exit
#
# full set of cpu-types: 'EPYC&7402' 'EPYC&7642' Gold-6126 Gold-6140 Gold-6226 Gold-6230 Gold-6240 Silver-4114 'V4&E5-2640' 'V4&E5-2698'
# only few nodes for Gold-6226 Gold-6230 'EPYC&7642'
# reduced set of cpus: 'EPYC&7402' Gold-6126 Gold-6140 Gold-6240 Silver-4114 'V4&E5-2640' 'V4&E5-2698'
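Unrolled for a single CPU type (EPYC-7402 on one node), the scriptlet performs essentially the following; the chained perl substitutions are merged into one call here, otherwise the commands are the same as above:
mkdir -p EPYC-7402-1
perl -p -e 's|NUM_NODES|1|g; s|WD|EPYC-7402-1|g; s|CONSTRAINT|EPYC-7402|g' \
    namd_template.sh > EPYC-7402-1/namd_EPYC-7402-1.sh
sbatch -p all -C 'EPYC&7402' EPYC-7402-1/namd_EPYC-7402-1.sh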
The template:
#!/bin/bash
#SBATCH --nodes=NUM_NODES
#SBATCH --time=01:00:00
#SBATCH --job-name=namd-WD
#SBATCH --output=WD.out
#SBATCH --chdir=WD
unset LD_PRELOAD
# get the sample input
tar xf /beegfs/desy/user/schluenz/namd/apoa1.tar.gz --strip-components=1
# clean modules
source /etc/profile.d/modules.sh
module purge
# create nodelist
NODELIST=nodelist.$SLURM_JOBID
rm -f $NODELIST
for n in $(scontrol show hostnames "$SLURM_NODELIST"); do
echo "host $n" >> $NODELIST
done
# use cpu version
SSH="ssh -o PubkeyAcceptedKeyTypes=+ssh-dss -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR"
if [[ $PWD =~ smp ]]; then
export PATH=/software/namd/NAMD_2.14_Linux-x86_64-verbs-smp:$PATH
else
export PATH=/software/namd/NAMD_2.14_Linux-x86_64-verbs:$PATH
fi
np=$(($(nproc) * NUM_NODES ))
# run namd
for P in 8 16 32 $(( $np / 2 )) $np ; do
PPN=$(( $P / NUM_NODES ))
charmrun ++p $P ++ppn $PPN ++nodelist $NODELIST ++remote-shell "$SSH" $(which namd2) apoa1.namd > namd.$P.out
speed=$(grep WallClock namd.$P.out)
echo "Nodes: NUM_NODES Procs: $P Constraint: CONSTRAINT $speed"
done
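Note that the template picks the verbs-smp binary whenever the working directory name contains "smp" (see the if-block above). Presumably the smp runs were generated by simply appending "smp" to the run directory name in the submission scriptlet, e.g. (illustrative variant, not from the original scriptlet):
# a directory name containing "smp" switches the template to the verbs-smp binary
DDIR="${C}-${n}-smp"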
One of the resulting batch scripts:
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --time=01:00:00
#SBATCH --job-name=namd-Gold-6240-1
#SBATCH --output=Gold-6240-1.out
#SBATCH --chdir=Gold-6240-1
unset LD_PRELOAD
# get the sample input
tar xf /beegfs/desy/user/schluenz/namd/apoa1.tar.gz --strip-components=1
# avoid module
source /etc/profile.d/modules.sh
module purge
# create nodelist
NODELIST=nodelist.$SLURM_JOBID
rm -f $NODELIST
for n in $(scontrol show hostnames "$SLURM_NODELIST"); do
echo "host $n" >> $NODELIST
done
np=$(($(nproc) * 1 ))
# run namd
for P in 8 16 32 $(( $np / 2 )) $np ; do
SSH="ssh -o PubkeyAcceptedKeyTypes=+ssh-dss -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR"
export PATH=/software/namd/NAMD_2.14_Linux-x86_64-verbs:$PATH
charmrun ++p $P ++nodelist $NODELIST ++remote-shell "$SSH" $(which namd2) apoa1.namd > namd.$P.out
speed=$(grep WallClock namd.$P.out)
echo "Nodes: 1 Procs: $P Constraint: Gold-6240 $speed"
done
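After all jobs have finished, the timings scattered over the run directories can be collected into one overview with a small loop like this (a sketch; it just reuses the WallClock grep from the job scripts):
# gather the WallClock summary lines from all run directories
for f in */namd.*.out; do
    echo "$f: $(grep WallClock "$f")"
done | sort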
GPU Job Templates
Job templates for GPUs are very similar:
#!/bin/bash
unset LD_PRELOAD
# only run on single node
for n in 1 ; do
for c in 'Gold-5115&V100&GPUx2' 'Gold-6234&V100&GPUx2' 'Gold-6248&V100&GPUx4' 'Silver-4114&P100&GPUx1' 'Silver-4114&V100&GPUx1' 'Silver-4210&V100&GPUx1' 'Silver-4216&V100&GPUx1' 'Silver-4216&V100&GPUx2' 'V4&E5-2640&K40X&GPUx2' 'V4&E5-2640&P100&GPUx1' 'V4&E5-2640&P100&GPUx2' 'V4&E5-2640&P100&GPUx4' 'V4&E5-2640&V100&GPUx1' 'V4&E5-2695&K40X&GPUx1' 'V4&E5-2698&P100&GPUx2' ; do
C=$(echo $c | sed 's|&|-|g')
DDIR="${C}-${n}"
mkdir -p "$DDIR"
perl -p -e "s|NUM_NODES|$n|g" namd_template_gpu.sh | perl -p -e "s|WD|$DDIR|g" | perl -p -e "s|CONSTRAINT|$C|g" > $DDIR/namd_$DDIR.sh
sbatch -p allgpu -C "$c" $DDIR/namd_$DDIR.sh
done
done
exit
The GPU job template:
#!/bin/bash
#SBATCH --nodes=NUM_NODES
#SBATCH --time=01:00:00
#SBATCH --job-name=namd-WD
#SBATCH --output=WD.out
#SBATCH --chdir=WD
unset LD_PRELOAD
# get the sample input
tar xf /beegfs/desy/user/schluenz/namd/apoa1.tar.gz --strip-components=1
# clean modules
source /etc/profile.d/modules.sh
module purge
# use cuda version
export PATH=/software/namd/NAMD_2.14_Linux-x86_64-multicore-CUDA:$PATH
num_gpus=$(nvidia-smi -L | wc -l)
np=$(($(nproc) * NUM_NODES ))
# run namd
for p in 1 2 4 6 8; do
P=$(( $p * $num_gpus ))
charmrun +p $P ++local $(which namd2) apoa1.namd > namd.$P.out
speed=$(grep WallClock namd.$P.out)
echo "GPUs: $num_gpus Procs: $P Constraint: CONSTRAINT $speed"
done
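For reference, the multicore-CUDA binary can also be launched directly, without charmrun; NAMD's +devices option then pins the GPU list explicitly. A hedged drop-in replacement for the charmrun line inside the loop above might look like this (it reuses the num_gpus and $P variables already defined there):
# direct launch of the multicore-CUDA binary (no charmrun); +devices takes a
# comma-separated list of CUDA device indices, here all GPUs found by nvidia-smi
DEVICES=$(seq -s, 0 $(( num_gpus - 1 )))
namd2 +p $P +devices $DEVICES apoa1.namd > namd.$P.out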
One of the resulting batch scripts:
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --time=01:00:00
#SBATCH --job-name=namd-Silver-4114-P100-GPUx1-1
#SBATCH --output=Silver-4114-P100-GPUx1-1.out
#SBATCH --chdir=Silver-4114-P100-GPUx1-1
unset LD_PRELOAD
# get the sample input
tar xf /beegfs/desy/user/schluenz/namd/apoa1.tar.gz --strip-components=1
# clean modules
source /etc/profile.d/modules.sh
module purge
# use cuda version
export PATH=/software/namd/NAMD_2.14_Linux-x86_64-multicore-CUDA:$PATH
num_gpus=$(nvidia-smi -L | wc -l)
np=$(($(nproc) * 1 ))
# run namd
for p in 1 2 4 6 8; do
P=$(( $p * $num_gpus ))
charmrun +p $P ++local $(which namd2) apoa1.namd > namd.$P.out
speed=$(grep WallClock namd.$P.out)
echo "GPUs: $num_gpus Procs: $P Constraint: Silver-4114-P100-GPUx1 $speed"
done
Results for CPU nodes
Wallclock time in seconds for the given number of threads (++p) per node; ncore = all physical cores, ncore+HT = all hardware threads. For verbs-smp runs, ++ppn = #threads / #nodes.

| CPU | #cores (HT) | #nodes | 8 | 16 | 32 | ncore | ncore+HT | version |
|---|---|---|---|---|---|---|---|---|
| EPYC-7402 | 48 (96) | 1 | 80.0 | 41.9 | 28.9 | 14.5 | 37.7 | verbs |
| | | 1 | 79.5 | 41.6 | 22.8 | 11.4 | 17.8 | verbs-smp |
| | | 2 | 42.0 | 22.7 | 14.8 | 8.6 | - | verbs |
| | | 2 | | | | 15.9 | 7.1 | verbs-smp |
| | | 4 | | | | 14.6 | | verbs |
| | | 4 | | | | 27.3 | 7.1 | verbs-smp |
| | | 8 | | | | 12.5 | | verbs |
| | | 8 | | | | 23.9 | 19.0 | verbs-smp |
| EPYC-7642 | 96 (192) | 1 | 81.4 | 42.6 | 23.1 | 16.6 | 10.9 | verbs |
| Gold-6126 | 24 (48) | 1 | 74.6 | 39.3 | 31.4 | 25.2 | 17.9 | verbs |
| | | 2 | 74.2 | 38.2 | 20.8 | 12.0 | 20.9 | verbs |
| | | 2 | | | | 24.6 | 13.9 | verbs-smp |
| | | 4 | | | | 9.4 | | verbs |
| | | 4 | | | | 16.7 | 7.7 | verbs-smp |
| | | 8 | | | | 8.4 | | verbs |
| | | 8 | | | | 15.0 | 7.1 | verbs-smp |
| Gold-6140 | 36 (72) | 1 | 79.6 | 41.5 | 23.0 | 18.1 | 25.4 | verbs |
| | | 2 | 41.2 | 22.0 | | 17.2 | 13.4 | verbs |
| | | 2 | | | | 18.7 | 9.1 | verbs-smp |
| | | 4 | | | | 14.4 | 11.6 | verbs |
| | | 4 | | | | 15.5 | 6.8 | verbs-smp |
| | | 8 | | | | 13.1 | 14.8 | verbs |
| | | 8 | | | | 13.0 | 4.3 | verbs-smp |
| Gold-6226 | 24 (48) | 1 | 71.2 | 37.5 | 30.5 | 24.0 | 16.0 | verbs |
| Gold-6230 | | 1 | | | | | | |
| Gold-6240 | 36 (72) | 1 | 75.0 | 39.1 | 21.9 | 14.6 | 23.2 | verbs |
| | | 2 | | | | 16.9 | 12.2 | verbs |
| | | 2 | | | | 17.8 | 7.7 | verbs-smp |
| | | 4 | | | | 15.4 | 30.0 | verbs |
| | | 4 | | | | 15.5 | 16.0 | verbs-smp |
| | | 8 | | | | 13.2 | 17.1 | verbs |
| | | 8 | | | | 12.5 | 5.0 | verbs-smp |
| Silver-4114 | 40 (80) | 1 | 90.5 | 47.5 | 39.1 | 43.3 | 27.0 | verbs |
| | | 2 | 47.6 | 25.8 | | 16.0 | 25.2 | verbs |
| | | 2 | | | | 19.3 | 22.4 | verbs-smp |
| | | 4 | | | | 16.9 | 11.3 | verbs |
| | | 4 | | | | 21.1 | 9.8 | verbs-smp |
| | | 8 | | | | | | verbs |
| | | 8 | | | | 17.8 | 7.6 | verbs-smp |
| V4-E5-2640 | 20 (40) | 1 | 82.5 | 44.8 | 34.1 | 31.5 | 21.9 | verbs |
| | | 2 | | | | 13.0 | 20.5 | verbs |
| | | 2 | | | | 16.4 | 19.8 | verbs-smp |
| | | 4 | | | | 15.9 | 8.7 | verbs |
| | | 4 | | | | 18.9 | 7.3 | verbs-smp |
| | | 8 | | | | 13.8 | 6.8 | verbs |
| | | 8 | | | | 15.6 | 6.2 | verbs-smp |
| V4-E5-2698 | 40 (80) | 1 | 82.4 | 43.5 | 24.4 | 17.6 | 21.7 | verbs |
| | | 2 | | | | 19.6 | 10.1 | verbs |
| | | 2 | | | | 18.9 | 8.4 | verbs-smp |
| | | 4 | | | | 14.3 | 9.1 | verbs |
| | | 4 | | | | 15.2 | 6.6 | verbs-smp |
| | | 8 | | | | 12.7 | 8.6 | verbs |
| | | 8 | | | | 13.0 | 6.7 | verbs-smp |
Best single-node runs: EPYC-7402 with 48 threads, or EPYC-7642 with 192 threads.
Best 2-node run: EPYC-7402 with 2*96 threads (verbs-smp).
Fastest time overall: Gold-6140 and Gold-6240 with all cores (incl. HT) on 8 nodes (verbs-smp).
Results for GPU nodes
Wallclock time in seconds for the given number of threads (+p) per GPU; the total thread count passed to charmrun is threads-per-GPU * #GPUs.

| CPU | GPU | #GPUs | 1 | 2 | 4 | 6 | 8 |
|---|---|---|---|---|---|---|---|
| Gold-5115 | V100 | 2 | 30.9 | 13.4 | 20.3 | 11.8 | 19.6 |
| Gold-6234 | V100 | 2 | 27.9 | 11.9 | 19.4 | 10.9 | 18.7 |
| Gold-6248 | V100 | 4 | | | | | |
| Silver-4114 | P100 | 1 | 36.4 | 16.4 | 14.1 | 13.4 | 21.8 |
| Silver-4114 | V100 | 1 | 37.4 | 17.5 | 15.2 | 14.5 | 22.6 |
| Silver-4210 | V100 | 1 | 34.6 | 15.3 | 13.1 | 12.4 | 20.0 |
| Silver-4216 | V100 | 1 | 34.2 | 15.1 | 13.0 | 12.2 | 19.9 |
| Silver-4216 | V100 | 2 | 30.7 | 13.4 | 20.3 | 11.8 | 19.6 |
| V4-E5-2640 | K40X | 2 | 31.7 | 15.2 | 23.4 | 15.3 | 24.0 |
| V4-E5-2640 | P100 | 1 | 35.3 | 15.9 | 13.8 | 13.4 | 21.5 |
| V4-E5-2640 | P100 | 2 | 32.1 | 14.9 | 22.1 | 13.1 | 21.5 |
| V4-E5-2640 | P100 | 4 | | | | | |
| V4-E5-2640 | V100 | 1 | | | | | |
| V4-E5-2695 | K40X | 1 | 36.4 | 20.5 | 18.8 | 19.6 | 28.4 |
| V4-E5-2698 | P100 | 2 | | | | | |