# Invocation command line:
# /lustre/fsw/devtech/hpc-devtech/cponder/SPEC-HPG/2022-09-03.documentation/bin/harness/runhpc --flagsurl nvhpc_flags.xml --configfile /lustre/fsw/devtech/hpc-devtech/cponder/SPEC-HPG/2022-09-03.documentation/3030083.run_large.11_0.REPORTABLE.128_NODES/REPORTABLE.cfg --define CONTROL=Reportable --iterations 3 --nobuild --action run --tune base --nopower --runmode speed --tune base --size ref large
# output_root was not used for this run
############################################################################
env_vars = 1
teeout = yes
backup_config = no
preENV_SPEC_NO_RUNDIR_DEL = on
output_format = text,cfg,html,pdf,rsf
reportable = 1
mean_anyway = 0
strict_rundir_verify = 1
ignore_errors = no
tune = base,peak
iterations = 5
showtimer = 0
%define TIMELIMIT

default:
license_num = 019
test_sponsor = NVIDIA Corporation
tester = NVIDIA Corporation

default:
system_vendor = NVIDIA Corporation
interconnect_fs_hw_switch_fs_model = NVIDIA Quantum QM8700
interconnect_comm_hw_switch_comm_model = NVIDIA Quantum QM8700
system_name000 = Selene: NVIDIA DGX SuperPOD
system_name001 = (AMD EPYC 7742 2.25 GHz, Tesla A100-SXM-80 GB)
node_compute_sw_accel_driver = NVIDIA UNIX x86_64 Kernel Module 470.103.01
hw_avail = Jul-2020
sw_avail = Mar-2022
prepared_by000 = Carl Ponder (cponder@nvidia.com) &
prepared_by001 = Mathew Colgrove (mcolgrove@nvidia.com)

default:
system_class = SMP
interconnect_comm_syslbl = Multi-rail InfiniBand HDR fabric
interconnect_comm_order = 2
interconnect_comm_purpose = Inter-process communication
interconnect_comm_hw_vendor = NVIDIA
interconnect_comm_hw_model = N/A
interconnect_comm_hw_switch_comm_count = 164
interconnect_comm_hw_switch_comm_ports = 40
interconnect_comm_hw_topo = Full three-level fat-tree
interconnect_comm_hw_switch_comm_data_rate = 200 Gb/s per port
interconnect_comm_hw_switch_comm_firmware = MLNX-OS v3.10.2202
interconnect_fs_syslbl = DDN EXAScaler file system
interconnect_fs_order = 3
interconnect_fs_purpose = Global storage
interconnect_fs_hw_vendor = NVIDIA
interconnect_fs_hw_model = N/A
interconnect_fs_hw_switch_fs_count = 26
interconnect_fs_hw_switch_fs_ports = 40
interconnect_fs_hw_topo = Full three-level fat-tree
interconnect_fs_hw_switch_fs_data_rate = 200 Gb/s per port
interconnect_fs_hw_switch_fs_firmware = MLNX-OS v3.10.2202
notes_submit_000 = MPI startup command:
notes_submit_005 = The srun command was used to start MPI jobs.
notes_submit_010 =
notes_submit_015 = Individual ranks were bound to NUMA nodes, GPUs, and NICs using this "wrapper.GPU" bash script for the case of 1 rank per GPU:
notes_submit_020 =
notes_submit_025 = ln -s -f libnuma.so.1 /usr/lib/x86_64-linux-gnu/libnuma.so
notes_submit_030 = export LD_LIBRARY_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_035 = export LD_RUN_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_040 = declare -a NUMA_LIST
notes_submit_045 = declare -a GPU_LIST
notes_submit_050 = declare -a NIC_LIST
notes_submit_055 = NUMA_LIST=($NUMAS)
notes_submit_060 = GPU_LIST=($GPUS)
notes_submit_065 = NIC_LIST=($NICS)
notes_submit_070 = export UCX_NET_DEVICES=${NIC_LIST[$SLURM_LOCALID]}:1
notes_submit_075 = export OMPI_MCA_btl_openib_if_include=${NIC_LIST[$SLURM_LOCALID]}
notes_submit_080 = export CUDA_VISIBLE_DEVICES=${GPU_LIST[$SLURM_LOCALID]}
notes_submit_085 = numactl -l -N ${NUMA_LIST[$SLURM_LOCALID]} $*
notes_submit_090 =
notes_submit_095 = and this "wrapper.MPS" bash script for the oversubscribed case:
notes_submit_100 =
notes_submit_105 = ln -s -f libnuma.so.1 /usr/lib/x86_64-linux-gnu/libnuma.so
notes_submit_110 = export LD_LIBRARY_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_115 = export LD_RUN_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_120 = declare -a NUMA_LIST
notes_submit_125 = declare -a GPU_LIST
notes_submit_130 = declare -a NIC_LIST
notes_submit_135 = NUMA_LIST=($NUMAS)
notes_submit_140 = GPU_LIST=($GPUS)
notes_submit_145 = NIC_LIST=($NICS)
notes_submit_150 = NUM_GPUS=${#GPU_LIST[@]}
notes_submit_155 = RANKS_PER_GPU=$((SLURM_NTASKS_PER_NODE / NUM_GPUS))
notes_submit_160 = GPU_LOCAL_RANK=$((SLURM_LOCALID / RANKS_PER_GPU))
notes_submit_165 = export UCX_NET_DEVICES=${NIC_LIST[$GPU_LOCAL_RANK]}:1
notes_submit_170 = export OMPI_MCA_btl_openib_if_include=${NIC_LIST[$GPU_LOCAL_RANK]}
notes_submit_175 = set +e
notes_submit_180 = nvidia-cuda-mps-control -d 1>&2
notes_submit_185 = set -e
notes_submit_190 = export CUDA_VISIBLE_DEVICES=${GPU_LIST[$GPU_LOCAL_RANK]}
notes_submit_195 = numactl -l -N ${NUMA_LIST[$GPU_LOCAL_RANK]} $*
notes_submit_200 = if [ $SLURM_LOCALID -eq 0 ]
notes_submit_205 = then
notes_submit_210 = echo 'quit' | nvidia-cuda-mps-control 1>&2
notes_submit_215 = fi

default:
ENV_GPUS = 0 1 2 3 4 5 6 7
ENV_NICS = mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9
ENV_NUMAS = 2 3 0 1 6 7 4 5

default:
%define GPUS_PER_NODE 8
%define GPUS_PER_NODEx2 (%{GPUS_PER_NODE}*2)
%define RANKS_PROCS (%{ENV_SLURM_JOB_NUM_NODES}*%{ENV_SLURM_NTASKS_PER_NODE})
%define RANKS_1xGPUS (1*%{ENV_SLURM_JOB_NUM_NODES}*%{GPUS_PER_NODE})
%define RANKS_2xGPUS (2*%{ENV_SLURM_JOB_NUM_NODES}*%{GPUS_PER_NODE})
%define THREADS_PROCS (256/%{ENV_SLURM_NTASKS_PER_NODE})
%define THREADS_1xGPUS (256/%{GPUS_PER_NODE})
%define THREADS_2xGPUS (256/%{GPUS_PER_NODE}/2)
%define HALF_NUMA_THREADS (128/%{GPUS_PER_NODE}/2)

large=base=default:
threads=%{HALF_NUMA_THREADS}

default:
ENV_ENROOT_LOGIN_SHELL=n
ENV_NVIDIA_DISABLE_REQUIRE=1
SRUN_OPTS=
SRUN_OPTS+= --mpi=pmix
SRUN_OPTS+= --container-entrypoint
SRUN_OPTS+= --container-name spec-hpc
SRUN_OPTS+= --container-mounts $[top] --container-workdir `/bin/pwd`
SRUN_OPTS+= -N %{ENV_SLURM_JOB_NUM_NODES}
ranks=%{RANKS_2xGPUS}
threads=%{THREADS_2xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODEx2} %{TIMELIMIT} $[top]/wrapper.MPS $command

705.lbm_m=peak=default:

513.soma_t,613.soma_s=peak=default:
ranks=%{RANKS_1xGPUS}
threads=%{THREADS_1xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.GPU $command

518.tealeaf_t,618.tealeaf_s,718.tealeaf_m,818.tealeaf_l=peak=default:
ranks=%{RANKS_1xGPUS}
threads=%{THREADS_1xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.GPU $command

519.clvleaf_t=peak=default:
ranks=%{RANKS_2xGPUS}
threads=%{THREADS_2xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODEx2} %{TIMELIMIT} $[top]/wrapper.MPS $command

619.clvleaf_s,719.clvleaf_m,819.clvleaf_l=peak=default:
ranks=%{RANKS_PROCS}
threads=%{THREADS_PROCS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{ENV_SLURM_NTASKS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.MPS $command

819.clvleaf_l=peak=default:
ENV_NV_ACC_GANGLIMIT=3000000
ENV_NUMAS = 3 3 1 1 7 7 5 5
threads=%{HALF_NUMA_THREADS}

521.miniswp_t=peak=default:

621.miniswp_s=peak=default:
basepeak=1

528.pot3d_t,628.pot3d_s,728.pot3d_m=peak=default:

828.pot3d_l=peak=default:
ranks=%{RANKS_1xGPUS}
threads=%{THREADS_1xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.GPU $command

532.sph_exa_t,632.sph_exa_s=peak=default:
ENV_NV_ACC_GANGLIMIT=3000000
ranks=%{RANKS_PROCS}
threads=%{THREADS_PROCS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{ENV_SLURM_NTASKS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.MPS $command

534.hpgmgfv_t=peak=default:
ranks=%{RANKS_2xGPUS}
threads=%{THREADS_2xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODEx2} %{TIMELIMIT} $[top]/wrapper.MPS $command

634.hpgmgfv_s,734.hpgmgfv_m,834.hpgmgfv_l=peak=default:
ranks=%{RANKS_PROCS}
threads=%{THREADS_PROCS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{ENV_SLURM_NTASKS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.MPS $command

635.weather_s=peak=default:
basepeak=1

735.weather_m=peak=default:
ENV_NUMAS = 3 3 1 1 7 7 5 5
threads=%{HALF_NUMA_THREADS}

default:
node_compute_syslbl = DGX A100
node_compute_order = 1
node_compute_count = 128
node_compute_purpose = compute
node_compute_hw_vendor = NVIDIA Corporation
node_compute_hw_model = NVIDIA DGX A100 System
node_compute_hw_cpu_name = AMD EPYC 7742
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 128
node_compute_hw_ncoresperchip = 64
node_compute_hw_nthreadspercore = 2
node_compute_hw_cpu_char = Turbo Boost up to 3400 MHz
node_compute_hw_cpu_mhz = 2250
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 256 MB I+D on chip per chip
node_compute_hw_tcache001 = (16 MB shared / 4 cores)
node_compute_hw_ocache = None
node_compute_hw_memory = 2 TB (32 x 64 GB 2Rx8 PC4-3200AA-R)
node_compute_hw_disk000 = OS: 2TB U.2 NVMe SSD drive
node_compute_hw_disk001 = Internal Storage: 30TB (8x 3.84TB U.2 NVMe SSD
node_compute_hw_disk002 = drives)
node_compute_hw_other = None
node_compute_hw_adapter_comm_interconnect = InfiniBand / Communication
node_compute_hw_adapter_comm_model = NVIDIA ConnectX-6 MT28908
node_compute_hw_adapter_comm_count = 8
node_compute_hw_adapter_comm_ports_used = 1
node_compute_hw_adapter_comm_slot_type = PCIe Gen4
node_compute_hw_adapter_comm_data_rate = 200 Gb/s
node_compute_hw_adapter_comm_driver = InfiniBand: 5.4-3.4.0.0
node_compute_hw_adapter_comm_firmware = InfiniBand: 20.32.1010
node_compute_hw_adapter_fs_interconnect = InfiniBand / FileSystem
node_compute_hw_adapter_fs_model = NVIDIA ConnectX-6 MT28908
node_compute_hw_adapter_fs_count = 2
node_compute_hw_adapter_fs_ports_used = 2
node_compute_hw_adapter_fs_slot_type = PCIe Gen4
node_compute_hw_adapter_fs_data_rate = 200 Gb/s
node_compute_hw_adapter_fs_driver = Ethernet: 5.4-3.4.0.0
node_compute_hw_adapter_fs_firmware = Ethernet: 20.32.1010
node_compute_sw_os000 = Ubuntu 20.04
node_compute_sw_os001 = 5.4.0-121-generic
node_compute_sw_localfile = ext4
node_compute_sw_sharedfile = Lustre
node_compute_sw_state = Multi-user, run level 3
node_compute_sw_other = None

default:
node_compute_hw_accel_model = Tesla A100-SXM-80 GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = NVLINK 3.0, NVSWITCH 2.0 600 GB/s
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes

default:
notes_plat_000 = Detailed A100 Information from nvaccelinfo
notes_plat_005 = CUDA Driver Version: 11040
notes_plat_010 = NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.7.01
notes_plat_015 = Device Number: 0
notes_plat_020 = Device Name: NVIDIA A100-SXM-80 GB
notes_plat_025 = Device Revision Number: 8.0
notes_plat_030 = Global Memory Size: 85198045184
notes_plat_035 = Number of Multiprocessors: 108
notes_plat_040 = Concurrent Copy and Execution: Yes
notes_plat_045 = Total Constant Memory: 65536
notes_plat_050 = Total Shared Memory per Block: 49152
notes_plat_055 = Registers per Block: 65536
notes_plat_060 = Warp Size: 32
notes_plat_065 = Maximum Threads per Block: 1024
notes_plat_070 = Maximum Block Dimensions: 1024, 1024, 64
notes_plat_075 = Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_plat_080 = Maximum Memory Pitch: 2147483647B
notes_plat_085 = Texture Alignment: 512B
notes_plat_090 = Clock Rate: 1410 MHz
notes_plat_095 = Execution Timeout: No
notes_plat_100 = Integrated Device: No
notes_plat_105 = Can Map Host Memory: Yes
notes_plat_110 = Compute Mode: default
notes_plat_115 = Concurrent Kernels: Yes
notes_plat_120 = ECC Enabled: Yes
notes_plat_125 = Memory Clock Rate: 1593 MHz
notes_plat_130 = Memory Bus Width: 5120 bits
notes_plat_135 = L2 Cache Size: 41943040 bytes
notes_plat_140 = Max Threads Per SMP: 2048
notes_plat_145 = Async Engines: 3
notes_plat_150 = Unified Addressing: Yes
notes_plat_155 = Managed Memory: Yes
notes_plat_160 = Concurrent Managed Memory: Yes
notes_plat_165 = Preemption Supported: Yes
notes_plat_170 = Cooperative Launch: Yes
notes_plat_175 = Multi-Device: Yes
notes_plat_180 = Default Target: cc80

default:
makeflags = -j 40
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.2022-11-03.xml

default:
sw_compiler000 = C/C++/Fortran: Version 22.3 of
sw_compiler001 = NVIDIA HPC SDK for Linux
sw_mpi_library = OpenMPI Version 4.1.2rc4
sw_mpi_other = HPC-X Software Toolkit Version 2.10
sw_other = None

default:
label = %{CONTROL}.11_0

default:
CC = mpicc
CXX = mpicxx
FC = mpif90
CC_VERSION_OPTION = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION = -V
ENV_OMPI_MCA_pml=ucx
ENV_OMPI_MCA_topo=basic
ENV_UCX_LOG_LEVEL=error
ENV_OMPI_MCA_coll_hcoll_enable=1
ENV_CUDA_CACHE_DISABLE=1
ENV_HCOLL_BUFFER_POOL_MEM_PER_NODE=1024Mb
ENV_RETRY_COUNT=1000
ENV_UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy
ENV_UCX_RNDV_SCHEME=get_zcopy
ENV_UCX_RNDV_THRESH=8192
ENV_UCX_MAX_RNDV_RAILS=1
ENV_OMPI_MCA_pml_ucx_devices=any
ENV_OMPI_MCA_pml_ucx_tls=any

default:
pmodel=ACC
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2
CXXPORTABILITY = --c++17

505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l=default=default:
CPORTABILITY = -DSPEC_OPENACC_NO_SELF

505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l=peak=default:
OPTIMIZE = -w -O3 -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80,maxregcount:128 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -mp

513.soma_t,613.soma_s=peak=default:

518.tealeaf_t,618.tealeaf_s,718.tealeaf_m,818.tealeaf_l=peak=default:
OPTIMIZE = -w -O3 -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -mp
COPTIMIZE = -Msafeptr

519.clvleaf_t,619.clvleaf_s,719.clvleaf_m,819.clvleaf_l=peak=default:
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -mp

521.miniswp_t,621.miniswp_s=peak=default:
OPTIMIZE = -w -O3 -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2
COPTIMIZE = -Msafeptr

528.pot3d_t,628.pot3d_s,728.pot3d_m,828.pot3d_l=peak=default:
ENV_HCOLL_BUFFER_POOL_MEM_PER_NODE=512Mb
srcalt=acc_async
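# For reference, a worked expansion of the peak submit template above, as a
# sketch only: assuming SLURM_JOB_NUM_NODES=128 (the node_compute_count of
# this result) and GPUS_PER_NODE=8, the macros give RANKS_1xGPUS = 1*128*8 =
# 1024 and THREADS_1xGPUS = 256/8 = 32, and %{TIMELIMIT} is defined empty,
# so a wrapper.GPU peak section such as 828.pot3d_l launches roughly as:
#
#   srun --mpi=pmix --container-entrypoint --container-name spec-hpc \
#        --container-mounts $[top] --container-workdir `/bin/pwd` \
#        -N 128 -n 1024 -c 32 --ntasks-per-node 8 \
#        $[top]/wrapper.GPU $command
#
# $[top] and $command are harness placeholders, left unexpanded here; the
# wrapper.MPS sections expand the same way with RANKS_2xGPUS = 2048 ranks
# and THREADS_2xGPUS = 16 threads per rank.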
532.sph_exa_t,632.sph_exa_s=peak=default:
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -Mquad
COPTIMIZE = -Msafeptr

534.hpgmgfv_t,634.hpgmgfv_s,734.hpgmgfv_m,834.hpgmgfv_l=peak=default:
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2
COPTIMIZE = -Msafeptr

535.weather_t,635.weather_s,735.weather_m,835.weather_l=peak=default:
srcalt=acc_collapse

default:
notes_comp_000 = Binaries were built and run within an NVHPC SDK 22.3 / CUDA 11.0 / Ubuntu 20.04
notes_comp_005 = container available from NVIDIA GPU Cloud (NGC):
notes_comp_010 = https://ngc.nvidia.com/catalog/containers/nvidia:nvhpc
notes_comp_015 = https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
notes_comp_020 =

default:
notes_000 = Full system details documented here:
notes_005 = https://images.nvidia.com/aem-dam/Solutions/Data-Center/gated-resources/nvidia-dgx-superpod-a100.pdf
notes_010 =
notes_015 = Environment variables set by runhpc before the start of the run:
notes_020 = SPEC_NO_RUNDIR_DEL = "on"
notes_025 =
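############################################################################
# For readability: the "wrapper.GPU" binding script quoted line-by-line in
# the notes_submit entries above, reassembled here as a contiguous bash
# sketch with explanatory comments.  The shebang is assumed (the notes call
# it a bash script); NUMAS, GPUS and NICS come from the ENV_NUMAS, ENV_GPUS
# and ENV_NICS settings in this config, and SLURM_LOCALID is set by srun.
#
#   #!/bin/bash
#   # Make libnuma visible to numactl inside the container.
#   ln -s -f libnuma.so.1 /usr/lib/x86_64-linux-gnu/libnuma.so
#   export LD_LIBRARY_PATH+=:/usr/lib/x86_64-linux-gnu
#   export LD_RUN_PATH+=:/usr/lib/x86_64-linux-gnu
#   # Per-node resource lists, one entry per local rank.
#   declare -a NUMA_LIST
#   declare -a GPU_LIST
#   declare -a NIC_LIST
#   NUMA_LIST=($NUMAS)
#   GPU_LIST=($GPUS)
#   NIC_LIST=($NICS)
#   # Select this rank's NIC, GPU and NUMA node by its local rank index,
#   # then run the benchmark command under that binding.
#   export UCX_NET_DEVICES=${NIC_LIST[$SLURM_LOCALID]}:1
#   export OMPI_MCA_btl_openib_if_include=${NIC_LIST[$SLURM_LOCALID]}
#   export CUDA_VISIBLE_DEVICES=${GPU_LIST[$SLURM_LOCALID]}
#   numactl -l -N ${NUMA_LIST[$SLURM_LOCALID]} $*
#
# With the defaults above (ENV_NUMAS = 2 3 0 1 6 7 4 5, ENV_GPUS = 0..7,
# ENV_NICS = mlx5_0 ... mlx5_9), local rank 0 runs on NUMA node 2 with GPU 0
# and mlx5_0, local rank 1 on NUMA node 3 with GPU 1 and mlx5_1, and so on.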