# Invocation command line:
# /home/mcolgrove/HPC2021/bin/harness/runhpc -c nvhpc_acc --reportable -T base,peak small
# output_root was not used for this run
############################################################################
teeout = yes
makeflags = -j 40
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.xml
envars = 1
license_num = 019
test_sponsor = NVIDIA Corporation
tester = NVIDIA Corporation

######################################################
# SUT Section
######################################################
#include: dgx.inc

#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.9 of
sw_compiler001 = NVIDIA HPC SDK for Linux
sw_mpi_library = OpenMPI Version 4.0.5
sw_mpi_other = None
system_class = SMP
sw_other = None

#[General notes]

#######################################################################
# End of SUT section
######################################################################

label = nv
tune = base,peak
output_format = text,html,pdf
use_submit_for_speed = 1
reportable = 1

#include: selene.inc
# ----- Begin inclusion of 'selene.inc'
############################################################################
######################################################
# Example configuration information for a
# system under test (SUT) Section
######################################################

# General SUT info
system_vendor = NVIDIA Corporation
system_name = DGX A100 (AMD EPYC 7742, Tesla A100-SXM-80GB)
node_compute_sw_accel_driver = NVIDIA UNIX x86_64 Kernel Module 470.57.02
hw_avail = Jul-2020
sw_avail = Sep-2021
prepared_by = Mathew Colgrove (mcolgrove@nvidia.com)

# Computation node info
# [Node_Description: Hardware]
node_compute_syslbl = DGX A100
node_compute_order = 1
node_compute_count = 1
node_compute_purpose = compute
node_compute_hw_vendor = NVIDIA Corporation
node_compute_hw_model = DGX A100
node_compute_hw_cpu_name = AMD EPYC 7742
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 128
node_compute_hw_ncoresperchip = 64
node_compute_hw_nthreadspercore = 2
node_compute_hw_cpu_char = Turbo Boost up to 3400 MHz
node_compute_hw_cpu_mhz = 2250
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 256 MB I+D on chip per chip
node_compute_hw_tcache001 = 16 MB shared / 4 cores
node_compute_hw_ocache = None
node_compute_hw_memory = 2 TB (32 x 64 GB 2Rx8 PC4-3200AA-R)
node_compute_hw_disk000 = OS: 2TB U.2 NVMe SSD drive
node_compute_hw_disk001 = Internal Storage: 30TB (8x 3.84TB U.2 NVMe SSD
node_compute_hw_disk002 = drives)
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla A100-SXM-80GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = NVLINK 3.0, NVSWITCH 2.0 600GB/s
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes

#[Node_Description: Software]
node_compute_hw_adapter_fs_model = None
node_compute_hw_adapter_fs_count = 0
node_compute_hw_adapter_fs_slot_type = None
node_compute_hw_adapter_fs_data_rate = None
node_compute_hw_adapter_fs_ports_used = 0
node_compute_hw_adapter_fs_interconnect = None
node_compute_hw_adapter_fs_driver = None
node_compute_hw_adapter_fs_firmware = None
node_compute_sw_os000 = Ubuntu 20.04
node_compute_sw_os001 = 4.12.14-94.41-default
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = None
node_compute_sw_state = Run level 3 (multi-user)
node_compute_sw_other = None

#[Interconnect]
interconnect_fs_syslbl = None
interconnect_fs_order = 0
interconnect_fs_purpose = N/A
interconnect_fs_hw_vendor = N/A
interconnect_fs_hw_model = N/A
interconnect_fs_hw_switch_fs_model = N/A
interconnect_fs_hw_switch_fs_count = 0
interconnect_fs_hw_switch_fs_ports = 0
interconnect_fs_hw_topo = N/A
interconnect_fs_hw_switch_fs_data_rate = 0
interconnect_fs_hw_switch_fs_firmware = 0

#######################################################################
# End of SUT section
# If this config file were to be applied to several SUTs, edits would
# be needed only ABOVE this point.
######################################################################
# ---- End inclusion of '/home/mcolgrove/HPC2021/config/selene.inc'

# Compiler Settings
default:
CC = mpicc
CXX = mpicxx
FC = mpif90

# Compiler Version Flags
CC_VERSION_OPTION = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION = -V

MPIRUN_OPTS = --bind-to none
MPIRUN_OPTS += --allow-run-as-root
submit = mpirun ${MPIRUN_OPTS} -np $ranks specperl $[top]/bindACC.pl $command
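
# For illustration: with ranks=8 (set below), the submit line above expands to
# approximately the following launch, where $command is the per-benchmark
# executable and arguments generated by the harness:
#
#   mpirun --bind-to none --allow-run-as-root -np 8 \
#       specperl $[top]/bindACC.pl $command
#
# Each rank then re-launches $command under 'numactl' via the bindACC.pl
# script listed in the submit notes below.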
#######################################################################
# Optimization
#######################################################################
default:
pmodel=ACC
ranks=8
threads=1

default=base=default:
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu
CXXPORTABILITY = --c++17

605.lbm_s=peak=default:
basepeak=1

613.soma_s=peak=default:
OPTIMIZE = -w -fast -O3 -acc=gpu -gpu=pinned

618.tealeaf_s=peak=default:
OPTIMIZE = -w -fast -Msafeptr -acc=gpu

619.clvleaf_s=peak=default:
basepeak=1

621.miniswp_s=peak=default:
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu -gpu=pinned

628.pot3d_s=peak=default:
OPTIMIZE = -w -Mstack_arrays -fast -acc=gpu

632.sph_exa_s=peak=default:
basepeak=1

634.hpgmgfv_s=peak=default:
OPTIMIZE = -w -fast -acc=gpu -gpu=pinned -static-nvidia

635.weather_s=peak=default:
basepeak=1

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
notes_comp_000 = Binaries built and run within an NVHPC SDK 21.9, CUDA 11.4, Ubuntu 20.04
notes_comp_005 = container available from NVIDIA's NGC Catalog:
notes_comp_010 = https://ngc.nvidia.com/catalog/containers/nvidia:nvhpc
notes_comp_015 =
notes_submit_000 = MPI startup command:
notes_submit_005 = mpirun command was used to start MPI jobs.
notes_submit_010 =
notes_submit_015 = Individual ranks were bound to the CPU cores on the same NUMA node as
notes_submit_020 = the GPU using 'numactl' within the following "bindACC.pl" perl script:
notes_submit_025 = ---- Start bindACC.pl ------
notes_submit_030 = my %core_map = (
notes_submit_035 =   0=>48, 1=>56, 2=>16, 3=>24, 4=>112, 5=>120, 6=>80, 7=>88
notes_submit_040 = );
notes_submit_045 = my %mem_map = (
notes_submit_050 =   0=>3, 1=>3, 2=>1, 3=>1, 4=>7, 5=>7, 6=>5, 7=>5,
notes_submit_055 = );
notes_submit_060 = my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK};
notes_submit_065 = my $mrank = $rank % 8;
notes_submit_070 = my $cplus = int($rank/8);
notes_submit_075 = my $core = $core_map{$mrank} + $cplus;
notes_submit_080 = my $mem = $mem_map{$mrank};
notes_submit_085 = my $cmd = "numactl -C $core -m $mem ";
notes_submit_090 = while (my $arg = shift) {
notes_submit_095 =   $cmd .= "$arg ";
notes_submit_100 = }
notes_submit_105 = system($cmd);
notes_submit_110 = ---- End bindACC.pl ------
notes_submit_115 =
notes_plat_000 = Detailed A100 Information from nvaccelinfo
notes_plat_005 = CUDA Driver Version: 11040
notes_plat_010 = NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.57.02
notes_plat_015 = Device Number: 0
notes_plat_020 = Device Name: NVIDIA A100-SXM-80GB
notes_plat_025 = Device Revision Number: 8.0
notes_plat_030 = Global Memory Size: 85198045184
notes_plat_035 = Number of Multiprocessors: 108
notes_plat_040 = Concurrent Copy and Execution: Yes
notes_plat_045 = Total Constant Memory: 65536
notes_plat_050 = Total Shared Memory per Block: 49152
notes_plat_055 = Registers per Block: 65536
notes_plat_060 = Warp Size: 32
notes_plat_065 = Maximum Threads per Block: 1024
notes_plat_070 = Maximum Block Dimensions: 1024, 1024, 64
notes_plat_075 = Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_plat_080 = Maximum Memory Pitch: 2147483647B
notes_plat_085 = Texture Alignment: 512B
notes_plat_090 = Clock Rate: 1410 MHz
notes_plat_095 = Execution Timeout: No
notes_plat_100 = Integrated Device: No
notes_plat_105 = Can Map Host Memory: Yes
notes_plat_110 = Compute Mode: default
notes_plat_115 = Concurrent Kernels: Yes
notes_plat_120 = ECC Enabled: Yes
notes_plat_125 = Memory Clock Rate: 1593 MHz
notes_plat_130 = Memory Bus Width: 5120 bits
notes_plat_135 = L2 Cache Size: 41943040 bytes
notes_plat_140 = Max Threads Per SMP: 2048
notes_plat_145 = Async Engines: 3
notes_plat_150 = Unified Addressing: Yes
notes_plat_155 = Managed Memory: Yes
notes_plat_160 = Concurrent Managed Memory: Yes
notes_plat_165 = Preemption Supported: Yes
notes_plat_170 = Cooperative Launch: Yes
notes_plat_175 = Multi-Device: Yes
notes_plat_180 = Default Target: cc80
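
# For illustration: a worked example of the bindACC.pl binding above, assuming
# OpenMPI sets OMPI_COMM_WORLD_LOCAL_RANK for each local rank. With ranks=8
# (one rank per GPU), each rank's command is prefixed as:
#
#   rank 0 -> numactl -C 48  -m 3 $command
#   rank 1 -> numactl -C 56  -m 3 $command
#   rank 2 -> numactl -C 16  -m 1 $command
#   rank 3 -> numactl -C 24  -m 1 $command
#   rank 4 -> numactl -C 112 -m 7 $command
#   rank 5 -> numactl -C 120 -m 7 $command
#   rank 6 -> numactl -C 80  -m 5 $command
#   rank 7 -> numactl -C 88  -m 5 $command
#
# so each rank runs on a core and memory node local to its GPU. If more than
# 8 ranks were launched, $cplus = int($rank/8) would shift each extra rank to
# the next core in the same NUMA node (e.g. rank 8 -> core 49, memory node 3).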