# Invocation command line:
# /home/HPC2021F1.0.1/bin/harness/runhpc --reportable --configfile nvhpc_final.cfg --tune base --define ucx --define model=acc --pmodel ACC --threads 1 --ranks 4 --size ref --iterations 3 --nopower --runmode speed --tune base --size ref tiny
# output_root was not used for this run
############################################################################
######################################################################
# Example configuration file for the NVIDIA HPC SDK Compilers
#
# Before using this config file, copy it to a new config (such as nvhpc.cfg) and edit as needed
#
# Defines: "model" => "mpi", "acc", "accmc", "omp", "tgt", "tgtgpu"  default "mpi"
#          "label" => ext base label, default "nv_final"
#
# MPI-only Command:
# runhpc -c nvhpc --reportable -T base --define model=mpi --ranks=40 tiny
#
# OpenACC offload to GPU Command:
# runhpc -c nvhpc --reportable -T base --define model=acc --ranks=4  tiny
#   Add "--define ucx" if using OpenMPI 4 with UCX support.
#
# OpenACC offload to Multicore CPU Command:
# runhpc -c nvhpc --reportable -T base --define model=accmc --ranks=4  tiny
#
# OpenMP Command:
# runhpc -c nvhpc --reportable -T base --define model=omp --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to Host Command:
# runhpc -c nvhpc --reportable -T base --define model=tgt --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to GPU Command:
# runhpc -c nvhpc --reportable -T base --define model=tgtgpu --ranks=4  tiny
#
#######################################################################

# Default macro values, used when the corresponding --define is not given
# on the runhpc command line.
%ifndef %{label}         # IF label is not set use nv_final
%   define label nv_final
%endif

%ifndef %{model}         # IF model is not set use mpi
%   define model mpi
%endif

# Select the MPI programming model whenever model is 'mpi', whether it was
# defaulted above or passed explicitly with --define model=mpi. The other
# models set their own pmodel in the matching %if block of the optimization
# section further down.
%if %{model} eq 'mpi'
pmodel = MPI
%endif

# Header-section options: these lines appear before the first section marker.
teeout = yes

# Adjust the number of make jobs to use here
makeflags=-j 40

# URL of the compiler flags description file (nv2021_flags.xml).
flagsurl000=http://www.spec.org/hpc2021/flags/nv2021_flags.xml

# Tester Information
# License number, sponsoring/testing organisation, hardware and software
# availability dates, and the name of the system under test.
license_num     = 28
test_sponsor    = Lenovo Global Technology
tester          = Lenovo Global Technology
hw_avail = Aug-2021
sw_avail = Aug-2021
prepared_by = Lenovo Global Technology
system_vendor = Lenovo Global Technology
system_name = ThinkSystem SD650-N V2 (Intel Xeon Platinum 8368Q, Tesla A100-SXM-40GB)
# Fileserver node description (node_fileserver_*): software state, CPU and
# cache layout, memory, disk, network adapter and accelerator fields.
# NOTE(review): sw_state is spelled "Multi-User" here but "Multi-user" for the
# compute node -- confirm which spelling is intended.
node_fileserver_syslbl = ThinkSystem SD650-N V2
node_fileserver_sw_state = Multi-User, run level 3
node_fileserver_sw_sharedfile = N/A
node_fileserver_sw_other = None
node_fileserver_sw_os = Red Hat Enterprise Linux Server release 8.3
node_fileserver_sw_localfile = xfs
# NOTE(review): the accelerator driver is N/A although four accelerators are
# listed for this node further down -- confirm.
node_fileserver_sw_accel_driver = N/A
node_fileserver_purpose = Fileserver
# NOTE(review): node_compute_order is also 1 -- confirm the intended ordering.
node_fileserver_order = 1
node_fileserver_hw_vendor = Lenovo Global Technology
node_fileserver_hw_tcache = 57 MB I+D on chip per chip
node_fileserver_hw_scache = 1280 KB I+D on chip per core
node_fileserver_hw_pcache = 32 KB I + 48 KB D on chip per core
node_fileserver_hw_other = None
node_fileserver_hw_ocache = None
node_fileserver_hw_nthreadspercore = 1
node_fileserver_hw_ncpuorder = 2 chips
node_fileserver_hw_ncoresperchip = 38
node_fileserver_hw_ncores = 76
node_fileserver_hw_nchips = 2
node_fileserver_hw_model = ThinkSystem SD650-N V2
node_fileserver_hw_memory = 512 GB (16 x 32 GB 2Rx8 PC4-3200A-R)
node_fileserver_hw_disk = 1 x 960 GB NVME 2.5" SSD
node_fileserver_hw_cpu_name = Intel Xeon Platinum 8368Q
node_fileserver_hw_cpu_mhz = 2600
node_fileserver_hw_cpu_char = Turbo up to 3.7 GHz
# Network adapter used by the fileserver node.
node_fileserver_hw_adapter_fs_slot_type = PCI-Express 4.0 x16
node_fileserver_hw_adapter_fs_ports_used = 1
node_fileserver_hw_adapter_fs_model = Mellanox ConnectX-6 HDR
node_fileserver_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-6 HDR
node_fileserver_hw_adapter_fs_firmware = 20.28.1002
node_fileserver_hw_adapter_fs_driver = 5.1-2.3.7
node_fileserver_hw_adapter_fs_data_rate = 200 Gb/s
node_fileserver_hw_adapter_fs_count = 1
# Accelerator fields for the fileserver node.
# NOTE(review): accel_vendor is "Nvidia" here but "Nvidia Corporation" for the
# compute node, and accel_connect repeats the accelerator model name where the
# compute node says "NVLink" -- confirm both values.
node_fileserver_hw_accel_vendor = Nvidia
node_fileserver_hw_accel_type = GPU
node_fileserver_hw_accel_model = Tesla A100 SXM4 40GB
node_fileserver_hw_accel_ecc = Yes
node_fileserver_hw_accel_desc = Nvidia Tesla A100 SXM4 40GB
node_fileserver_hw_accel_count = 4
node_fileserver_hw_accel_connect = Nvidia Tesla A100 SXM4 40GB
node_fileserver_count = 1
# Compute node description (node_compute_*): software state, CPU and cache
# layout, memory, disk, network adapter and accelerator fields.
# The operating system for this node is not set here; it is given by the
# node_compute_sw_os000/001 lines in the automatically added section at the
# end of this file.
node_compute_syslbl = ThinkSystem SD650-N V2
node_compute_sw_state = Multi-user, run level 3
node_compute_sw_sharedfile = NFS
node_compute_sw_other = None
node_compute_sw_localfile = xfs
node_compute_sw_accel_driver = 460.32.03
node_compute_purpose = compute
node_compute_order = 1
node_compute_hw_vendor = Lenovo Global Technology
node_compute_hw_tcache = 57 MB I+D on chip per chip
node_compute_hw_scache = 1280 KB I+D on chip per core
node_compute_hw_pcache = 32 KB I + 48 KB D on chip per core
node_compute_hw_other = None
node_compute_hw_ocache = None
node_compute_hw_nthreadspercore = 1
node_compute_hw_ncpuorder = 2 chips
# 2 chips x 38 cores per chip = 76 cores per node.
node_compute_hw_ncoresperchip = 38
node_compute_hw_ncores = 76
node_compute_hw_nchips = 2
node_compute_hw_model = ThinkSystem SD650-N V2
node_compute_hw_memory = 512 GB (16 x 32 GB 2Rx8 PC4-3200A-R)
node_compute_hw_disk = 1 x 480 GB 2.5" SSD
node_compute_hw_cpu_name = Intel Xeon Platinum 8368Q
node_compute_hw_cpu_mhz = 2600
node_compute_hw_cpu_char = Turbo up to 3.7 GHz
# Network adapter used by the compute node.
node_compute_hw_adapter_fs_slot_type = PCI-Express 4.0 x16
node_compute_hw_adapter_fs_ports_used = 1
node_compute_hw_adapter_fs_model = Mellanox ConnectX-6 HDR
node_compute_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-6 HDR
node_compute_hw_adapter_fs_firmware = 20.28.1002
node_compute_hw_adapter_fs_driver = 5.1-2.3.7
node_compute_hw_adapter_fs_data_rate = 200 Gb/s
node_compute_hw_adapter_fs_count = 1
# Accelerators in the compute node: four GPUs, connected via NVLink.
node_compute_hw_accel_vendor = Nvidia Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_model = Tesla A100 SXM4 40GB
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = Nvidia Tesla A100 SXM4 40GB
node_compute_hw_accel_count = 4
node_compute_hw_accel_connect = NVLink
node_compute_count = 1
# Interconnect description (interconnect_fs_*): a direct-connect topology
# with no switch, so the switch fields are 0 or N/A.
interconnect_fs_syslbl = Nvidia Mellanox ConnectX-6 HDR
interconnect_fs_purpose = MPI Traffic, NFS Access
interconnect_fs_order = 0
interconnect_fs_label = Nvidia Mellanox ConnectX-6 HDR
interconnect_fs_hw_vendor = Nvidia
interconnect_fs_hw_topo = Direct Connect
interconnect_fs_hw_switch_fs_ports = 0
interconnect_fs_hw_switch_fs_model = N/A
interconnect_fs_hw_switch_fs_firmware = N/A
interconnect_fs_hw_switch_fs_data_rate = N/A
interconnect_fs_hw_switch_fs_count = 0
interconnect_fs_hw_model = Nvidia Mellanox ConnectX-6 HDR


######################################################
# SUT Section
######################################################
# The include of Example_SUT.inc is commented out; the system description
# fields are written directly in this file instead.
#include: Example_SUT.inc

#[Software]
# System class, compiler and MPI library used for the run.
system_class = Homogenous Cluster
sw_compiler      = Nvidia HPC SDK 21.5
sw_mpi_library = Open MPI 4.0.5
sw_mpi_other = --
sw_other = --

#[General notes]

#######################################################################
# End of SUT section
######################################################################

######################################################################
# The header section of the config file.  Must appear
# before any instances of "section markers" (see below)
#
# label = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
# The label is built from the label and model macros, e.g. nv_final_acc.
label         = %{label}_%{model}
tune          = base
output_format = text
# Use the 'submit' command defined below for speed runs as well.
use_submit_for_speed = 1

# Setting 'strict_rundir_verify=0' will allow direct source code modifications
# but will disable the ability to create reportable results.
# May be useful for academic and research purposes
# strict_rundir_verify = 0

# Compiler Settings
# The 'default:' marker below opens the first named section; the compilers
# are invoked through the MPI wrapper commands.
default:
CC           = mpicc
CXX          = mpicxx
FC           = mpif90
# Compiler Version Flags
# Option passed to each compiler to make it print its version.
CC_VERSION_OPTION  = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION  = -V

# MPI options and binding environment, dependent upon Model being run
# Adjust to match your system

%ifdef %{ucx}
# if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI
# without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

# NOTE(review): MPIRUN_OPTS is only referenced by the commented-out generic
# submit line at the end of this block; none of the active submit lines
# below use it -- confirm whether that is intended.
MPIRUN_OPTS = --bind-to none

# Note that SPH_EXA is known to hang when using multiple nodes with some versions of UCX,
# to work around, add the following setting:
#MPIRUN_OPTS += --mca topo basic

# Per-model launch commands.  Only the acc, tgtgpu and mpi models get an
# explicit submit line here; accmc, omp and tgt have none in this block.
# All three lines export UCX_MEMTYPE_CACHE=n via -x, independent of %{ucx}.

# acc: 4 slots on localhost; each rank's command is wrapped by the
# bind_Toomie.pl script from the top-level directory ($[top]).
# NOTE(review): presumably a core-binding wrapper similar to the "bind.pl"
# quoted in the notes at the end of this file -- confirm.
%if %{model} eq 'acc'
        submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks perl $[top]/bind_Toomie.pl $command
	#submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks $command
%endif

# tgtgpu: identical launch command to the acc model above.
%if %{model} eq 'tgtgpu'
        submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks perl $[top]/bind_Toomie.pl $command
        #submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks $command
%endif

# mpi: 76 slots on a hard-coded host address, no binding wrapper.
%if %{model} eq 'mpi'
        submit = mpirun --allow-run-as-root -x UCX=1 -x UCX_MEMTYPE_CACHE=n -host 192.168.99.171:76 -np $ranks $command
%endif

# Generic launch command, currently disabled.
#submit = mpirun ${MPIRUN_OPTS} -np $ranks $command

#######################################################################
# Optimization

# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details
#      http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler

# Compiler flags applied to all models
# Each %if block below appends model-specific flags to OPTIMIZE and selects
# the matching pmodel; only the block whose test matches %{model} is active.
default=base=default:
OPTIMIZE       = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast
CXXPORTABILITY = --c++17

# OpenACC (GPU) flags
%if %{model} eq 'acc'
pmodel=ACC
OPTIMIZE += -acc=gpu -Minfo=accel -DSPEC_ACCEL_AWARE_MPI
%endif

# OpenACC (Multicore CPU) flags
%if %{model} eq 'accmc'
pmodel=ACC
OPTIMIZE += -acc=multicore -Minfo=accel

# Benchmark-specific portability define for 521.miniswp_t.
521.miniswp_t:
PORTABILITY+= -DSPEC_USE_HOST_THREADS=1

%endif

# OpenMP Threaded (CPU) flags
%if %{model} eq 'omp'
pmodel=OMP
OPTIMIZE += -mp -Minfo=mp
PORTABILITY += -D_OPENMP=201411
%endif

# OpenMP Targeting host flags
%if %{model} eq 'tgt'
pmodel=TGT
OPTIMIZE += -mp -Minfo=mp

# Note that NVHPC is in the process of adding OpenMP
# array reduction support so this option may be removed
# in the future
513.soma_t:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE

521.miniswp_t:
PORTABILITY+= -DSPEC_USE_HOST_THREADS=1

%endif

# OpenMP Targeting GPU flags
%if %{model} eq 'tgtgpu'
pmodel=TGT
OPTIMIZE += -mp=gpu -Minfo=mp

# Note that NVHPC is in the process of adding OpenMP
# array reduction support so this option may be removed
# in the future
513.soma_t:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE

%endif

# NOTE(review): the 'omp' block above already appends -D_OPENMP=201411 to
# PORTABILITY, so this appears to repeat the same define for 613.soma_s --
# confirm whether both are needed.
613.soma_s=default=default:
%if %{model} eq 'omp'
        PORTABILITY += -D_OPENMP=201411
%endif

# No peak flags set, so make peak use the same flags as base
default=peak=default:


# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
#
# NOTE(review): the submit notes below say ranks were bound with 'numactl',
# but the quoted script builds a 'taskset -c' command; they also name the
# script "bind.pl" while the active submit lines call bind_Toomie.pl -- confirm.
# NOTE(review): the submit command quoted in notes_submit_105..120 (two hosts,
# ${MPIRUN_OPTS}, --oversubscribe) differs from the active submit lines in
# this file, which use a single host -- confirm which one was actually used.
default:
notes_000 =Environment variables set by runhpc before the start of the run:
notes_005 =UCX_MEMTYPE_CACHE = "n"
notes_010 =UCX_TLS = "self,shm,cuda_copy"
notes_015               =
node_compute_sw_os000 = Red Hat Enterprise Linux Server release 8.3,
node_compute_sw_os001 = Kernel 4.18.0-193.el8.x86_64
notes_submit_000 =Indiviual Ranks were bound to the CPU cores on the same NUMA node as
notes_submit_005 =the GPU using 'numactl' within the following "bind.pl" perl script:
notes_submit_010 =---- Start bind.pl ------
notes_submit_015 =my %bind;
notes_submit_020 =$bind{0} = "1-3";
notes_submit_025 =$bind{1} = "4-7";
notes_submit_030 =$bind{2} = "8-10";
notes_submit_035 =$bind{3} = "11-14";
notes_submit_040 =$bind{4} = "41-43";
notes_submit_045 =$bind{5} = "44-47";
notes_submit_050 =$bind{6} = "61-63";
notes_submit_055 =$bind{7} = "64-67";
notes_submit_060 =my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK};
notes_submit_065 =my $cmd = "taskset -c $bind{$rank} ";
notes_submit_070 =while (my $arg = shift) {
notes_submit_075 =      $cmd .= "$arg ";
notes_submit_080 =}
notes_submit_085 =my $rc = system($cmd);
notes_submit_090 =exit($rc);
notes_submit_095 =---- End bind.pl ------
notes_submit_100 = The config file option 'submit' was used.
notes_submit_105 = submit = mpirun ${MPIRUN_OPTS} --allow-run-as-root --oversubscribe
notes_submit_110 = -host 192.168.99.171:4,192.168.99.172:4 -x UCX_MEMTYPE_CACHE=n
notes_submit_115 = -mca coll_hcoll_enable 1 -x HCOLL_MAIN_IB=mlx5_0:1 -mca pml ucx
notes_submit_120 = -x UCX_TLS=sm,dc,rc,knem,cuda_copy,cuda_ipc -npernode 4 --map-by core -np $ranks