# Invocation command line:
# /home/opt/app/hpc2021/bin/harness/runhpc --reportable --define EXPID=submission/7.29/small.acc_rank_7 -c xfusion.acc.base-peak.cfg -T base,peak --define model=acc --pmodel ACC --define RANKS=7 --size ref --iterations 2 --flagsurl ./config/flags/nvhpc_flags.xml --output-format=all small
# output_root was not used for this run
############################################################################
######################################################################
# Example configuration file for the NVIDIA HPC SDK Compilers
#
# Before using this config file, copy it to a new config (such as nvhpc.cfg) and edit as needed
#
# Defines: "model" => "mpi", "acc", "omp", "tgt", "tgtgpu"  default "mpi"
#          "label" => ext base label, default "nv"
#
# MPI-only Command:
# runhpc -c nvhpc --reportable -T base --define model=mpi --ranks=40 tiny
#
# OpenACC offload to GPU Command:
# runhpc -c nvhpc --reportable -T base --define model=acc --ranks=4 tiny
# Add "--define ucx" if using OpenMPI 4 with UCX support.
#
# OpenACC offload to Multicore CPU Command:
# runhpc -c nvhpc --reportable -T base --define model=accmc --ranks=4 tiny
#
# OpenMP Command:
# runhpc -c nvhpc --reportable -T base --define model=omp --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to Host Command:
# runhpc -c nvhpc --reportable -T base --define model=tgt --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to GPU Command:
# runhpc -c nvhpc --reportable -T base --define model=tgtgpu --ranks=4 tiny
#
#######################################################################

expid=
%ifdef %{EXPID}
expid=%{EXPID}
%endif

allow_label_override = yes  # label controls srcalt: simd - for simd

build_in_build_dir=0        # build in run dir

basepeak=0

%ifndef %{label}            # IF label is not set use xfusion
%   define label xfusion
%endif

%ifndef %{model}            # IF model is not set use acc
%   define pmodel ACC
%endif

teeout = yes
makeflags=-j

# Tester Information
license_num  = 6488
test_sponsor = xFusion
tester       = xFusion

######################################################
# SUT Section
######################################################
#include: Example_SUT.inc
#  ----- Begin inclusion of 'Example_SUT.inc'
############################################################################
######################################################
# Example configuration information for a
# system under test (SUT) Section
######################################################

# General SUT info
system_vendor = xFusion
system_name   = FusionServer G5500 V6 (Intel Xeon Platinum 8380, Nvidia A100-PCIE-80G)
node_compute_sw_accel_driver = NVIDIA UNIX x86_64 Kernel Module 515.43.04
hw_avail      = Apr-2021
sw_avail      = May-2022
prepared_by   = xFusion

# Computation node info
# [Node_Description: Hardware]
node_compute_syslbl = FusionServer G5500 V6
node_compute_order = 1
node_compute_count = 1
node_compute_purpose = compute
node_compute_hw_vendor = xFusion
node_compute_hw_model = FusionServer G5500 V6
node_compute_hw_cpu_name = Intel Xeon Platinum 8380
node_compute_hw_ncpuorder = 1, 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 80
node_compute_hw_ncoresperchip = 40
node_compute_hw_nthreadspercore = 1
node_compute_hw_cpu_char = Intel Turbo Boost Technology up to 3.4 GHz
node_compute_hw_cpu_mhz = 2300
node_compute_hw_pcache = 32 KB I + 48 KB D on chip per core
node_compute_hw_scache = 1.25 MB I+D on chip per core
node_compute_hw_tcache = 60 MB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 1 TB (16 x 64 GB 2Rx4 PC4-3200A-R)
node_compute_hw_disk = 1 x 3.2 TB NVMe SSD
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla A100 PCIe 80GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = Nvidia Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = PCIe Gen4 x16
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = Nvidia Tesla A100 PCIe 80GB

#[Node_Description: Software]
node_compute_hw_adapter_fs_model = None
node_compute_hw_adapter_fs_count = 0
node_compute_hw_adapter_fs_slot_type = None
node_compute_hw_adapter_fs_data_rate = None
node_compute_hw_adapter_fs_ports_used = 0
node_compute_hw_adapter_fs_interconnect = None
node_compute_hw_adapter_fs_driver = None
node_compute_hw_adapter_fs_firmware = None
node_compute_sw_os000 = CentOS Linux release 8.2.2004
node_compute_sw_os001 = 4.18.0-193.el8.x86_64
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = None
node_compute_sw_state = Multi-user, run level 3
node_compute_sw_other = None

#[Fileserver]

#[Interconnect]
interconnect_fs_syslbl = None
interconnect_fs_order = 0
interconnect_fs_purpose = None
interconnect_fs_hw_vendor = None
interconnect_fs_hw_model = None
interconnect_fs_hw_switch_fs_model = None
interconnect_fs_hw_switch_fs_count = 0
interconnect_fs_hw_switch_fs_ports = 0
interconnect_fs_hw_topo = None
interconnect_fs_hw_switch_fs_data_rate = None
interconnect_fs_hw_switch_fs_firmware = None

#######################################################################
# End of SUT section
# If this config file were to be applied to several SUTs, edits would
# be needed only ABOVE this point.
######################################################################
#  ---- End inclusion of '/home/HPC2021F1.0.1/config/Example_SUT.inc'

######################################################################
# The header section of the config file.  Must appear
# before any instances of "section markers" (see below)
#
# ext  = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label = %{label}_%{model}
tune = all
output_format = all
use_submit_for_speed = 1

# Compiler Settings
default:
CC  = mpicc
CXX = mpicxx
FC  = mpif90

system_class   = SMP
sw_compiler    = Nvidia HPC SDK 22.5
sw_mpi_library = OpenMPI Version 4.0.5, included with NVHPC SDK

# Compiler Version Flags
CC_VERSION_OPTION  = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION  = -V

%ifdef %{ucx}
# if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI
# without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

MPIRUN_OPTS = --allow-run-as-root --bind-to none
submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -np $ranks perl $[top]/bind.pl $command

# Optimization
default:
pmodel=ACC

default=base=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -Mfprelaxed -Mnouniform -Mstack_arrays -DSPEC_ACCEL_AWARE_MPI
CXXPORTABILITY = --c++17

505.lbm_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI

513.soma_t=peak=default:
basepeak=1

518.tealeaf_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

519.clvleaf_t=peak=default:
basepeak=1

521.miniswp_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -gpu=pinned # -DSPEC_ACCEL_AWARE_MPI

528.pot3d_t=peak=default:
basepeak=1

532.sph_exa_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

534.hpgmgfv_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

535.weather_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

605.lbm_s=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI

613.soma_s=peak=default:
basepeak=1

618.tealeaf_s=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

619.clvleaf_s=peak=default:
basepeak=1

621.miniswp_s=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -gpu=pinned # -DSPEC_ACCEL_AWARE_MPI

628.pot3d_s=peak=default:
basepeak=1

632.sph_exa_s=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

634.hpgmgfv_s=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

635.weather_s=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.2022-08-24.xml
notes_submit_000 = MPIRUN_OPTS = --allow-run-as-root --bind-to none
notes_submit_005 = submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -np $ranks perl $[top]/bind.pl $command
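
# Note: the bind.pl wrapper invoked by the submit line above is part of the
# submission and is not reproduced in this config file. As a rough sketch only
# (not the actual script), a rank-to-accelerator binding wrapper of this kind
# typically reads the MPI local rank, pins each rank to one GPU, and then
# exec's the benchmark command, e.g. in shell form:
#
#   #!/bin/bash
#   # Map each local MPI rank to one of the node's GPUs (illustrative only).
#   local_rank=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
#   ngpus=$(nvidia-smi -L | wc -l)
#   export CUDA_VISIBLE_DEVICES=$(( local_rank % ngpus ))
#   exec "$@"
#
# OMPI_COMM_WORLD_LOCAL_RANK is set by OpenMPI for each launched rank, and
# CUDA_VISIBLE_DEVICES restricts that rank to a single device. The actual
# bind.pl may also handle CPU/NUMA placement, which "--bind-to none" in
# MPIRUN_OPTS leaves to the wrapper.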