# Invocation command line:
# /home/rlieberm/hpc2021-1.0.2/bin/harness/runhpc -c amdgpu_clang.cfg -l --reportable -n 3 -T base --define model=mpi --threads=1 --ranks=512 -i ref tiny --rebuild
# output_root was not used for this run
############################################################################
######################################################################
# Example configuration file for AOMP LLVM/Clang compiler.
#
# Defines: "model"   => "mpi", "omp", "omp_target", "omp_host_target"
#                       default "mpi"
#          "label"   => ext base label,
#                       default "clang"
#          "gputype" => "host", "x86", "gfx900", "gfx906", "gfx908"
#                       default "host"
#
# Example runhpc commands
#
# MPI-only Command:
# runhpc -c amdgpu_clang.cfg -I -l -n 1 -T base --define model=mpi --threads=1 --ranks=16 -i test 628
#
# MPI+OpenMP Command:
# runhpc -c amdgpu_clang.cfg -I -l -n 1 -T base --define model=omp --threads=16 --ranks=1 -i test 628
#
# MPI+OpenMP target offload Command:
# runhpc -c amdgpu_clang.cfg -I -l -n 1 -T base --define model=omp_target --define gputype=gfx908 --threads=1 --ranks=4 -i test 628
#
# MPI+OpenMP target offload to host Command:
# runhpc -c amdgpu_clang.cfg -I -l -n 1 -T base --define model=omp_host_target --define gputype=x86 --threads=16 --ranks=1 -i test 628
#
#######################################################################

%ifndef %{label}                # IF label is not set use clang
%   define label clang
%endif

%ifndef %{model}                # IF model is not set use mpi
%   define model mpi
%endif

%ifndef %{gputype}
%   define gputype host
%endif

######################################################################
# The header section of the config file.  Must appear
# before any instances of "section markers" (see below)
#
# ext  = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label                = %{label}_%{model}_%{gputype}
tune                 = base
output_format        = text
use_submit_for_speed = 1

makeflags = -j 16

#strict_rundir_verify=0

#include: desc_amdgpu.inc
# ----- Begin inclusion of 'desc_amdgpu.inc'
############################################################################
# Tester Information
license_num  = 0017
test_sponsor = Advanced Micro Devices
tester       = Advanced Micro Devices

######################################################
# SUT Section
######################################################
# General SUT info
system_vendor = Advanced Micro Devices
system_name   = Dallas Milan Cluster: Gigabyte H262-Z63 (AMD EPYC 7763)
hw_avail      = Apr-2021
sw_avail      = Aug-2021

# Computation node info
# [Node_Description: Hardware]
node_compute_syslbl             = Gigabyte H262-Z63
node_compute_order              = 1
node_compute_count              = 4
node_compute_purpose            = compute
node_compute_hw_vendor          = Gigabyte
node_compute_hw_model           = Gigabyte H262-Z63
node_compute_hw_cpu_name        = AMD EPYC 7763
node_compute_hw_ncpuorder       = 1,2 chips
node_compute_hw_nchips          = 2
node_compute_hw_ncores          = 128
node_compute_hw_ncoresperchip   = 64
node_compute_hw_nthreadspercore = 1
node_compute_hw_cpu_char        = Max Boost Clock disabled
node_compute_hw_cpu_mhz         = 2450
node_compute_hw_pcache          = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache          = 512 KB I+D on chip per core
node_compute_hw_tcache000       = 256 MB I+D on chip per chip
node_compute_hw_tcache001       = 32 MB shared / 8 cores
node_compute_hw_ocache          = None
node_compute_hw_memory          = 512 GB (16 x 32 GB 2Rx4 PC4-3200AA-R)
node_compute_hw_disk            = Intel SSD 520 Series 240GB, 2.5in SATA 6Gb/s
node_compute_hw_other           = None

#[Node_Description: Accelerator]

#[Node_Description: Software]
node_compute_hw_adapter_fs_model000     = ConnectX-6 Dual port, model number:
node_compute_hw_adapter_fs_model001     = MCX653106A
node_compute_hw_adapter_fs_count        = 0
node_compute_hw_adapter_fs_slot_type    = None
node_compute_hw_adapter_fs_data_rate    = None
node_compute_hw_adapter_fs_ports_used   = 0
node_compute_hw_adapter_fs_interconnect = None
node_compute_hw_adapter_fs_driver       = None
node_compute_hw_adapter_fs_firmware     = None
node_compute_sw_os000      = CentOS Linux release 8.3.2011
node_compute_sw_os001      = Kernel 4.18.0-193 [native to CentOS 8.3]
node_compute_sw_localfile  = xfs
node_compute_sw_sharedfile = NFS share
node_compute_sw_state      = Multi-user, run level 3
node_compute_sw_other      = None

#[Fileserver]

#[Interconnect]
interconnect_fs_syslbl                 = Mellanox
interconnect_fs_order                  = 0
interconnect_fs_purpose                = MPI Traffic
interconnect_fs_hw_vendor              = Mellanox
interconnect_fs_hw_model000            = NVIDIA MCX653106A-EFAT ConnectX-6 VPI Adapter
interconnect_fs_hw_model001            = Card HDR100/EDR/100GbE
interconnect_fs_hw_switch_fs_model000  = MLNX_OFED_LINUX-5.2.1.0 (OFED-5.2.1.0)
interconnect_fs_hw_switch_fs_model001  = Switch: 27_2008_2202-MQM8790-HS2X_Ax
interconnect_fs_hw_switch_fs_count     = 2
interconnect_fs_hw_switch_fs_ports     = 40
interconnect_fs_hw_topo                = non-blocking fat tree
interconnect_fs_hw_switch_fs_data_rate = InfiniBand HDR 100 Gb/s
interconnect_fs_hw_switch_fs_firmware  = HCA: 20.29.1016

#[Software]
sw_compiler001 = C/C++/Fortran: Version 13.0-0
sw_compiler002 = MLSE ROCm 4.3.0 Compilers
sw_compiler003 = Compiler available by installing ROCm 4.3 or
sw_compiler004 = getting
sw_compiler005 = https://repo.radeon.com/rocm/apt/4.3/pool/main/l/llvm-amdgpu/llvm-amdgpu_13.0.0.21295.40300_amd64.deb
sw_compiler006 = https://repo.radeon.com/rocm/apt/4.3/pool/main/o/openmp-extras4.3.0/openmp-extras4.3.0_12.43.0.40300-52_amd64.deb
sw_mpi_library = OpenMPI Version 4.0.5
sw_mpi_other   = None
system_class   = Homogenous Cluster
sw_other       = None

#[General notes]

#######################################################################
# End of SUT section
# If this config file were to be applied to several SUTs, edits would
# be needed only ABOVE this point.
######################################################################
# ---- End inclusion of '/home/rlieberm/hpc2021-1.0.2/config/desc_amdgpu.inc'

flagsurl000 = http://www.spec.org/hpc2021/flags/amd2021_flags.xml

default:
CC  = mpicc
CXX = mpicxx
FC  = mpif90
sw_compiler000 = LLVM/Clang 13.0

CC_VERSION_OPTION  = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION  = --version

#preENV_OMP_PROC_BIND=true
MPIRUN_OPTS = --bind-to none   #socket # core
submit = mpirun ${MPIRUN_OPTS} -np $ranks $command

#######################################################################
default=base=default:
OPTIMIZE         = -O3
COPTIMIZE        =
CXXOPTIMIZE      =
PORTABILITY      = -I${AOMP}/include
PORTABILITY_LIBS = -lm
FPPPORTABILITY  += -DSPEC_USE_MPIFH -I${MPI}/include/

%if %{model} eq 'mpi'
pmodel=MPI
MPIRUN_OPTS += --mca topo basic
submit = mpirun ${MPIRUN_OPTS} -np $ranks $command
%endif

%if %{model} eq 'omp'
pmodel=OMP
OPTIMIZE += -fopenmp
MPIRUN_OPTS  = --bind-to core
MPIRUN_OPTS += --map-by ppr:1:numa:pe=16   # 16 cores per numa
#MPIRUN_OPTS += --map-by ppr:1:numa:pe=64  # 64 cores per numa
submit = mpirun ${MPIRUN_OPTS} -np $ranks $command
%endif

%if %{model} eq 'omp_target'
pmodel=TGT
OPTIMIZE += -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=%{gputype}
513.soma_t,613.soma_s:
PORTABILITY += -DSPEC_NO_VAR_ARRAY_REDUCE
%endif

%if %{model} eq 'omp_host_target'
pmodel=TGT
OPTIMIZE += -fopenmp -fopenmp-targets=x86_64-pc-linux-gnu -Xopenmp-target=x86_64-pc-linux-gnu -mcpu=%{gputype}
521.miniswp_t,621.miniswp_s:
PORTABILITY += -DSPEC_USE_HOST_THREADS
%endif

# HIP is not a supported hpc2021 model; it is included here only for experimentation.
%if %{model} eq 'hip'
OPTIMIZE += -DSPEC_HIP -DSPEC_CUDA --amdgpu-target=%{gputype}
%endif

# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
notes_submit_000 = MPI startup command:
notes_submit_005 =   mpirun command was used to start MPI jobs.
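
# Illustration: with the macro defaults above and the reportable invocation at
# the top of this file (--define model=mpi --ranks=512), the submit line should
# expand to roughly the command sketched below for each benchmark. The binary
# name is a placeholder, not a value taken from this result.
#
#   mpirun --bind-to none --mca topo basic -np 512 <benchmark_binary>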
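
# Illustration: for a run with --define model=omp_target --define gputype=gfx908,
# the Fortran compile lines produced from this config should look roughly like
# the sketch below. The source file name is a placeholder, and per-benchmark
# portability flags may differ.
#
#   mpif90 -O3 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
#          -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 \
#          -I${AOMP}/include -DSPEC_USE_MPIFH -I${MPI}/include/ -c <source>.F90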