# Invocation command line: # /home/HPC2021F1.0.1/bin/harness/runhpc --reportable --configfile nvhpc_final.cfg --tune base --define ucx --define model=acc --pmodel ACC --threads 1 --ranks 4 --size ref --iterations 3 --nopower --runmode speed --tune base --size ref tiny # output_root was not used for this run ############################################################################ ###################################################################### # Example configuration file for the NVIDIA HPC SDK Compilers # # Before using this config file, copy it to a new config (such as nvhpc.cfg) and edit as needed # # Defines: "model" => "mpi", "acc", "accmc", "omp", "tgt", "tgtgpu"; default "mpi" # "label" => ext base label, default "nv_final" # # MPI-only Command: # runhpc -c nvhpc --reportable -T base --define model=mpi --ranks=40 tiny # # OpenACC offload to GPU Command: # runhpc -c nvhpc --reportable -T base --define model=acc --ranks=4 tiny # Add "--define ucx" if using OpenMPI 4 with UCX support. # # OpenACC offload to Multicore CPU Command: # runhpc -c nvhpc --reportable -T base --define model=accmc --ranks=4 tiny # # OpenMP Command: # runhpc -c nvhpc --reportable -T base --define model=omp --ranks=1 --threads=40 tiny # # OpenMP Target Offload to Host Command: # runhpc -c nvhpc --reportable -T base --define model=tgt --ranks=1 --threads=40 tiny # # OpenMP Target Offload to GPU Command: # runhpc -c nvhpc --reportable -T base --define model=tgtgpu --ranks=4 tiny # ####################################################################### %ifndef %{label} # If label is not set, use nv_final % define label nv_final %endif %ifndef %{model} # If model is not set, use mpi % define model mpi pmodel = MPI %endif teeout = yes # Adjust the number of make jobs to use here makeflags=-j 40 flagsurl000=http://www.spec.org/hpc2021/flags/nv2021_flags.xml # Tester Information license_num = 28 test_sponsor = Lenovo Global Technology tester = Lenovo Global Technology hw_avail = Aug-2021 sw_avail = Aug-2021 
prepared_by = Lenovo Global Technology system_vendor = Lenovo Global Technology system_name = ThinkSystem SD650-N V2 (Intel Xeon Platinum 8368Q, Tesla A100-SXM-40GB) node_fileserver_syslbl = ThinkSystem SD650-N V2 node_fileserver_sw_state = Multi-User, run level 3 node_fileserver_sw_sharedfile = N/A node_fileserver_sw_other = None node_fileserver_sw_os = Red Hat Enterprise Linux Server release 8.3 node_fileserver_sw_localfile = xfs node_fileserver_sw_accel_driver = N/A node_fileserver_purpose = Fileserver node_fileserver_order = 1 node_fileserver_hw_vendor = Lenovo Global Technology node_fileserver_hw_tcache = 57 MB I+D on chip per chip node_fileserver_hw_scache = 1280 KB I+D on chip per core node_fileserver_hw_pcache = 32 KB I + 48 KB D on chip per core node_fileserver_hw_other = None node_fileserver_hw_ocache = None node_fileserver_hw_nthreadspercore = 1 node_fileserver_hw_ncpuorder = 2 chips node_fileserver_hw_ncoresperchip = 38 node_fileserver_hw_ncores = 76 node_fileserver_hw_nchips = 2 node_fileserver_hw_model = ThinkSystem SD650-N V2 node_fileserver_hw_memory = 512 GB (16 x 32 GB 2Rx8 PC4-3200A-R) node_fileserver_hw_disk = 1 x 960 GB NVME 2.5" SSD node_fileserver_hw_cpu_name = Intel Xeon Platinum 8368Q node_fileserver_hw_cpu_mhz = 2600 node_fileserver_hw_cpu_char = Turbo up to 3.7 GHz node_fileserver_hw_adapter_fs_slot_type = PCI-Express 4.0 x16 node_fileserver_hw_adapter_fs_ports_used = 1 node_fileserver_hw_adapter_fs_model = Mellanox ConnectX-6 HDR node_fileserver_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-6 HDR node_fileserver_hw_adapter_fs_firmware = 20.28.1002 node_fileserver_hw_adapter_fs_driver = 5.1-2.3.7 node_fileserver_hw_adapter_fs_data_rate = 200 Gb/s node_fileserver_hw_adapter_fs_count = 1 node_fileserver_hw_accel_vendor = Nvidia node_fileserver_hw_accel_type = GPU node_fileserver_hw_accel_model = Tesla A100 SXM4 40GB node_fileserver_hw_accel_ecc = Yes node_fileserver_hw_accel_desc = Nvidia Tesla A100 SXM4 40GB 
node_fileserver_hw_accel_count = 4 node_fileserver_hw_accel_connect = Nvidia Tesla A100 SXM4 40GB node_fileserver_count = 1 node_compute_syslbl = ThinkSystem SD650-N V2 node_compute_sw_state = Multi-user, run level 3 node_compute_sw_sharedfile = NFS node_compute_sw_other = None node_compute_sw_localfile = xfs node_compute_sw_accel_driver = 460.32.03 node_compute_purpose = compute node_compute_order = 1 node_compute_hw_vendor = Lenovo Global Technology node_compute_hw_tcache = 57 MB I+D on chip per chip node_compute_hw_scache = 1280 KB I+D on chip per core node_compute_hw_pcache = 32 KB I + 48 KB D on chip per core node_compute_hw_other = None node_compute_hw_ocache = None node_compute_hw_nthreadspercore = 1 node_compute_hw_ncpuorder = 2 chips node_compute_hw_ncoresperchip = 38 node_compute_hw_ncores = 76 node_compute_hw_nchips = 2 node_compute_hw_model = ThinkSystem SD650-N V2 node_compute_hw_memory = 512 GB (16 x 32 GB 2Rx8 PC4-3200A-R) node_compute_hw_disk = 1 x 480 GB 2.5" SSD node_compute_hw_cpu_name = Intel Xeon Platinum 8368Q node_compute_hw_cpu_mhz = 2600 node_compute_hw_cpu_char = Turbo up to 3.7 GHz node_compute_hw_adapter_fs_slot_type = PCI-Express 4.0 x16 node_compute_hw_adapter_fs_ports_used = 1 node_compute_hw_adapter_fs_model = Mellanox ConnectX-6 HDR node_compute_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-6 HDR node_compute_hw_adapter_fs_firmware = 20.28.1002 node_compute_hw_adapter_fs_driver = 5.1-2.3.7 node_compute_hw_adapter_fs_data_rate = 200 Gb/s node_compute_hw_adapter_fs_count = 1 node_compute_hw_accel_vendor = Nvidia Corporation node_compute_hw_accel_type = GPU node_compute_hw_accel_model = Tesla A100 SXM4 40GB node_compute_hw_accel_ecc = Yes node_compute_hw_accel_desc = Nvidia Tesla A100 SXM4 40GB node_compute_hw_accel_count = 4 node_compute_hw_accel_connect = NVLink node_compute_count = 1 interconnect_fs_syslbl = Nvidia Mellanox ConnectX-6 HDR interconnect_fs_purpose = MPI Traffic, NFS Access interconnect_fs_order = 0 
interconnect_fs_label = Nvidia Mellanox ConnectX-6 HDR interconnect_fs_hw_vendor = Nvidia interconnect_fs_hw_topo = Direct Connect interconnect_fs_hw_switch_fs_ports = 0 interconnect_fs_hw_switch_fs_model = N/A interconnect_fs_hw_switch_fs_firmware = N/A interconnect_fs_hw_switch_fs_data_rate = N/A interconnect_fs_hw_switch_fs_count = 0 interconnect_fs_hw_model = Nvidia Mellanox ConnectX-6 HDR ###################################################### # SUT Section ###################################################### #include: Example_SUT.inc #[Software] system_class = Homogenous Cluster sw_compiler = Nvidia HPC SDK 21.5 sw_mpi_library = Open MPI 4.0.5 sw_mpi_other = -- sw_other = -- #[General notes] ####################################################################### # End of SUT section ###################################################################### ###################################################################### # The header section of the config file. Must appear # before any instances of "section markers" (see below) # # ext = how the binaries you generated will be identified # tune = specify "base" or "peak" or "all" label = %{label}_%{model} tune = base output_format = text use_submit_for_speed = 1 # Setting 'strict_rundir_verify=0' will allow direct source code modifications # but will disable the ability to create reportable results. 
# May be useful for academic and research purposes # strict_rundir_verify = 0 # Compiler Settings default: CC = mpicc CXX = mpicxx FC = mpif90 # Compiler Version Flags CC_VERSION_OPTION = -V CXX_VERSION_OPTION = -V FC_VERSION_OPTION = -V # MPI options and binding environment, dependent upon Model being run # Adjust to match your system %ifdef %{ucx} # if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI # without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs preENV_UCX_MEMTYPE_CACHE=n preENV_UCX_TLS=self,shm,cuda_copy %endif MPIRUN_OPTS = --bind-to none # Note that SPH_EXA is known to hang when using multiple nodes with some versions of UCX, # to work around, add the following setting: #MPIRUN_OPTS += --mca topo basic %if %{model} eq 'acc' submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks perl $[top]/bind_Toomie.pl $command #submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks $command %endif %if %{model} eq 'tgtgpu' submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks perl $[top]/bind_Toomie.pl $command #submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -host localhost:4 -np $ranks $command %endif %if %{model} eq 'mpi' submit = mpirun --allow-run-as-root -x UCX=1 -x UCX_MEMTYPE_CACHE=n -host 192.168.99.171:76 -np $ranks $command %endif #submit = mpirun ${MPIRUN_OPTS} -np $ranks $command ####################################################################### # Optimization # Note that SPEC baseline rules require that all uses of a given compiler # use the same flags in the same order. 
See the SPEChpc Run Rules # for more details # http://www.spec.org/hpc2021/Docs/runrules.html # # OPTIMIZE = flags applicable to all compilers # FOPTIMIZE = flags applicable to the Fortran compiler # COPTIMIZE = flags applicable to the C compiler # CXXOPTIMIZE = flags applicable to the C++ compiler # # See your compiler manual for information on the flags available # for your compiler # Compiler flags applied to all models default=base=default: OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast CXXPORTABILITY = --c++17 # OpenACC (GPU) flags %if %{model} eq 'acc' pmodel=ACC OPTIMIZE += -acc=gpu -Minfo=accel -DSPEC_ACCEL_AWARE_MPI %endif # OpenACC (Multicore CPU) flags %if %{model} eq 'accmc' pmodel=ACC OPTIMIZE += -acc=multicore -Minfo=accel 521.miniswp_t: PORTABILITY+= -DSPEC_USE_HOST_THREADS=1 %endif # OpenMP Threaded (CPU) flags %if %{model} eq 'omp' pmodel=OMP OPTIMIZE += -mp -Minfo=mp PORTABILITY += -D_OPENMP=201411 %endif # OpenMP Targeting host flags %if %{model} eq 'tgt' pmodel=TGT OPTIMIZE += -mp -Minfo=mp # Note that NVHPC is in the process of adding OpenMP # array reduction support so this option may be removed # in the future 513.soma_t: PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE 521.miniswp_t: PORTABILITY+= -DSPEC_USE_HOST_THREADS=1 %endif # OpenMP Targeting GPU flags %if %{model} eq 'tgtgpu' pmodel=TGT OPTIMIZE += -mp=gpu -Minfo=mp # Note that NVHPC is in the process of adding OpenMP # array reduction support so this option may be removed # in the future 513.soma_t: PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE %endif 613.soma_s=default=default: %if %{model} eq 'omp' PORTABILITY += -D_OPENMP=201411 %endif # No peak flags set, so make peak use the same flags as base default=peak=default: # The following section was added automatically, and contains settings that # did not appear in the original configuration file, but were added to the # raw file after the run. 
default: notes_000 =Environment variables set by runhpc before the start of the run: notes_005 =UCX_MEMTYPE_CACHE = "n" notes_010 =UCX_TLS = "self,shm,cuda_copy" notes_015 = node_compute_sw_os000 = Red Hat Enterprise Linux Server release 8.3, node_compute_sw_os001 = Kernel 4.18.0-193.el8.x86_64 notes_submit_000 =Individual Ranks were bound to the CPU cores on the same NUMA node as notes_submit_005 =the GPU using 'numactl' within the following "bind.pl" perl script: notes_submit_010 =---- Start bind.pl ------ notes_submit_015 =my %bind; notes_submit_020 =$bind{0} = "1-3"; notes_submit_025 =$bind{1} = "4-7"; notes_submit_030 =$bind{2} = "8-10"; notes_submit_035 =$bind{3} = "11-14"; notes_submit_040 =$bind{4} = "41-43"; notes_submit_045 =$bind{5} = "44-47"; notes_submit_050 =$bind{6} = "61-63"; notes_submit_055 =$bind{7} = "64-67"; notes_submit_060 =my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK}; notes_submit_065 =my $cmd = "taskset -c $bind{$rank} "; notes_submit_070 =while (my $arg = shift) { notes_submit_075 = $cmd .= "$arg "; notes_submit_080 =} notes_submit_085 =my $rc = system($cmd); notes_submit_090 =exit($rc); notes_submit_095 =---- End bind.pl ------ notes_submit_100 = The config file option 'submit' was used. notes_submit_105 = submit = mpirun ${MPIRUN_OPTS} --allow-run-as-root --oversubscribe notes_submit_110 = -host 192.168.99.171:4,192.168.99.172:4 -x UCX_MEMTYPE_CACHE=n notes_submit_115 = -mca coll_hcoll_enable 1 -x HCOLL_MAIN_IB=mlx5_0:1 -mca pml ucx notes_submit_120 = -x UCX_TLS=sm,dc,rc,knem,cuda_copy,cuda_ipc -npernode 4 --map-by core -np $ranks