# Invocation command line:
# /home/mcolgrove/HPC2021/bin/harness/runhpc -c nvhpc_acc --reportable -T base,peak small
# output_root was not used for this run
############################################################################
teeout = yes
makeflags = -j 40
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.xml
envars = 1
license_num = 019
test_sponsor = NVIDIA Corporation
tester = NVIDIA Corporation

######################################################
# SUT Section
######################################################
#include: dgx.inc

#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.9 of
sw_compiler001 = NVIDIA HPC SDK for Linux
sw_mpi_library = OpenMPI Version 4.0.5
sw_mpi_other = None
system_class = SMP
sw_other = None

#[General notes]

#######################################################################
# End of SUT section
######################################################################

label = nv
tune = base,peak
output_format = text,html,pdf
use_submit_for_speed = 1
reportable = 1

#include: selene.inc
# ----- Begin inclusion of 'selene.inc'
############################################################################
######################################################
# Example configuration information for a
# system under test (SUT) Section
######################################################

# General SUT info
system_vendor = NVIDIA Corporation
system_name = DGX A100 (AMD EPYC 7742, Tesla A100-SXM-80GB)
node_compute_sw_accel_driver = NVIDIA UNIX x86_64 Kernel Module 470.57.02
hw_avail = Jul-2020
sw_avail = Sep-2021
prepared_by = Mathew Colgrove (mcolgrove@nvidia.com)

# Computation node info
# [Node_Description: Hardware]
node_compute_syslbl = DGX A100
node_compute_order = 1
node_compute_count = 1
node_compute_purpose = compute
node_compute_hw_vendor = NVIDIA Corporation
node_compute_hw_model = DGX A100
node_compute_hw_cpu_name = AMD EPYC 7742
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 128
node_compute_hw_ncoresperchip = 64
node_compute_hw_nthreadspercore = 2
node_compute_hw_cpu_char = Turbo Boost up to 3400 MHz
node_compute_hw_cpu_mhz = 2250
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 256 MB I+D on chip per chip
node_compute_hw_tcache001 = 16 MB shared / 4 cores
node_compute_hw_ocache = None
node_compute_hw_memory = 2 TB (32 x 64 GB 2Rx8 PC4-3200AA-R)
node_compute_hw_disk000 = OS: 2TB U.2 NVMe SSD drive
node_compute_hw_disk001 = Internal Storage: 30TB (8x 3.84TB U.2 NVMe SSD
node_compute_hw_disk002 = drives)
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla A100-SXM-80GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = NVLINK 3.0, NVSWITCH 2.0 600GB/s
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes

#[Node_Description: Software]
node_compute_hw_adapter_fs_model = None
node_compute_hw_adapter_fs_count = 0
node_compute_hw_adapter_fs_slot_type = None
node_compute_hw_adapter_fs_data_rate = None
node_compute_hw_adapter_fs_ports_used = 0
node_compute_hw_adapter_fs_interconnect = None
node_compute_hw_adapter_fs_driver = None
node_compute_hw_adapter_fs_firmware = None
node_compute_sw_os000 = Ubuntu 20.04
node_compute_sw_os001 = 4.12.14-94.41-default
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = None
node_compute_sw_state = Run level 3 (multi-user)
node_compute_sw_other = None

#[Interconnect]
interconnect_fs_syslbl = None
interconnect_fs_order = 0
interconnect_fs_purpose = N/A
interconnect_fs_hw_vendor = N/A
interconnect_fs_hw_model = N/A
interconnect_fs_hw_switch_fs_model = N/A
interconnect_fs_hw_switch_fs_count = 0
interconnect_fs_hw_switch_fs_ports = 0
interconnect_fs_hw_topo = N/A
interconnect_fs_hw_switch_fs_data_rate = 0
interconnect_fs_hw_switch_fs_firmware = 0

#######################################################################
# End of SUT section
# If this config file were to be applied to several SUTs, edits would
# be needed only ABOVE this point.
######################################################################
# ---- End inclusion of '/home/mcolgrove/HPC2021/config/selene.inc'

# Compiler Settings
default:
CC = mpicc
CXX = mpicxx
FC = mpif90

# Compiler Version Flags
CC_VERSION_OPTION = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION = -V

MPIRUN_OPTS = --bind-to none
MPIRUN_OPTS += --allow-run-as-root
submit = mpirun ${MPIRUN_OPTS} -np $ranks specperl $[top]/bindACC.pl $command
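
# For illustration: with ranks=8 (set below), the submit line above expands to
# approximately the following launch, where $command is the per-benchmark
# executable and arguments generated by the harness:
#
#   mpirun --bind-to none --allow-run-as-root -np 8 \
#       specperl $[top]/bindACC.pl $command
#
# Each rank then re-launches $command under 'numactl' via the bindACC.pl
# script listed in the submit notes below.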
#######################################################################
# Optimization
#######################################################################
default:
pmodel=ACC
ranks=8
threads=1

default=base=default:
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu
CXXPORTABILITY = --c++17

605.lbm_s=peak=default:
basepeak=1

613.soma_s=peak=default:
OPTIMIZE = -w -fast -O3 -acc=gpu -gpu=pinned

618.tealeaf_s=peak=default:
OPTIMIZE = -w -fast -Msafeptr -acc=gpu

619.clvleaf_s=peak=default:
basepeak=1

621.miniswp_s=peak=default:
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu -gpu=pinned

628.pot3d_s=peak=default:
OPTIMIZE = -w -Mstack_arrays -fast -acc=gpu

632.sph_exa_s=peak=default:
basepeak=1

634.hpgmgfv_s=peak=default:
OPTIMIZE = -w -fast -acc=gpu -gpu=pinned -static-nvidia

635.weather_s=peak=default:
basepeak=1

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
notes_comp_000 = Binaries built and run within an NVHPC SDK 21.9, CUDA 11.4, Ubuntu 20.04
notes_comp_005 = container available from NVIDIA's NGC Catalog:
notes_comp_010 = https://ngc.nvidia.com/catalog/containers/nvidia:nvhpc
notes_comp_015 =
notes_submit_000 = MPI startup command:
notes_submit_005 = mpirun command was used to start MPI jobs.
notes_submit_010 =
notes_submit_015 = Individual ranks were bound to the CPU cores on the same NUMA node as
notes_submit_020 = the GPU using 'numactl' within the following "bindACC.pl" perl script:
notes_submit_025 = ---- Start bindACC.pl ------
notes_submit_030 = my %core_map = (
notes_submit_035 =   0=>48, 1=>56, 2=>16, 3=>24, 4=>112, 5=>120, 6=>80, 7=>88
notes_submit_040 = );
notes_submit_045 = my %mem_map = (
notes_submit_050 =   0=>3, 1=>3, 2=>1, 3=>1, 4=>7, 5=>7, 6=>5, 7=>5,
notes_submit_055 = );
notes_submit_060 = my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK};
notes_submit_065 = my $mrank = $rank % 8;
notes_submit_070 = my $cplus = int($rank/8);
notes_submit_075 = my $core = $core_map{$mrank} + $cplus;
notes_submit_080 = my $mem = $mem_map{$mrank};
notes_submit_085 = my $cmd = "numactl -C $core -m $mem ";
notes_submit_090 = while (my $arg = shift) {
notes_submit_095 =   $cmd .= "$arg ";
notes_submit_100 = }
notes_submit_105 = system($cmd);
notes_submit_110 = ---- End bindACC.pl ------
notes_submit_115 =
notes_plat_000 = Detailed A100 Information from nvaccelinfo
notes_plat_005 = CUDA Driver Version: 11040
notes_plat_010 = NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.57.02
notes_plat_015 = Device Number: 0
notes_plat_020 = Device Name: NVIDIA A100-SXM-80GB
notes_plat_025 = Device Revision Number: 8.0
notes_plat_030 = Global Memory Size: 85198045184
notes_plat_035 = Number of Multiprocessors: 108
notes_plat_040 = Concurrent Copy and Execution: Yes
notes_plat_045 = Total Constant Memory: 65536
notes_plat_050 = Total Shared Memory per Block: 49152
notes_plat_055 = Registers per Block: 65536
notes_plat_060 = Warp Size: 32
notes_plat_065 = Maximum Threads per Block: 1024
notes_plat_070 = Maximum Block Dimensions: 1024, 1024, 64
notes_plat_075 = Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_plat_080 = Maximum Memory Pitch: 2147483647B
notes_plat_085 = Texture Alignment: 512B
notes_plat_090 = Clock Rate: 1410 MHz
notes_plat_095 = Execution Timeout: No
notes_plat_100 = Integrated Device: No
notes_plat_105 = Can Map Host Memory: Yes
notes_plat_110 = Compute Mode: default
notes_plat_115 = Concurrent Kernels: Yes
notes_plat_120 = ECC Enabled: Yes
notes_plat_125 = Memory Clock Rate: 1593 MHz
notes_plat_130 = Memory Bus Width: 5120 bits
notes_plat_135 = L2 Cache Size: 41943040 bytes
notes_plat_140 = Max Threads Per SMP: 2048
notes_plat_145 = Async Engines: 3
notes_plat_150 = Unified Addressing: Yes
notes_plat_155 = Managed Memory: Yes
notes_plat_160 = Concurrent Managed Memory: Yes
notes_plat_165 = Preemption Supported: Yes
notes_plat_170 = Cooperative Launch: Yes
notes_plat_175 = Multi-Device: Yes
notes_plat_180 = Default Target: cc80
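
# For illustration: a worked example of the bindACC.pl binding above, assuming
# OpenMPI sets OMPI_COMM_WORLD_LOCAL_RANK for each local rank. With ranks=8
# (one rank per GPU), each rank's command is prefixed as:
#
#   rank 0 -> numactl -C 48  -m 3 $command
#   rank 1 -> numactl -C 56  -m 3 $command
#   rank 2 -> numactl -C 16  -m 1 $command
#   rank 3 -> numactl -C 24  -m 1 $command
#   rank 4 -> numactl -C 112 -m 7 $command
#   rank 5 -> numactl -C 120 -m 7 $command
#   rank 6 -> numactl -C 80  -m 5 $command
#   rank 7 -> numactl -C 88  -m 5 $command
#
# so each rank runs on a core and memory node local to its GPU. If more than
# 8 ranks were launched, $cplus = int($rank/8) would shift each extra rank to
# the next core in the same NUMA node (e.g. rank 8 -> core 49, memory node 3).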