[chibi@centos8 ~]$ ls
NVIDIA_CUDA-11.0_Samples  テンプレート  ドキュメント  音楽  公開
ダウンロード              デスクトップ  ビデオ        画像
[chibi@centos8 ~]$ cd NVIDIA_CUDA-11.0_Samples
[chibi@centos8 NVIDIA_CUDA-11.0_Samples]$ ls
0_Simple     2_Graphics  4_Finance      6_Advanced       EULA.txt  Makefile
1_Utilities  3_Imaging   5_Simulations  7_CUDALibraries  LICENSE   common
[chibi@centos8 NVIDIA_CUDA-11.0_Samples]$ cd 0_Simple
[chibi@centos8 0_Simple]$ ls
UnifiedMemoryStreams          simpleCooperativeGroups
asyncAPI                      simpleCubemapTexture
bf16TensorCoreGemm            simpleCudaGraphs
binaryPartitionCG             simpleDrvRuntime
cdpSimplePrint                simpleIPC
cdpSimpleQuicksort            simpleLayeredTexture
clock                         simpleMPI
clock_nvrtc                   simpleMultiCopy
cppIntegration                simpleMultiGPU
cppOverload                   simpleOccupancy
cudaNvSci                     simpleP2P
cudaOpenMP                    simplePitchLinearTexture
cudaTensorCoreGemm            simplePrintf
dmmaTensorCoreGemm            simpleSeparateCompilation
fp16ScalarProduct             simpleStreams
globalToShmemAsyncCopy        simpleSurfaceWrite
immaTensorCoreGemm            simpleTemplates
inlinePTX                     simpleTemplates_nvrtc
inlinePTX_nvrtc               simpleTexture
matrixMul                     simpleTextureDrv
matrixMulCUBLAS               simpleVoteIntrinsics
matrixMulDrv                  simpleVoteIntrinsics_nvrtc
matrixMul_nvrtc               simpleZeroCopy
memMapIPCDrv                  systemWideAtomics
simpleAWBarrier               template
simpleAssert                  tf32TensorCoreGemm
simpleAssert_nvrtc            vectorAdd
simpleAtomicIntrinsics        vectorAddDrv
simpleAtomicIntrinsics_nvrtc  vectorAddMMAP
simpleAttributes              vectorAdd_nvrtc
simpleCallback
[chibi@centos8 0_Simple]$ cd simpleP2P
[chibi@centos8 simpleP2P]$ ls
Makefile  NsightEclipse.xml  readme.txt  simpleP2P.cu
[chibi@centos8 simpleP2P]$ make
/usr/local/cuda-11.0/bin/nvcc -ccbin g++ -I../../common/inc  -m64    -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o simpleP2P.o -c simpleP2P.cu
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
/usr/local/cuda-11.0/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o simpleP2P simpleP2P.o
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
mkdir -p ../../bin/x86_64/linux/release
cp simpleP2P ../../bin/x86_64/linux/release
[chibi@centos8 simpleP2P]$ ./simpleP2P
[./simpleP2P] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 2

Checking GPU(s) for support of peer to peer memory access...
> Peer access from TITAN RTX (GPU0) -> TITAN RTX (GPU1) : Yes
> Peer access from TITAN RTX (GPU1) -> TITAN RTX (GPU0) : Yes
Enabling peer access between GPU0 and GPU1...
Allocating buffers (64MB on GPU0, GPU1 and CPU Host)...
Creating event handles...
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: 43.54GB/s
Preparing host buffer and memcpy to GPU0...
Run kernel on GPU1, taking source data from GPU0 and writing to GPU1...
Run kernel on GPU0, taking source data from GPU1 and writing to GPU0...
Copy data back to host from GPU0 and verify results...
Disabling peer access...
Shutting down...
Test passed
[chibi@centos8 simpleP2P]$ cd ~/NVIDIA_CUDA-11.0_Samples/1_Utilities
[chibi@centos8 1_Utilities]$ ls
UnifiedMemoryPerf  deviceQuery     p2pBandwidthLatencyTest
bandwidthTest      deviceQueryDrv  topologyQuery
[chibi@centos8 1_Utilities]$ cd p2pBandwidthLatencyTest
[chibi@centos8 p2pBandwidthLatencyTest]$ make
/usr/local/cuda-11.0/bin/nvcc -ccbin g++ -I../../common/inc  -m64    -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o p2pBandwidthLatencyTest.o -c p2pBandwidthLatencyTest.cu
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
/usr/local/cuda-11.0/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
mkdir -p ../../bin/x86_64/linux/release
cp p2pBandwidthLatencyTest ../../bin/x86_64/linux/release
[chibi@centos8 p2pBandwidthLatencyTest]$ ./p2pBandwidthLatencyTest
[P2P (Peer-to-Peer) GPU Bandwidth Latency Test]
Device: 0, TITAN RTX, pciBusID: 3, pciDeviceID: 0, pciDomainID:0
Device: 1, TITAN RTX, pciBusID: 21, pciDeviceID: 0, pciDomainID:0
Device=0 CAN Access Peer Device=1
Device=1 CAN Access Peer Device=0

***NOTE: In case a device doesn't have P2P access to other one, it falls back to normal memcopy procedure.
So you can see lesser Bandwidth (GB/s) and unstable Latency (us) in those cases.

P2P Connectivity Matrix
     D\D     0     1
     0       1     1
     1       1     1
Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)
   D\D     0      1
     0 549.98  11.45
     1  11.42 553.29
Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)
   D\D     0      1
     0 552.71  47.00
     1  46.93 553.29
Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)
   D\D     0      1
     0 553.29  17.62
     1  20.09 557.48
Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)
   D\D     0      1
     0 555.65  93.54
     1  93.74 552.51
P2P=Disabled Latency Matrix (us)
   GPU     0      1
     0   1.88  13.75
     1  13.41   1.84

   CPU     0      1
     0   3.75  10.35
     1  12.02   3.94
P2P=Enabled Latency (P2P Writes) Matrix (us)
   GPU     0      1
     0   1.87   2.10
     1   2.05   1.84

   CPU     0      1
     0   4.10   3.55
     1   3.58   4.09

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
[chibi@centos8 p2pBandwidthLatencyTest]$ cd ~/NVIDIA_CUDA-11.0_Samples/1_Utilities/bandwidthTest
[chibi@centos8 bandwidthTest]$ make
/usr/local/cuda-11.0/bin/nvcc -ccbin g++ -I../../common/inc  -m64    -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o bandwidthTest.o -c bandwidthTest.cu
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
/usr/local/cuda-11.0/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o bandwidthTest bandwidthTest.o
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
mkdir -p ../../bin/x86_64/linux/release
cp bandwidthTest ../../bin/x86_64/linux/release
[chibi@centos8 bandwidthTest]$ ./bandwidthTest
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: TITAN RTX
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     11.9

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     12.4

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     540.3

Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
[chibi@centos8 bandwidthTest]$ cd ~/NVIDIA_CUDA-11.0_Samples/1_Utilities/deviceQuery
[chibi@centos8 deviceQuery]$ make
/usr/local/cuda-11.0/bin/nvcc -ccbin g++ -I../../common/inc  -m64    -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o deviceQuery.o -c deviceQuery.cpp
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
/usr/local/cuda-11.0/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -o deviceQuery deviceQuery.o
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
mkdir -p ../../bin/x86_64/linux/release
cp deviceQuery ../../bin/x86_64/linux/release
[chibi@centos8 deviceQuery]$ ./deviceQuery
./deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 2 CUDA Capable device(s)

Device 0: "TITAN RTX"
  CUDA Driver Version / Runtime Version          11.0 / 11.0
  CUDA Capability Major/Minor version number:    7.5
  Total amount of global memory:                 24220 MBytes (25396445184 bytes)
  (72) Multiprocessors, ( 64) CUDA Cores/MP:     4608 CUDA Cores
  GPU Max Clock rate:                            1770 MHz (1.77 GHz)
  Memory Clock rate:                             7001 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 6291456 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1024
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 3 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 3 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

Device 1: "TITAN RTX"
  CUDA Driver Version / Runtime Version          11.0 / 11.0
  CUDA Capability Major/Minor version number:    7.5
  Total amount of global memory:                 24220 MBytes (25396838400 bytes)
  (72) Multiprocessors, ( 64) CUDA Cores/MP:     4608 CUDA Cores
  GPU Max Clock rate:                            1770 MHz (1.77 GHz)
  Memory Clock rate:                             7001 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 6291456 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1024
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 3 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 33 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
> Peer access from TITAN RTX (GPU0) -> TITAN RTX (GPU1) : Yes
> Peer access from TITAN RTX (GPU1) -> TITAN RTX (GPU0) : Yes

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.0, CUDA Runtime Version = 11.0, NumDevs = 2
Result = PASS
[chibi@centos8 deviceQuery]$ cat /etc/redhat-release
CentOS Linux release 8.2.2004 (Core)
[chibi@centos8 deviceQuery]$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_May__6_19:09:25_PDT_2020
Cuda compilation tools, release 11.0, V11.0.167
Build cuda_11.0_bu.TC445_37.28358933_0
[chibi@centos8 deviceQuery]$ lscpu
アーキテクチャ:                      x86_64
CPU 操作モード:                      32-bit, 64-bit
バイト順序:                          Little Endian
CPU:                                 64
オンラインになっている CPU のリスト: 0-63
コアあたりのスレッド数:              2
ソケットあたりのコア数:              32
ソケット数:                          1
NUMA ノード数:                       4
ベンダー ID:                         AuthenticAMD
CPU ファミリー:                      23
モデル:                              49
モデル名:                            AMD EPYC 7502P 32-Core Processor
ステッピング:                        0
CPU MHz:                             1733.426
CPU 最大 MHz:                        2500.0000
CPU 最小 MHz:                        1500.0000
BogoMIPS:                            4990.72
仮想化:                              AMD-V
L1d キャッシュ:                      32K
L1i キャッシュ:                      32K
L2 キャッシュ:                       512K
L3 キャッシュ:                       16384K
NUMA ノード 0 CPU:                   0-7,32-39
NUMA ノード 1 CPU:                   8-15,40-47
NUMA ノード 2 CPU:                   16-23,48-55
NUMA ノード 3 CPU:                   24-31,56-63
フラグ:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip rdpid overflow_recov succor smca
[chibi@centos8 deviceQuery]$ lsmem
RANGE                                  SIZE  STATE REMOVABLE     BLOCK
0x0000000000000000-0x0000000007ffffff  128M online        no         0
0x0000000008000000-0x000000002fffffff  640M online       yes       1-5
0x0000000030000000-0x0000000037ffffff  128M online        no         6
0x0000000038000000-0x000000006fffffff  896M online       yes      7-13
0x0000000070000000-0x0000000077ffffff  128M online        no        14
0x0000000078000000-0x000000007fffffff  128M online       yes        15
0x0000000080000000-0x000000009fffffff  512M online        no     16-19
0x0000000100000000-0x0000000107ffffff  128M online        no        32
0x0000000108000000-0x00000003efffffff 11.6G online       yes    33-125
0x00000003f0000000-0x00000003f7ffffff  128M online        no       126
0x00000003f8000000-0x00000007c7ffffff 15.3G online       yes   127-248
0x00000007c8000000-0x0000000867ffffff  2.5G online        no   249-268
0x0000000868000000-0x0000001027ffffff   31G online       yes   269-516
0x0000001028000000-0x0000001067ffffff    1G online        no   517-524
0x0000001068000000-0x000000181fffffff 30.9G online       yes   525-771
0x0000001820000000-0x0000001867ffffff  1.1G online        no   772-780
0x0000001868000000-0x0000002027ffffff   31G online       yes  781-1028
0x0000002028000000-0x000000205fffffff  896M online        no 1029-1035

メモリブロックサイズ  128M
Total online memory:     128G
Total offline memory:      0B
[chibi@centos8 deviceQuery]$ lstopo
Machine (126GB total) + Package L#0
  NUMANode L#0 (P#0 31GB)
    L3 L#0 (16MB)
      L2 L#0 (512KB) + L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
        PU L#0 (P#0)
        PU L#1 (P#32)
      L2 L#1 (512KB) + L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
        PU L#2 (P#1)
        PU L#3 (P#33)
      L2 L#2 (512KB) + L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2
        PU L#4 (P#2)
        PU L#5 (P#34)
      L2 L#3 (512KB) + L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3
        PU L#6 (P#3)
        PU L#7 (P#35)
    L3 L#1 (16MB)
      L2 L#4 (512KB) + L1d L#4 (32KB) + L1i L#4 (32KB) + Core L#4
        PU L#8 (P#4)
        PU L#9 (P#36)
      L2 L#5 (512KB) + L1d L#5 (32KB) + L1i L#5 (32KB) + Core L#5
        PU L#10 (P#5)
        PU L#11 (P#37)
      L2 L#6 (512KB) + L1d L#6 (32KB) + L1i L#6 (32KB) + Core L#6
        PU L#12 (P#6)
        PU L#13 (P#38)
      L2 L#7 (512KB) + L1d L#7 (32KB) + L1i L#7 (32KB) + Core L#7
        PU L#14 (P#7)
        PU L#15 (P#39)
  NUMANode L#1 (P#1 31GB)
    L3 L#2 (16MB)
      L2 L#8 (512KB) + L1d L#8 (32KB) + L1i L#8 (32KB) + Core L#8
        PU L#16 (P#8)
        PU L#17 (P#40)
      L2 L#9 (512KB) + L1d L#9 (32KB) + L1i L#9 (32KB) + Core L#9
        PU L#18 (P#9)
        PU L#19 (P#41)
      L2 L#10 (512KB) + L1d L#10 (32KB) + L1i L#10 (32KB) + Core L#10
        PU L#20 (P#10)
        PU L#21 (P#42)
      L2 L#11 (512KB) + L1d L#11 (32KB) + L1i L#11 (32KB) + Core L#11
        PU L#22 (P#11)
        PU L#23 (P#43)
    L3 L#3 (16MB)
      L2 L#12 (512KB) + L1d L#12 (32KB) + L1i L#12 (32KB) + Core L#12
        PU L#24 (P#12)
        PU L#25 (P#44)
      L2 L#13 (512KB) + L1d L#13 (32KB) + L1i L#13 (32KB) + Core L#13
        PU L#26 (P#13)
        PU L#27 (P#45)
      L2 L#14 (512KB) + L1d L#14 (32KB) + L1i L#14 (32KB) + Core L#14
        PU L#28 (P#14)
        PU L#29 (P#46)
      L2 L#15 (512KB) + L1d L#15 (32KB) + L1i L#15 (32KB) + Core L#15
        PU L#30 (P#15)
        PU L#31 (P#47)
    HostBridge L#0
      PCIBridge
        PCI 1022:7901
          Block(Disk) L#0 "sda"
      PCIBridge
        PCI 1022:7901
  NUMANode L#2 (P#2 31GB)
    L3 L#4 (16MB)
      L2 L#16 (512KB) + L1d L#16 (32KB) + L1i L#16 (32KB) + Core L#16
        PU L#32 (P#16)
        PU L#33 (P#48)
      L2 L#17 (512KB) + L1d L#17 (32KB) + L1i L#17 (32KB) + Core L#17
        PU L#34 (P#17)
        PU L#35 (P#49)
      L2 L#18 (512KB) + L1d L#18 (32KB) + L1i L#18 (32KB) + Core L#18
        PU L#36 (P#18)
        PU L#37 (P#50)
      L2 L#19 (512KB) + L1d L#19 (32KB) + L1i L#19 (32KB) + Core L#19
        PU L#38 (P#19)
        PU L#39 (P#51)
    L3 L#5 (16MB)
      L2 L#20 (512KB) + L1d L#20 (32KB) + L1i L#20 (32KB) + Core L#20
        PU L#40 (P#20)
        PU L#41 (P#52)
      L2 L#21 (512KB) + L1d L#21 (32KB) + L1i L#21 (32KB) + Core L#21
        PU L#42 (P#21)
        PU L#43 (P#53)
      L2 L#22 (512KB) + L1d L#22 (32KB) + L1i L#22 (32KB) + Core L#22
        PU L#44 (P#22)
        PU L#45 (P#54)
      L2 L#23 (512KB) + L1d L#23 (32KB) + L1i L#23 (32KB) + Core L#23
        PU L#46 (P#23)
        PU L#47 (P#55)
    HostBridge L#3
      PCIBridge
        PCI 10de:1e02
          GPU L#1 "renderD128"
          GPU L#2 "card0"
      PCIBridge
        PCI 1022:7901
  NUMANode L#3 (P#3 31GB)
    L3 L#6 (16MB)
      L2 L#24 (512KB) + L1d L#24 (32KB) + L1i L#24 (32KB) + Core L#24
        PU L#48 (P#24)
        PU L#49 (P#56)
      L2 L#25 (512KB) + L1d L#25 (32KB) + L1i L#25 (32KB) + Core L#25
        PU L#50 (P#25)
        PU L#51 (P#57)
      L2 L#26 (512KB) + L1d L#26 (32KB) + L1i L#26 (32KB) + Core L#26
        PU L#52 (P#26)
        PU L#53 (P#58)
      L2 L#27 (512KB) + L1d L#27 (32KB) + L1i L#27 (32KB) + Core L#27
        PU L#54 (P#27)
        PU L#55 (P#59)
    L3 L#7 (16MB)
      L2 L#28 (512KB) + L1d L#28 (32KB) + L1i L#28 (32KB) + Core L#28
        PU L#56 (P#28)
        PU L#57 (P#60)
      L2 L#29 (512KB) + L1d L#29 (32KB) + L1i L#29 (32KB) + Core L#29
        PU L#58 (P#29)
        PU L#59 (P#61)
      L2 L#30 (512KB) + L1d L#30 (32KB) + L1i L#30 (32KB) + Core L#30
        PU L#60 (P#30)
        PU L#61 (P#62)
      L2 L#31 (512KB) + L1d L#31 (32KB) + L1i L#31 (32KB) + Core L#31
        PU L#62 (P#31)
        PU L#63 (P#63)
    HostBridge L#6
      PCIBridge
        PCI 8086:1533
          Net L#3 "eth0"
      PCIBridge
        PCI 8086:1533
          Net L#4 "eth1"
      PCIBridge
        PCI 10de:1e02
          GPU L#5 "card1"
          GPU L#6 "renderD129"
[chibi@centos8 deviceQuery]$ cat /proc/meminfo
MemTotal:       131611616 kB
MemFree:        129055356 kB
MemAvailable:   129367276 kB
Buffers:            1060 kB
Cached:          1045856 kB
SwapCached:            0 kB
Active:           605536 kB
Inactive:         879040 kB
Active(anon):     437696 kB
Inactive(anon):     9976 kB
Active(file):     167840 kB
Inactive(file):   869064 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:               244 kB
Writeback:             0 kB
AnonPages:        425732 kB
Mapped:           317544 kB
Shmem:             12092 kB
KReclaimable:     147756 kB
Slab:             483516 kB
SReclaimable:     147756 kB
SUnreclaim:       335760 kB
KernelStack:       15696 kB
PageTables:        25012 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:    65805808 kB
Committed_AS:    2799112 kB
VmallocTotal:   34359738367 kB
VmallocUsed:           0 kB
VmallocChunk:          0 kB
Percpu:            39168 kB
HardwareCorrupted:     0 kB
AnonHugePages:    159744 kB
ShmemHugePages:        0 kB
ShmemPmdMapped:        0 kB
HugePages_Total:       0
HugePages_Free:        0
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:       2048 kB
Hugetlb:               0 kB
DirectMap4k:      673744 kB
DirectMap2M:    12822528 kB
DirectMap1G:    121634816 kB
[chibi@centos8 deviceQuery]$ free
              total        used        free      shared  buff/cache   available
Mem:      131611616     1360684   129055408       12092     1195524   129367992
Swap:             0           0           0
[chibi@centos8 deviceQuery]$ cd
[chibi@centos8 ~]$ nvidia-smi
Sun Jul 19 18:34:48 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  TITAN RTX           Off  | 00000000:03:00.0 Off |                  N/A |
| 41%   42C    P8    10W / 280W |    271MiB / 24219MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  TITAN RTX           Off  | 00000000:21:00.0 Off |                  N/A |
| 40%   39C    P8    25W / 280W |      1MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      2232      G   /usr/libexec/Xorg                 105MiB |
|    0   N/A  N/A      2712      G   /usr/bin/gnome-shell              163MiB |
+-----------------------------------------------------------------------------+
[chibi@centos8 ~]$ nvidia-smi nvlink -c
GPU 0: TITAN RTX (UUID: GPU-7fb51c1d-c1e7-35cc-aad7-66971f05ddb7)
         Link 0, P2P is supported: true
         Link 0, Access to system memory supported: true
         Link 0, P2P atomics supported: true
         Link 0, System memory atomics supported: true
         Link 0, SLI is supported: true
         Link 0, Link is supported: false
         Link 1, P2P is supported: true
         Link 1, Access to system memory supported: true
         Link 1, P2P atomics supported: true
         Link 1, System memory atomics supported: true
         Link 1, SLI is supported: true
         Link 1, Link is supported: false
GPU 1: TITAN RTX (UUID: GPU-5a71d61e-f130-637a-b33d-4df555b0ed88)
         Link 0, P2P is supported: true
         Link 0, Access to system memory supported: true
         Link 0, P2P atomics supported: true
         Link 0, System memory atomics supported: true
         Link 0, SLI is supported: true
         Link 0, Link is supported: false
         Link 1, P2P is supported: true
         Link 1, Access to system memory supported: true
         Link 1, P2P atomics supported: true
         Link 1, System memory atomics supported: true
         Link 1, SLI is supported: true
         Link 1, Link is supported: false
[chibi@centos8 ~]$