[chibi@centos8 ~]$ sudo nvidia-docker run --rm -ti nvcr.io/nvidia/tensorflow:19.04-py3
Unable to find image 'nvcr.io/nvidia/tensorflow:19.04-py3' locally
19.04-py3: Pulling from nvidia/tensorflow
34667c7e4631: Pulling fs layer
d18d76a881a4: Pulling fs layer
119c7358fbfc: Pulling fs layer
2aaf13f3eff0: Pulling fs layer
202fa0f8874b: Pulling fs layer
3b700a61ede6: Pulling fs layer
87e6ca450d3f: Pulling fs layer
a1e76dce1aec: Pulling fs layer
9b91fa2f9276: Pulling fs layer
b5877a9add73: Pulling fs layer
bab74df105f1: Pulling fs layer
534bbf505504: Pulling fs layer
a1e76dce1aec: Waiting
f4371944c97d: Pulling fs layer
4615a735431d: Pulling fs layer
9b91fa2f9276: Waiting
629d5c9d75a4: Pulling fs layer
8071b94b5429: Pulling fs layer
6eb8eba2ad5a: Pulling fs layer
b5877a9add73: Waiting
08db5b51b243: Pulling fs layer
f4371944c97d: Waiting
3498ed8c5685: Pulling fs layer
bab74df105f1: Waiting
34bc85bf8bef: Pulling fs layer
5db2639932b5: Waiting
3498ed8c5685: Waiting
4a95ca3431c4: Waiting
4956bf3bbbb9: Waiting
41bc2d0a4d4d: Waiting
a2ceadc61854: Waiting
202fa0f8874b: Waiting
e5cafe011f22: Pull complete
eca19a329cd4: Pull complete
65ee50af0bcc: Pull complete
5f60ec8c32f4: Pull complete
d7dcb657fa13: Pull complete
1f6ef6575fbe: Pull complete
d1ef346a3015: Pull complete
4ef9cb404fd5: Pull complete
f6797f45a018: Pull complete
1d4380527325: Pull complete
965f2629db02: Pull complete
5debff4c8c0a: Pull complete
b3a3a9d82be6: Pull complete
eac05f20b729: Pull complete
3ce0a7f80167: Pull complete
2a21e34a5784: Pull complete
c1ccf19e258e: Pull complete
0b6ea9d0652b: Pull complete
307bc8c3f024: Pull complete
ca75fd593a79: Pull complete
0cd3cdca1af7: Pull complete
48e857e9d372: Pull complete
3264ea403ca9: Pull complete
Digest: sha256:aaebc136d5d50937362675c77afd908bd96cded68846f39163050a023c8a9851
Status: Downloaded newer image for nvcr.io/nvidia/tensorflow:19.04-py3
                                                                                
================
== TensorFlow ==
================

NVIDIA Release 19.04 (build 6132408)
TensorFlow Version 1.13.1

Container image Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
Copyright 2017-2019 The TensorFlow Authors.  All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION.  All rights reserved.
NVIDIA modifications are covered by the license terms that apply to the underlying project or file.

NOTE: MOFED driver for multi-node communication was not detected.
      Multi-node communication performance may be reduced.

NOTE: The SHMEM allocation limit is set to the default of 64MB.  This may be
   insufficient for TensorFlow.  NVIDIA recommends the use of the following flags:
   nvidia-docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 ...

root@06a81cbaf92c:/workspace# ls
README.md  docker-examples  nvidia-examples
root@06a81cbaf92c:/workspace# cd nvidia-examples
root@06a81cbaf92c:/workspace/nvidia-examples# ls
NCF              bert                 cnn           ssdv1.2
OpenSeq2Seq      big_lstm             gnmt_v2       tensorrt
UNet_Industrial  build_imagenet_data  resnet50v1.5
root@06a81cbaf92c:/workspace/nvidia-examples# cd big_lstm
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# ls
1b_word_vocab.txt  data_utils_test.py         language_model_test.py
README.md          download_1b_words_data.sh  model_utils.py
__init__.py        hparams.py                 run_utils.py
common.py          hparams_test.py            single_lm_train.py
data_utils.py      language_model.py          testdata
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# ./download_1b_words_data.sh
Please specify root of dataset directory: data

Success: dataset root dir validated

--2020-07-09 16:45:09--  http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1792209805 (1.7G) [application/x-gzip]
Saving to: ‘1-billion-word-language-modeling-benchmark-r13output.tar.gz’

1-billion-word-lang 100%[===================>]   1.67G   349KB/s    in 1h 51m

2020-07-09 18:36:32 (262 KB/s) - ‘1-billion-word-language-modeling-benchmark-r13output.tar.gz’ saved [1792209805/1792209805]

1-billion-word-language-modeling-benchmark-r13output/
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00050-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
1-billion-word-language-modeling-benchmark-r13output/.svn/
1-billion-word-language-modeling-benchmark-r13output/.svn/tmp/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/de/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/de/de102cd0c91cd19e6612f0840e68a2f20ba8134c.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/de/deed1b75d3bd5cc36ae6aeb85d56680b892b7948.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/86/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/86/86c58db52fbf362c5bc329afc33b8805085fcb0d.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9f/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9f/9f2882e21f860a83ad6ea8898ebab140974ed301.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/bc/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/bc/bcdbc523ee7488dc438cab869b6d5e236578dbfa.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d2/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d2/d2718bc26d0ee0a213d7d4add99a304cb5b39ede.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c5/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c5/c5b24f61479da923123d0394a188da922ea0359c.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/11/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/11/116d6ea61730d8199127596b072e981338597779.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/b0/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/b0/b0e26559cfe641245584a9400b35ba28d64f1411.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d3/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d3/d3ae508e3bcb0e696dd70aecd052410f1f7afc1d.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9e/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9e/9e148bd766e8805e0eb97eeae250433ec7a2e996.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/31/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/31/31b645a482e0b81fda3c567cada307c6fcf7ec80.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/da/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/da/da39a3ee5e6b4b0d3255bfef95601890afd80709.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c1/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c1/c1ed42c415ec884e591fb5c70d373da640a383b5.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/e3/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/e3/e37ba0f85e94073ccaced1eed7e4f5d737a25f49.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/entries
1-billion-word-language-modeling-benchmark-r13output/.svn/format
1-billion-word-language-modeling-benchmark-r13output/.svn/wc.db
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00015-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00031-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00027-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00010-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00033-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00042-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00046-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00037-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00029-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00013-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00002-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00048-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00006-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00030-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00025-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00039-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00008-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00020-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00001-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00034-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00044-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00045-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00016-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00004-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00035-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00038-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00009-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00024-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00022-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00021-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00032-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00011-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00049-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00041-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00019-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00023-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00040-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00014-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00007-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00017-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00012-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00018-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00003-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00028-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en-00000-of-00100
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00043-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00005-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00036-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00026-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00047-of-00050
1-billion-word-language-modeling-benchmark-r13output/README

Success! One billion words dataset ready at:
data/1-billion-word-language-modeling-benchmark-r13output/
Please pass this dir to single_lm_train.py via the --datadir option.

root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# time python single_lm_train.py --mode=train --logdir=./logs --num_gpus=4 --datadir=./data/1-billion-word-language-modeling-benchmark-r13output

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

*****HYPER PARAMETERS*****
{'max_grad_norm': 10.0, 'num_sampled': 8192, 'num_gpus': 4, 'state_size': 2048, 'num_steps': 20, 'vocab_size': 793470, 'batch_size': 128, 'keep_prob': 0.9, 'num_layers': 1, 'run_profiler': False, 'num_delayed_steps': 150, 'learning_rate': 0.2, 'average_params': True, 'num_shards': 8, 'optimizer': 0, 'emb_size': 512, 'projected_size': 512, 'max_time': 180, 'do_summaries': False}
**************************
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/model_utils.py:33: UniformUnitScaling.__init__ (from tensorflow.python.ops.init_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:107: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_impl.py:1444: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Current time: 1594319987.5532253
ALL VARIABLES
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:18: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/global_step:0 () <dtype: 'int32_ref'>
model/model/emb_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/Adagrad:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/Adagrad:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/Adagrad:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_b/Adagrad:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/ExponentialMovingAverage:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/ExponentialMovingAverage:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/ExponentialMovingAverage:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
TRAINABLE VARIABLES
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
LOCAL VARIABLES
model/model/state_0_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:0
model/model_1/state_1_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:1
model/model_2/state_2_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:2
model/model_3/state_3_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:3
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:32: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2020-07-09 18:39:48.230383: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2899965000 Hz
2020-07-09 18:39:48.236813: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xb7c4df0 executing computations on platform Host. Devices:
2020-07-09 18:39:48.236854: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): <undefined>, <undefined>
2020-07-09 18:39:48.668835: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 18:39:48.712485: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 18:39:48.719311: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 18:39:48.720479: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xb7c4810 executing computations on platform CUDA. Devices:
2020-07-09 18:39:48.720495: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): TITAN RTX, Compute Capability 7.5
2020-07-09 18:39:48.720501: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (1): TITAN RTX, Compute Capability 7.5
2020-07-09 18:39:48.720506: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (2): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 18:39:48.720513: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (3): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 18:39:48.721566: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:01:00.0
totalMemory: 23.65GiB freeMemory: 23.22GiB
2020-07-09 18:39:48.721596: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 1 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:21:00.0
totalMemory: 23.65GiB freeMemory: 23.49GiB
2020-07-09 18:39:48.721619: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 2 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4a:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 18:39:48.721641: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 3 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4b:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 18:39:48.721665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3
2020-07-09 18:39:49.366213: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-07-09 18:39:49.366260: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990]      0 1 2 3
2020-07-09 18:39:49.366266: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0:   N N N N
2020-07-09 18:39:49.366272: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1:   N N N N
2020-07-09 18:39:49.366277: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2:   N N N N
2020-07-09 18:39:49.366280: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3:   N N N N
2020-07-09 18:39:49.366417: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 22508 MB memory) -> physical GPU (device: 0, name: TITAN RTX, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-07-09 18:39:49.366815: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 22765 MB memory) -> physical GPU (device: 1, name: TITAN RTX, pci bus id: 0000:21:00.0, compute capability: 7.5)
2020-07-09 18:39:49.367096: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 10231 MB memory) -> physical GPU (device: 2, name: GeForce RTX 2080 Ti, pci bus id: 0000:4a:00.0, compute capability: 7.5)
2020-07-09 18:39:49.367362: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 10231 MB memory) -> physical GPU (device: 3, name: GeForce RTX 2080 Ti, pci bus id: 0000:4b:00.0, compute capability: 7.5)
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
Finished processing!
2020-07-09 18:40:08.834935: I tensorflow/stream_executor/dso_loader.cc:153] successfully opened CUDA library libcublas.so.10 locally
Iteration 1, time = 10.01s, wps = 1023, train loss = 12.9759
Iteration 2, time = 7.88s, wps = 1299, train loss = 12.9605
Iteration 3, time = 0.10s, wps = 99650, train loss = 12.8422
Iteration 4, time = 0.10s, wps = 99256, train loss = 11.2516
Iteration 5, time = 0.09s, wps = 110191, train loss = 12.2178
Iteration 6, time = 0.11s, wps = 97159, train loss = 33.2654
Iteration 7, time = 0.09s, wps = 109169, train loss = 13.3990
Iteration 8, time = 0.09s, wps = 108941, train loss = 11.6970
Iteration 9, time = 0.09s, wps = 109375, train loss = 13.0028
Iteration 20, time = 1.03s, wps = 109684, train loss = 10.9034
Iteration 40, time = 1.89s, wps = 108644, train loss = 9.6566
Iteration 60, time = 1.87s, wps = 109320, train loss = 9.2033
Iteration 80, time = 1.85s, wps = 110517, train loss = 8.5605
Iteration 100, time = 1.89s, wps = 108613, train loss = 8.1022
Iteration 120, time = 1.87s, wps = 109697, train loss = 7.7993
Iteration 140, time = 1.87s, wps = 109381, train loss = 7.3636
Iteration 160, time = 1.88s, wps = 109137, train loss = 6.9943
Iteration 180, time = 1.88s, wps = 109196, train loss = 6.5827
Iteration 200, time = 1.88s, wps = 109167, train loss = 6.3724
Iteration 220, time = 1.87s, wps = 109760, train loss = 6.2509
Iteration 240, time = 1.89s, wps = 108609, train loss = 6.2207
Iteration 260, time = 1.87s, wps = 109374, train loss = 6.1654
Iteration 280, time = 1.88s, wps = 108898, train loss = 6.1080
Iteration 300, time = 1.88s, wps = 108726, train loss = 6.1223
Iteration 320, time = 1.87s, wps = 109255, train loss = 6.0498
Iteration 340, time = 1.87s, wps = 109671, train loss = 5.9154
Iteration 360, time = 1.88s, wps = 109028, train loss = 5.9312
Iteration 380, time = 1.88s, wps = 109116, train loss = 5.8795
Iteration 400, time = 1.87s, wps = 109626, train loss = 5.9287
Iteration 420, time = 1.87s, wps = 109680, train loss = 5.7916
Iteration 440, time = 1.86s, wps = 109936, train loss = 5.7363
Iteration 460, time = 1.86s, wps = 110148, train loss = 5.7357
Iteration 480, time = 1.89s, wps = 108500, train loss = 5.7579
Iteration 500, time = 1.90s, wps = 107806, train loss = 5.6737
Iteration 520, time = 1.89s, wps = 108095, train loss = 5.7357
Iteration 540, time = 1.87s, wps = 109494, train loss = 5.6561
Iteration 560, time = 1.87s, wps = 109364, train loss = 5.6245
Iteration 580, time = 1.87s, wps = 109528, train loss = 5.5981
Iteration 600, time = 1.86s, wps = 109914, train loss = 5.5529
Iteration 620, time = 1.90s, wps = 107837, train loss = 5.5422
Iteration 640, time = 1.88s, wps = 109166, train loss = 5.5324
Iteration 660, time = 1.89s, wps = 108578, train loss = 5.5251
Iteration 680, time = 1.88s, wps = 109003, train loss = 5.4815
Iteration 700, time = 1.86s, wps = 110008, train loss = 5.5235
Iteration 720, time = 1.90s, wps = 107858, train loss = 5.3970
Iteration 740, time = 1.89s, wps = 108454, train loss = 5.4528
Iteration 760, time = 1.88s, wps = 108872, train loss = 5.4529
Iteration 780, time = 1.88s, wps = 109208, train loss = 5.4455
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
Finished processing!
Iteration 800, time = 3.49s, wps = 58765, train loss = 5.4127
Iteration 820, time = 1.89s, wps = 108405, train loss = 5.3499
Iteration 840, time = 1.89s, wps = 108275, train loss = 5.3049
Iteration 860, time = 1.89s, wps = 108221, train loss = 5.3642
Iteration 880, time = 1.89s, wps = 108451, train loss = 5.2841
Iteration 900, time = 1.90s, wps = 108068, train loss = 5.3576
Iteration 920, time = 1.88s, wps = 109136, train loss = 5.2559
Iteration 940, time = 1.91s, wps = 107380, train loss = 5.2062
Iteration 960, time = 1.89s, wps = 108495, train loss = 5.3103
Iteration 980, time = 1.88s, wps = 109120, train loss = 5.2215
Iteration 1000, time = 1.89s, wps = 108565, train loss = 5.2167
Iteration 1020, time = 1.88s, wps = 108947, train loss = 5.2181
Iteration 1040, time = 1.89s, wps = 108347, train loss = 5.2800
Iteration 1060, time = 1.86s, wps = 109972, train loss = 5.2166
Iteration 1080, time = 1.89s, wps = 108324, train loss = 5.2214
Iteration 1100, time = 1.88s, wps = 108948, train loss = 5.2117
Iteration 1120, time = 1.87s, wps = 109433, train loss = 5.1117
Iteration 1140, time = 1.89s, wps = 108593, train loss = 5.1513
Iteration 1160, time = 1.87s, wps = 109306, train loss = 5.0745
Iteration 1180, time = 1.88s, wps = 108974, train loss = 5.1456
Iteration 1200, time = 1.87s, wps = 109620, train loss = 5.0737
Iteration 1220, time = 1.88s, wps = 108654, train loss = 5.0478
Iteration 1240, time = 1.86s, wps = 109911, train loss = 5.0931
Iteration 1260, time = 1.87s, wps = 109478, train loss = 5.0631
Iteration 1280, time = 1.88s, wps = 109161, train loss = 5.0981
Iteration 1300, time = 1.89s, wps = 108359, train loss = 5.0008
Iteration 1320, time = 1.89s, wps = 108200, train loss = 5.0327
Iteration 1340, time = 1.89s, wps = 108378, train loss = 5.0352
Iteration 1360, time = 1.87s, wps = 109638, train loss = 5.0477
Iteration 1380, time = 1.88s, wps = 108759, train loss = 5.0320
Iteration 1400, time = 1.89s, wps = 108384, train loss = 5.0515
Iteration 1420, time = 1.88s, wps = 109121, train loss = 4.9736
Iteration 1440, time = 1.89s, wps = 108615, train loss = 4.9756
Iteration 1460, time = 1.90s, wps = 107623, train loss = 4.9652
Iteration 1480, time = 1.88s, wps = 108961, train loss = 4.9555
Iteration 1500, time = 1.87s, wps = 109453, train loss = 4.9282
Iteration 1520, time = 1.87s, wps = 109661, train loss = 4.9654
Iteration 1540, time = 1.88s, wps = 109098, train loss = 4.9798
Iteration 1560, time = 1.89s, wps = 108467, train loss = 4.9767
Iteration 1580, time = 1.87s, wps = 109333, train loss = 4.9255
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
Finished processing!
/usr/local/lib/python3.5/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened.
  warnings.warn("Attempting to use a closed FileWriter. "

real    3m15.195s
user    23m37.661s
sys     4m59.668s
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# time python single_lm_train.py --mode=train --logdir=./logs --num_gpus=3 --datadir=./data/1-billion-word-
language-modeling-benchmark-r13output

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

*****HYPER PARAMETERS*****
{'projected_size': 512, 'num_sampled': 8192, 'optimizer': 0, 'num_gpus': 3, 'emb_size': 512, 'num_layers': 1, 'num_shards': 8, 'batch_size': 128, 'run_profiler': False, 'state_size': 2048, 'do_summaries': False, 'keep_prob': 0.9, 'num_delayed_steps': 150, 'learning_rate': 0.2, 'average_params': True, 'max_grad_norm': 10.0, 'num_steps': 20, 'max_time': 180, 'vocab_size': 793470}
**************************
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/model_utils.py:33: UniformUnitScaling.__init__ (from tensorflow.python.ops.init_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:107: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_impl.py:1444: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Current time: 1594321274.158149
ALL VARIABLES
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:18: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/global_step:0 () <dtype: 'int32_ref'>
model/model/emb_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/Adagrad:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/Adagrad:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/Adagrad:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_b/Adagrad:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/ExponentialMovingAverage:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/ExponentialMovingAverage:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/ExponentialMovingAverage:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
TRAINABLE VARIABLES
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
LOCAL VARIABLES
model/model/state_0_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:0
model/model_1/state_1_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:1
model/model_2/state_2_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:2
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:32: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2020-07-09 19:01:14.703384: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2899965000 Hz
2020-07-09 19:01:14.709817: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xac02510 executing computations on platform Host. Devices:
2020-07-09 19:01:14.709862: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): <undefined>, <undefined>
2020-07-09 19:01:15.148506: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:01:15.186999: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:01:15.197397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:01:15.199969: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xac01950 executing computations on platform CUDA. Devices:
2020-07-09 19:01:15.199994: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): TITAN RTX, Compute Capability 7.5
2020-07-09 19:01:15.199999: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (1): TITAN RTX, Compute Capability 7.5
2020-07-09 19:01:15.200004: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (2): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 19:01:15.200011: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (3): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 19:01:15.201059: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:01:00.0
totalMemory: 23.65GiB freeMemory: 23.22GiB
2020-07-09 19:01:15.201093: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 1 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:21:00.0
totalMemory: 23.65GiB freeMemory: 23.49GiB
2020-07-09 19:01:15.201119: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 2 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4a:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 19:01:15.201143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 3 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4b:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 19:01:15.201163: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3
2020-07-09 19:01:15.851462: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-07-09 19:01:15.851500: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990]      0 1 2 3
2020-07-09 19:01:15.851505: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0:   N N N N
2020-07-09 19:01:15.851508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1:   N N N N
2020-07-09 19:01:15.851513: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2:   N N N N
2020-07-09 19:01:15.851518: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3:   N N N N
2020-07-09 19:01:15.851655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 22507 MB memory) -> physical GPU (device: 0, name: TITAN RTX, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-07-09 19:01:15.852034: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 22765 MB memory) -> physical GPU (device: 1, name: TITAN RTX, pci bus id: 0000:21:00.0, compute capability: 7.5)
2020-07-09 19:01:15.852206: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 10231 MB memory) -> physical GPU (device: 2, name: GeForce RTX 2080 Ti, pci bus id: 0000:4a:00.0, compute capability: 7.5)
2020-07-09 19:01:15.852476: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 10231 MB memory) -> physical GPU (device: 3, name: GeForce RTX 2080 Ti, pci bus id: 0000:4b:00.0, compute capability: 7.5)
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1070: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
Finished processing!
2020-07-09 19:01:28.410322: I tensorflow/stream_executor/dso_loader.cc:153] successfully opened CUDA library libcublas.so.10 locally
Iteration 1582, time = 7.91s, wps = 970, train loss = 5.2989
Iteration 1583, time = 5.88s, wps = 1307, train loss = 5.0724
Iteration 1584, time = 0.08s, wps = 92769, train loss = 5.0078
Iteration 1585, time = 0.08s, wps = 95844, train loss = 4.9333
Iteration 1586, time = 0.08s, wps = 93691, train loss = 4.9685
Iteration 1587, time = 0.08s, wps = 101203, train loss = 4.9434
Iteration 1588, time = 0.07s, wps = 103110, train loss = 4.9638
Iteration 1589, time = 0.08s, wps = 100870, train loss = 4.9128
Iteration 1590, time = 0.08s, wps = 101731, train loss = 4.8832
Iteration 1601, time = 0.84s, wps = 100378, train loss = 4.9012
Iteration 1621, time = 1.55s, wps = 99152, train loss = 4.9077
Iteration 1641, time = 1.52s, wps = 100842, train loss = 4.9400
Iteration 1661, time = 1.51s, wps = 101628, train loss = 4.8895
Iteration 1681, time = 1.52s, wps = 101265, train loss = 4.8288
Iteration 1701, time = 1.52s, wps = 100988, train loss = 4.9322
Iteration 1721, time = 1.52s, wps = 101347, train loss = 4.9143
Iteration 1741, time = 1.53s, wps = 100380, train loss = 4.9197
Iteration 1761, time = 1.52s, wps = 101358, train loss = 4.8879
Iteration 1781, time = 1.52s, wps = 101041, train loss = 4.8435
Iteration 1801, time = 1.52s, wps = 101209, train loss = 4.8299
Iteration 1821, time = 1.52s, wps = 100885, train loss = 4.8598
Iteration 1841, time = 1.51s, wps = 101689, train loss = 4.8606
Iteration 1861, time = 1.51s, wps = 101558, train loss = 4.9117
Iteration 1881, time = 1.52s, wps = 100833, train loss = 4.8126
Iteration 1901, time = 1.52s, wps = 101014, train loss = 4.8487
Iteration 1921, time = 1.52s, wps = 101317, train loss = 4.8011
Iteration 1941, time = 1.52s, wps = 100937, train loss = 4.7909
Iteration 1961, time = 1.51s, wps = 101461, train loss = 4.8044
Iteration 1981, time = 1.52s, wps = 101045, train loss = 4.7608
Iteration 2001, time = 1.52s, wps = 101179, train loss = 4.8141
Iteration 2021, time = 1.52s, wps = 100962, train loss = 4.7577
Iteration 2041, time = 1.54s, wps = 100041, train loss = 4.6992
Iteration 2061, time = 1.53s, wps = 100710, train loss = 4.7636
Iteration 2081, time = 1.52s, wps = 101237, train loss = 4.7871
Iteration 2101, time = 1.53s, wps = 100171, train loss = 4.8062
Iteration 2121, time = 1.52s, wps = 100939, train loss = 4.7192
Iteration 2141, time = 1.51s, wps = 101652, train loss = 4.7242
Iteration 2161, time = 1.52s, wps = 101298, train loss = 4.7938
Iteration 2181, time = 1.54s, wps = 99874, train loss = 4.7609
Iteration 2201, time = 1.53s, wps = 100460, train loss = 4.6641
Iteration 2221, time = 1.53s, wps = 100693, train loss = 4.7878
Iteration 2241, time = 1.52s, wps = 101120, train loss = 4.7182
Iteration 2261, time = 1.54s, wps = 99903, train loss = 4.7180
Iteration 2281, time = 1.52s, wps = 101053, train loss = 4.8027
Iteration 2301, time = 1.52s, wps = 100873, train loss = 4.7622
Iteration 2321, time = 1.53s, wps = 100626, train loss = 4.7569
Iteration 2341, time = 1.52s, wps = 100773, train loss = 4.6736
Iteration 2361, time = 1.53s, wps = 100217, train loss = 4.7075
Iteration 2381, time = 1.53s, wps = 100584, train loss = 4.6886
Iteration 2401, time = 1.52s, wps = 100932, train loss = 4.6845
Iteration 2421, time = 1.52s, wps = 101184, train loss = 4.7266
Iteration 2441, time = 1.52s, wps = 101078, train loss = 4.6358
Iteration 2461, time = 1.54s, wps = 99807, train loss = 4.7467
Iteration 2481, time = 1.53s, wps = 100714, train loss = 4.7361
Iteration 2501, time = 1.51s, wps = 101798, train loss = 4.7299
Iteration 2521, time = 1.51s, wps = 101835, train loss = 4.7825
Iteration 2541, time = 1.52s, wps = 101371, train loss = 4.6138
Iteration 2561, time = 1.52s, wps = 101174, train loss = 4.6692
Iteration 2581, time = 1.52s, wps = 100919, train loss = 4.6941
Iteration 2601, time = 1.52s, wps = 100843, train loss = 4.6825
Iteration 2621, time = 1.52s, wps = 100949, train loss = 4.6520
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
Finished processing!
Iteration 2641, time = 3.13s, wps = 49002, train loss = 4.6524
Iteration 2661, time = 1.53s, wps = 100540, train loss = 4.6279
Iteration 2681, time = 1.51s, wps = 101693, train loss = 4.6643
Iteration 2701, time = 1.53s, wps = 100427, train loss = 4.6696
Iteration 2721, time = 1.53s, wps = 100643, train loss = 4.6458
Iteration 2741, time = 1.52s, wps = 101084, train loss = 4.6504
Iteration 2761, time = 1.54s, wps = 99908, train loss = 4.6085
Iteration 2781, time = 1.53s, wps = 100135, train loss = 4.5963
Iteration 2801, time = 1.52s, wps = 100834, train loss = 4.5958
Iteration 2821, time = 1.51s, wps = 101650, train loss = 4.6267
Iteration 2841, time = 1.53s, wps = 100164, train loss = 4.6640
Iteration 2861, time = 1.53s, wps = 100388, train loss = 4.6782
Iteration 2881, time = 1.53s, wps = 100589, train loss = 4.6115
Iteration 2901, time = 1.53s, wps = 100673, train loss = 4.6977
Iteration 2921, time = 1.52s, wps = 100802, train loss = 4.5826
Iteration 2941, time = 1.52s, wps = 100747, train loss = 4.5595
Iteration 2961, time = 1.53s, wps = 100076, train loss = 4.6335
Iteration 2981, time = 1.54s, wps = 99811, train loss = 4.5972
Iteration 3001, time = 1.52s, wps = 101088, train loss = 4.5953
Iteration 3021, time = 1.52s, wps = 101168, train loss = 4.5878
Iteration 3041, time = 1.52s, wps = 101030, train loss = 4.6366
Iteration 3061, time = 1.53s, wps = 100080, train loss = 4.6610
Iteration 3081, time = 1.52s, wps = 100945, train loss = 4.5368
Iteration 3101, time = 1.53s, wps = 100704, train loss = 4.6004
Iteration 3121, time = 1.54s, wps = 99726, train loss = 4.5618
Iteration 3141, time = 1.53s, wps = 100504, train loss = 4.6068
Iteration 3161, time = 1.53s, wps = 100115, train loss = 4.5823
Iteration 3181, time = 1.53s, wps = 100667, train loss = 4.6301
Iteration 3201, time = 1.53s, wps = 100080, train loss = 4.5744
Iteration 3221, time = 1.53s, wps = 100424, train loss = 4.5088
Iteration 3241, time = 1.54s, wps = 99742, train loss = 4.5145
Iteration 3261, time = 1.53s, wps = 100507, train loss = 4.5676
Iteration 3281, time = 1.52s, wps = 101028, train loss = 4.5728
Iteration 3301, time = 1.53s, wps = 100426, train loss = 4.5924
Iteration 3321, time = 1.54s, wps = 99542, train loss = 4.6190
Iteration 3341, time = 1.53s, wps = 100494, train loss = 4.6192
Iteration 3361, time = 1.53s, wps = 100331, train loss = 4.5912
Iteration 3381, time = 1.53s, wps = 100672, train loss = 4.4670
Iteration 3401, time = 1.53s, wps = 100279, train loss = 4.5655
Iteration 3421, time = 1.55s, wps = 99277, train loss = 4.4796
Iteration 3441, time = 1.52s, wps = 100947, train loss = 4.6020
Iteration 3461, time = 1.52s, wps = 100868, train loss = 4.5050
Iteration 3481, time = 1.55s, wps = 99184, train loss = 4.4472
Iteration 3501, time = 1.53s, wps = 100493, train loss = 4.5089
Iteration 3521, time = 1.54s, wps = 100058, train loss = 4.4714
Iteration 3541, time = 1.53s, wps = 100247, train loss = 4.5250
Iteration 3561, time = 1.53s, wps = 100429, train loss = 4.4607
Iteration 3581, time = 1.53s, wps = 100078, train loss = 4.4538
Iteration 3601, time = 1.52s, wps = 101043, train loss = 4.5539
Iteration 3621, time = 1.54s, wps = 100010, train loss = 4.5326
Iteration 3641, time = 1.54s, wps = 100028, train loss = 4.4771
/usr/local/lib/python3.5/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened.
  warnings.warn("Attempting to use a closed FileWriter. "

real    3m12.026s
user    20m35.697s
sys     4m35.163s
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# time python single_lm_train.py --mode=train --logdir=./logs --num_gpus=2 --datadir=./data/1-billion-word-
language-modeling-benchmark-r13output

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

*****HYPER PARAMETERS*****
{'run_profiler': False, 'max_time': 180, 'num_gpus': 2, 'num_sampled': 8192, 'num_steps': 20, 'do_summaries': False, 'num_shards': 8, 'keep_prob': 0.9, 'num_delayed_steps': 150, 'average_params': True, 'learning_rate': 0.2, 'optimizer': 0, 'state_size': 2048, 'projected_size': 512, 'emb_size': 512, 'max_grad_norm': 10.0, 'num_layers': 1, 'batch_size': 128, 'vocab_size': 793470}
**************************
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/model_utils.py:33: UniformUnitScaling.__init__ (from tensorflow.python.ops.init_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:107: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_impl.py:1444: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Current time: 1594322424.3434293
ALL VARIABLES
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:18: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/global_step:0 () <dtype: 'int32_ref'>
model/model/emb_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/Adagrad:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/Adagrad:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/Adagrad:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_b/Adagrad:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/ExponentialMovingAverage:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/ExponentialMovingAverage:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/ExponentialMovingAverage:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
TRAINABLE VARIABLES
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
LOCAL VARIABLES
model/model/state_0_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:0
model/model_1/state_1_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:1
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:32: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2020-07-09 19:20:24.755383: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2899965000 Hz
2020-07-09 19:20:24.761998: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0x98d1520 executing computations on platform Host. Devices:
2020-07-09 19:20:24.762042: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): <undefined>, <undefined>
2020-07-09 19:20:25.181538: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:20:25.220609: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:20:25.225780: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:20:25.233457: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0x98d0f40 executing computations on platform CUDA. Devices:
2020-07-09 19:20:25.233485: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): TITAN RTX, Compute Capability 7.5
2020-07-09 19:20:25.233490: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (1): TITAN RTX, Compute Capability 7.5
2020-07-09 19:20:25.233495: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (2): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 19:20:25.233501: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (3): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 19:20:25.234520: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:01:00.0
totalMemory: 23.65GiB freeMemory: 23.23GiB
2020-07-09 19:20:25.234549: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 1 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:21:00.0
totalMemory: 23.65GiB freeMemory: 23.49GiB
2020-07-09 19:20:25.234572: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 2 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4a:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 19:20:25.234595: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 3 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4b:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 19:20:25.234621: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3
2020-07-09 19:20:25.877828: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-07-09 19:20:25.877869: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990]      0 1 2 3
2020-07-09 19:20:25.877874: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0:   N N N N
2020-07-09 19:20:25.877877: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1:   N N N N
2020-07-09 19:20:25.877882: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2:   N N N N
2020-07-09 19:20:25.877887: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3:   N N N N
2020-07-09 19:20:25.878026: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 22508 MB memory) -> physical GPU (device: 0, name: TITAN RTX, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-07-09 19:20:25.878433: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 22765 MB memory) -> physical GPU (device: 1, name: TITAN RTX, pci bus id: 0000:21:00.0, compute capability: 7.5)
2020-07-09 19:20:25.878585: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 10231 MB memory) -> physical GPU (device: 2, name: GeForce RTX 2080 Ti, pci bus id: 0000:4a:00.0, compute capability: 7.5)
2020-07-09 19:20:25.878935: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 10231 MB memory) -> physical GPU (device: 3, name: GeForce RTX 2080 Ti, pci bus id: 0000:4b:00.0, compute capability: 7.5)
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1070: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
Finished processing!
2020-07-09 19:20:35.285855: I tensorflow/stream_executor/dso_loader.cc:153] successfully opened CUDA library libcublas.so.10 locally
Iteration 3651, time = 5.37s, wps = 954, train loss = 4.7448
Iteration 3652, time = 3.48s, wps = 1472, train loss = 4.4974
Iteration 3653, time = 0.07s, wps = 75299, train loss = 4.4889
Iteration 3654, time = 0.06s, wps = 81985, train loss = 4.5429
Iteration 3655, time = 0.06s, wps = 83259, train loss = 4.5148
Iteration 3656, time = 0.06s, wps = 84886, train loss = 4.4918
Iteration 3657, time = 0.06s, wps = 85767, train loss = 4.5072
Iteration 3658, time = 0.07s, wps = 74489, train loss = 4.5052
Iteration 3659, time = 0.06s, wps = 89591, train loss = 4.4991
Iteration 3670, time = 0.64s, wps = 87882, train loss = 4.5247
Iteration 3690, time = 1.15s, wps = 88936, train loss = 4.5768
Iteration 3710, time = 1.16s, wps = 88284, train loss = 4.4631
Iteration 3730, time = 1.17s, wps = 87648, train loss = 4.4461
Iteration 3750, time = 1.15s, wps = 88702, train loss = 4.4722
Iteration 3770, time = 1.15s, wps = 88921, train loss = 4.5349
Iteration 3790, time = 1.16s, wps = 88246, train loss = 4.5120
Iteration 3810, time = 1.17s, wps = 87652, train loss = 4.4371
Iteration 3830, time = 1.15s, wps = 88745, train loss = 4.5496
Iteration 3850, time = 1.16s, wps = 88221, train loss = 4.5729
Iteration 3870, time = 1.15s, wps = 89035, train loss = 4.4712
Iteration 3890, time = 1.16s, wps = 88448, train loss = 4.4403
Iteration 3910, time = 1.16s, wps = 87920, train loss = 4.4013
Iteration 3930, time = 1.15s, wps = 89088, train loss = 4.4649
Iteration 3950, time = 1.15s, wps = 88791, train loss = 4.4622
Iteration 3970, time = 1.16s, wps = 87937, train loss = 4.3920
Iteration 3990, time = 1.16s, wps = 88416, train loss = 4.4675
Iteration 4010, time = 1.16s, wps = 88282, train loss = 4.5088
Iteration 4030, time = 1.15s, wps = 88988, train loss = 4.4643
Iteration 4050, time = 1.16s, wps = 87959, train loss = 4.5503
Iteration 4070, time = 1.16s, wps = 88115, train loss = 4.4824
Iteration 4090, time = 1.16s, wps = 88204, train loss = 4.5859
Iteration 4110, time = 1.16s, wps = 88473, train loss = 4.4717
Iteration 4130, time = 1.16s, wps = 88366, train loss = 4.4565
Iteration 4150, time = 1.15s, wps = 89344, train loss = 4.5180
Iteration 4170, time = 1.15s, wps = 88968, train loss = 4.5147
Iteration 4190, time = 1.17s, wps = 87592, train loss = 4.4757
Iteration 4210, time = 1.16s, wps = 88034, train loss = 4.4391
Iteration 4230, time = 1.16s, wps = 88112, train loss = 4.5002
Iteration 4250, time = 1.15s, wps = 88990, train loss = 4.3741
Iteration 4270, time = 1.16s, wps = 88623, train loss = 4.3434
Iteration 4290, time = 1.17s, wps = 87712, train loss = 4.5099
Iteration 4310, time = 1.16s, wps = 88560, train loss = 4.4802
Iteration 4330, time = 1.17s, wps = 87491, train loss = 4.5003
Iteration 4350, time = 1.17s, wps = 87668, train loss = 4.4843
Iteration 4370, time = 1.16s, wps = 88405, train loss = 4.4211
Iteration 4390, time = 1.16s, wps = 88496, train loss = 4.4849
Iteration 4410, time = 1.16s, wps = 88149, train loss = 4.4548
Iteration 4430, time = 1.17s, wps = 87317, train loss = 4.5035
Iteration 4450, time = 1.16s, wps = 88526, train loss = 4.4041
Iteration 4470, time = 1.16s, wps = 88655, train loss = 4.5143
Iteration 4490, time = 1.17s, wps = 87395, train loss = 4.4153
Iteration 4510, time = 1.16s, wps = 88200, train loss = 4.3329
Iteration 4530, time = 1.17s, wps = 87364, train loss = 4.4890
Iteration 4550, time = 1.16s, wps = 88171, train loss = 4.4354
Iteration 4570, time = 1.17s, wps = 87735, train loss = 4.4973
Iteration 4590, time = 1.20s, wps = 85414, train loss = 4.4607
Iteration 4610, time = 1.16s, wps = 88406, train loss = 4.5005
Iteration 4630, time = 1.16s, wps = 88480, train loss = 4.5163
Iteration 4650, time = 1.16s, wps = 88259, train loss = 4.3373
Iteration 4670, time = 1.16s, wps = 88174, train loss = 4.3873
Iteration 4690, time = 1.15s, wps = 88722, train loss = 4.3774
Iteration 4710, time = 1.17s, wps = 87479, train loss = 4.4038
Iteration 4730, time = 1.16s, wps = 88442, train loss = 4.4892
Iteration 4750, time = 1.17s, wps = 87776, train loss = 4.4162
Iteration 4770, time = 1.17s, wps = 87837, train loss = 4.3617
Iteration 4790, time = 1.17s, wps = 87827, train loss = 4.4147
Iteration 4810, time = 1.18s, wps = 86882, train loss = 4.3041
Iteration 4830, time = 1.16s, wps = 88049, train loss = 4.5172
Iteration 4850, time = 1.17s, wps = 87725, train loss = 4.3812
Iteration 4870, time = 1.17s, wps = 87296, train loss = 4.3598
Iteration 4890, time = 1.17s, wps = 87698, train loss = 4.3716
Iteration 4910, time = 1.17s, wps = 87687, train loss = 4.3911
Iteration 4930, time = 1.17s, wps = 87811, train loss = 4.3761
Iteration 4950, time = 1.18s, wps = 86903, train loss = 4.3639
Iteration 4970, time = 1.17s, wps = 87863, train loss = 4.4311
Iteration 4990, time = 1.16s, wps = 88282, train loss = 4.3633
Iteration 5010, time = 1.16s, wps = 87953, train loss = 4.3819
Iteration 5030, time = 1.17s, wps = 87334, train loss = 4.3597
Iteration 5050, time = 1.18s, wps = 87137, train loss = 4.4913
Iteration 5070, time = 1.18s, wps = 87101, train loss = 4.4755
Iteration 5090, time = 1.15s, wps = 88903, train loss = 4.3256
Iteration 5110, time = 1.16s, wps = 88216, train loss = 4.4046
Iteration 5130, time = 1.16s, wps = 88406, train loss = 4.4080
Iteration 5150, time = 1.17s, wps = 87268, train loss = 4.4079
Iteration 5170, time = 1.19s, wps = 86220, train loss = 4.4002
Iteration 5190, time = 1.16s, wps = 88223, train loss = 4.4273
Iteration 5210, time = 1.16s, wps = 88246, train loss = 4.3315
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
Finished processing!
Iteration 5230, time = 2.76s, wps = 37046, train loss = 4.4082
Iteration 5250, time = 1.17s, wps = 87308, train loss = 4.4237
Iteration 5270, time = 1.16s, wps = 87930, train loss = 4.4027
Iteration 5290, time = 1.16s, wps = 87986, train loss = 4.4276
Iteration 5310, time = 1.17s, wps = 87302, train loss = 4.3443
Iteration 5330, time = 1.17s, wps = 87459, train loss = 4.4302
Iteration 5350, time = 1.16s, wps = 88095, train loss = 4.3683
Iteration 5370, time = 1.18s, wps = 86879, train loss = 4.3593
Iteration 5390, time = 1.16s, wps = 87941, train loss = 4.3763
Iteration 5410, time = 1.16s, wps = 88336, train loss = 4.3316
Iteration 5430, time = 1.17s, wps = 87457, train loss = 4.4390
Iteration 5450, time = 1.17s, wps = 87863, train loss = 4.3944
Iteration 5470, time = 1.18s, wps = 87103, train loss = 4.4514
Iteration 5490, time = 1.17s, wps = 87884, train loss = 4.4019
Iteration 5510, time = 1.16s, wps = 87918, train loss = 4.3338
Iteration 5530, time = 1.16s, wps = 88515, train loss = 4.3222
Iteration 5550, time = 1.17s, wps = 87487, train loss = 4.3301
Iteration 5570, time = 1.17s, wps = 87581, train loss = 4.4993
Iteration 5590, time = 1.17s, wps = 87801, train loss = 4.4601
Iteration 5610, time = 1.17s, wps = 87608, train loss = 4.3629
Iteration 5630, time = 1.17s, wps = 87442, train loss = 4.3597
Iteration 5650, time = 1.18s, wps = 86657, train loss = 4.3024
Iteration 5670, time = 1.17s, wps = 87857, train loss = 4.2477
Iteration 5690, time = 1.17s, wps = 87813, train loss = 4.3870
Iteration 5710, time = 1.17s, wps = 87301, train loss = 4.4227
Iteration 5730, time = 1.17s, wps = 87514, train loss = 4.3390
Iteration 5750, time = 1.17s, wps = 87657, train loss = 4.3500
Iteration 5770, time = 1.17s, wps = 87632, train loss = 4.2926
Iteration 5790, time = 1.17s, wps = 87368, train loss = 4.4158
Iteration 5810, time = 1.17s, wps = 87426, train loss = 4.4016
Iteration 5830, time = 1.16s, wps = 88028, train loss = 4.4405
Iteration 5850, time = 1.17s, wps = 87181, train loss = 4.4158
Iteration 5870, time = 1.19s, wps = 86306, train loss = 4.4062
Iteration 5890, time = 1.18s, wps = 86636, train loss = 4.3732
Iteration 5910, time = 1.17s, wps = 87308, train loss = 4.3568
Iteration 5930, time = 1.16s, wps = 88051, train loss = 4.3334
Iteration 5950, time = 1.18s, wps = 86708, train loss = 4.3624
Iteration 5970, time = 1.17s, wps = 87804, train loss = 4.2810
Iteration 5990, time = 1.17s, wps = 87889, train loss = 4.3167
Iteration 6010, time = 1.17s, wps = 87461, train loss = 4.2837
Iteration 6030, time = 1.19s, wps = 86350, train loss = 4.3785
Iteration 6050, time = 1.17s, wps = 87403, train loss = 4.3620
Iteration 6070, time = 1.17s, wps = 87218, train loss = 4.3418
Iteration 6090, time = 1.18s, wps = 86583, train loss = 4.4190
Iteration 6110, time = 1.18s, wps = 86557, train loss = 4.4121
Iteration 6130, time = 1.22s, wps = 84044, train loss = 4.2610
Iteration 6150, time = 1.19s, wps = 86221, train loss = 4.3024
Iteration 6170, time = 1.19s, wps = 86091, train loss = 4.2701
Iteration 6190, time = 1.18s, wps = 86973, train loss = 4.2732
Iteration 6210, time = 1.20s, wps = 85456, train loss = 4.4137
Iteration 6230, time = 1.17s, wps = 87214, train loss = 4.2138
Iteration 6250, time = 1.18s, wps = 86807, train loss = 4.3003
Iteration 6270, time = 1.19s, wps = 85843, train loss = 4.3879
Iteration 6290, time = 1.19s, wps = 86317, train loss = 4.3283
Iteration 6310, time = 1.19s, wps = 85992, train loss = 4.2327
Iteration 6330, time = 1.19s, wps = 86400, train loss = 4.3969
Iteration 6350, time = 1.19s, wps = 85830, train loss = 4.3786
Iteration 6370, time = 1.20s, wps = 85676, train loss = 4.3952
Iteration 6390, time = 1.19s, wps = 86075, train loss = 4.3449
Iteration 6410, time = 1.18s, wps = 87117, train loss = 4.3329
Iteration 6430, time = 1.20s, wps = 85439, train loss = 4.2929
Iteration 6450, time = 1.20s, wps = 85021, train loss = 4.3227
/usr/local/lib/python3.5/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened.
  warnings.warn("Attempting to use a closed FileWriter. "

real    3m10.419s
user    16m18.722s
sys     4m20.084s
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# time python single_lm_train.py --mode=train --logdir=./logs --num_gpus=1 --datadir=./data/1-billion-word-
language-modeling-benchmark-r13output

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

*****HYPER PARAMETERS*****
{'vocab_size': 793470, 'learning_rate': 0.2, 'num_steps': 20, 'optimizer': 0, 'num_sampled': 8192, 'state_size': 2048, 'num_shards': 8, 'projected_size': 512, 'num_delayed_steps': 150, 'do_summaries': False, 'average_params': True, 'emb_size': 512, 'max_time': 180, 'num_layers': 1, 'batch_size': 128, 'run_profiler': False, 'max_grad_norm': 10.0, 'keep_prob': 0.9, 'num_gpus': 1}
**************************
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/model_utils.py:33: UniformUnitScaling.__init__ (from tensorflow.python.ops.init_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:107: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_impl.py:1444: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Current time: 1594323370.345273
ALL VARIABLES
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:18: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/global_step:0 () <dtype: 'int32_ref'>
model/model/emb_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/emb_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/Adagrad:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/Adagrad:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/Adagrad:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_0/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_1/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_2/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_3/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_4/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_5/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_6/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_w_7/Adagrad:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/model/softmax_b/Adagrad:0 (793470,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_0/ExponentialMovingAverage:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/B/ExponentialMovingAverage:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/ExponentialMovingAverage:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
TRAINABLE VARIABLES
model/emb_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/emb_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) <dtype: 'float32_ref'> /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_0:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_1:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_2:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_3:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_4:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_5:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_6:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_w_7:0 (99184, 512) <dtype: 'float32_ref'> /gpu:0
model/softmax_b:0 (793470,) <dtype: 'float32_ref'> /gpu:0
LOCAL VARIABLES
model/model/state_0_0:0 (128, 2560) <dtype: 'float32_ref'> /gpu:0
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:32: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2020-07-09 19:36:10.548382: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2899965000 Hz
2020-07-09 19:36:10.555001: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0x7653ec0 executing computations on platform Host. Devices:
2020-07-09 19:36:10.555048: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): <undefined>, <undefined>
2020-07-09 19:36:11.009120: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:36:11.014401: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:36:11.021177: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-09 19:36:11.022082: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0x76538e0 executing computations on platform CUDA. Devices:
2020-07-09 19:36:11.022111: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): TITAN RTX, Compute Capability 7.5
2020-07-09 19:36:11.022116: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (1): TITAN RTX, Compute Capability 7.5
2020-07-09 19:36:11.022121: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (2): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 19:36:11.022127: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (3): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-07-09 19:36:11.023157: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:01:00.0
totalMemory: 23.65GiB freeMemory: 23.22GiB
2020-07-09 19:36:11.023187: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 1 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:21:00.0
totalMemory: 23.65GiB freeMemory: 23.49GiB
2020-07-09 19:36:11.023209: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 2 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4a:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 19:36:11.023232: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 3 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4b:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-07-09 19:36:11.023260: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3
2020-07-09 19:36:11.660001: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-07-09 19:36:11.660041: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990]      0 1 2 3
2020-07-09 19:36:11.660046: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0:   N N N N
2020-07-09 19:36:11.660049: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1:   N N N N
2020-07-09 19:36:11.660055: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2:   N N N N
2020-07-09 19:36:11.660059: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3:   N N N N
2020-07-09 19:36:11.660196: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 22508 MB memory) -> physical GPU (device: 0, name: TITAN RTX, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-07-09 19:36:11.660567: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 22765 MB memory) -> physical GPU (device: 1, name: TITAN RTX, pci bus id: 0000:21:00.0, compute capability: 7.5)
2020-07-09 19:36:11.660712: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 10231 MB memory) -> physical GPU (device: 2, name: GeForce RTX 2080 Ti, pci bus id: 0000:4a:00.0, compute capability: 7.5)
2020-07-09 19:36:11.660975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 10231 MB memory) -> physical GPU (device: 3, name: GeForce RTX 2080 Ti, pci bus id: 0000:4b:00.0, compute capability: 7.5)
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1070: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
Finished processing!
2020-07-09 19:36:18.468072: I tensorflow/stream_executor/dso_loader.cc:153] successfully opened CUDA library libcublas.so.10 locally
Iteration 6452, time = 3.49s, wps = 733, train loss = 4.5748
Iteration 6453, time = 1.76s, wps = 1452, train loss = 4.3247
Iteration 6454, time = 0.06s, wps = 46465, train loss = 4.3786
Iteration 6455, time = 0.06s, wps = 45276, train loss = 4.3459
Iteration 6456, time = 0.05s, wps = 49203, train loss = 4.3737
Iteration 6457, time = 0.05s, wps = 53930, train loss = 4.3841
Iteration 6458, time = 0.05s, wps = 56850, train loss = 4.3134
Iteration 6459, time = 0.05s, wps = 54703, train loss = 4.4300
Iteration 6460, time = 0.05s, wps = 52797, train loss = 4.2431
Iteration 6471, time = 0.51s, wps = 55112, train loss = 4.2138
Iteration 6491, time = 0.93s, wps = 55183, train loss = 4.2642
Iteration 6511, time = 0.93s, wps = 55110, train loss = 4.4812
Iteration 6531, time = 0.95s, wps = 54099, train loss = 4.3427
Iteration 6551, time = 0.93s, wps = 54999, train loss = 4.2639
Iteration 6571, time = 0.93s, wps = 55339, train loss = 4.3331
Iteration 6591, time = 0.93s, wps = 54811, train loss = 4.3715
Iteration 6611, time = 0.94s, wps = 54487, train loss = 4.3418
Iteration 6631, time = 0.94s, wps = 54514, train loss = 4.3595
Iteration 6651, time = 0.93s, wps = 54983, train loss = 4.3396
Iteration 6671, time = 0.95s, wps = 53792, train loss = 4.2968
Iteration 6691, time = 0.93s, wps = 54814, train loss = 4.3515
Iteration 6711, time = 0.95s, wps = 54062, train loss = 4.3074
Iteration 6731, time = 0.93s, wps = 54823, train loss = 4.5425
Iteration 6751, time = 0.94s, wps = 54506, train loss = 4.4441
Iteration 6771, time = 0.95s, wps = 54019, train loss = 4.4239
Iteration 6791, time = 0.95s, wps = 53913, train loss = 4.3699
Iteration 6811, time = 0.95s, wps = 54133, train loss = 4.3571
Iteration 6831, time = 0.94s, wps = 54385, train loss = 4.4038
Iteration 6851, time = 0.95s, wps = 54134, train loss = 4.4007
Iteration 6871, time = 0.93s, wps = 54863, train loss = 4.3865
Iteration 6891, time = 0.94s, wps = 54262, train loss = 4.2639
Iteration 6911, time = 0.94s, wps = 54210, train loss = 4.3482
Iteration 6931, time = 0.95s, wps = 53858, train loss = 4.4031
Iteration 6951, time = 0.94s, wps = 54393, train loss = 4.3350
Iteration 6971, time = 0.95s, wps = 54021, train loss = 4.2717
Iteration 6991, time = 0.94s, wps = 54327, train loss = 4.3949
Iteration 7011, time = 0.94s, wps = 54551, train loss = 4.3383
Iteration 7031, time = 0.94s, wps = 54293, train loss = 4.2353
Iteration 7051, time = 0.95s, wps = 53841, train loss = 4.2956
Iteration 7071, time = 0.95s, wps = 54027, train loss = 4.2125
Iteration 7091, time = 0.95s, wps = 53804, train loss = 4.4604
Iteration 7111, time = 0.94s, wps = 54270, train loss = 4.2505
Iteration 7131, time = 0.93s, wps = 54798, train loss = 4.2613
Iteration 7151, time = 0.94s, wps = 54690, train loss = 4.1604
Iteration 7171, time = 0.94s, wps = 54679, train loss = 4.3508
Iteration 7191, time = 0.94s, wps = 54251, train loss = 4.3004
Iteration 7211, time = 0.95s, wps = 53676, train loss = 4.3000
Iteration 7231, time = 0.94s, wps = 54540, train loss = 4.4716
Iteration 7251, time = 0.94s, wps = 54236, train loss = 4.3187
Iteration 7271, time = 0.96s, wps = 53356, train loss = 4.2899
Iteration 7291, time = 0.94s, wps = 54309, train loss = 4.2238
Iteration 7311, time = 0.95s, wps = 53849, train loss = 4.4354
Iteration 7331, time = 0.93s, wps = 55059, train loss = 4.4568
Iteration 7351, time = 0.95s, wps = 54122, train loss = 4.3329
Iteration 7371, time = 0.95s, wps = 53912, train loss = 4.3169
Iteration 7391, time = 0.93s, wps = 54975, train loss = 4.3614
Iteration 7411, time = 0.95s, wps = 53699, train loss = 4.3399
Iteration 7431, time = 0.96s, wps = 53299, train loss = 4.3092
Iteration 7451, time = 0.94s, wps = 54707, train loss = 4.3154
Iteration 7471, time = 0.97s, wps = 52784, train loss = 4.4876
Iteration 7491, time = 0.94s, wps = 54294, train loss = 4.1984
Iteration 7511, time = 0.96s, wps = 53568, train loss = 4.3811
Iteration 7531, time = 0.94s, wps = 54269, train loss = 4.3138
Iteration 7551, time = 0.95s, wps = 53623, train loss = 4.2677
Iteration 7571, time = 0.95s, wps = 54176, train loss = 4.2834
Iteration 7591, time = 0.96s, wps = 53463, train loss = 4.2202
Iteration 7611, time = 0.93s, wps = 54936, train loss = 4.2324
Iteration 7631, time = 0.96s, wps = 53447, train loss = 4.2815
Iteration 7651, time = 0.93s, wps = 55041, train loss = 4.3713
Iteration 7671, time = 0.94s, wps = 54727, train loss = 4.3176
Iteration 7691, time = 0.94s, wps = 54191, train loss = 4.3247
Iteration 7711, time = 0.94s, wps = 54271, train loss = 4.3492
Iteration 7731, time = 0.95s, wps = 53896, train loss = 4.2081
Iteration 7751, time = 0.96s, wps = 53111, train loss = 4.3426
Iteration 7771, time = 0.94s, wps = 54561, train loss = 4.2988
Iteration 7791, time = 0.95s, wps = 53765, train loss = 4.2310
Iteration 7811, time = 0.97s, wps = 52817, train loss = 4.3332
Iteration 7831, time = 0.94s, wps = 54293, train loss = 4.3237
Iteration 7851, time = 0.94s, wps = 54408, train loss = 4.3061
Iteration 7871, time = 0.96s, wps = 53343, train loss = 4.2418
Iteration 7891, time = 0.96s, wps = 53417, train loss = 4.4382
Iteration 7911, time = 0.95s, wps = 53849, train loss = 4.2522
Iteration 7931, time = 0.96s, wps = 53585, train loss = 4.2784
Iteration 7951, time = 0.95s, wps = 53657, train loss = 4.2915
Iteration 7971, time = 0.94s, wps = 54334, train loss = 4.2792
Iteration 7991, time = 0.96s, wps = 53181, train loss = 4.3507
Iteration 8011, time = 0.96s, wps = 53592, train loss = 4.2242
Iteration 8031, time = 0.96s, wps = 53473, train loss = 4.1413
Iteration 8051, time = 0.97s, wps = 52603, train loss = 4.2696
Iteration 8071, time = 0.96s, wps = 53400, train loss = 4.2920
Iteration 8091, time = 0.96s, wps = 53121, train loss = 4.3513
Iteration 8111, time = 0.95s, wps = 53754, train loss = 4.3355
Iteration 8131, time = 0.96s, wps = 53343, train loss = 4.3722
Iteration 8151, time = 0.96s, wps = 53302, train loss = 4.2779
Iteration 8171, time = 0.95s, wps = 53750, train loss = 4.2896
Iteration 8191, time = 0.96s, wps = 53472, train loss = 4.2901
Iteration 8211, time = 0.96s, wps = 53259, train loss = 4.2940
Iteration 8231, time = 0.96s, wps = 53153, train loss = 4.3227
Iteration 8251, time = 0.96s, wps = 53421, train loss = 4.3334
Iteration 8271, time = 0.95s, wps = 53620, train loss = 4.3087
Iteration 8291, time = 0.96s, wps = 53307, train loss = 4.2348
Iteration 8311, time = 0.96s, wps = 53148, train loss = 4.2891
Iteration 8331, time = 0.98s, wps = 52382, train loss = 4.3299
Iteration 8351, time = 0.95s, wps = 53665, train loss = 4.4399
Iteration 8371, time = 0.94s, wps = 54232, train loss = 4.2496
Iteration 8391, time = 0.96s, wps = 53102, train loss = 4.2693
Iteration 8411, time = 0.96s, wps = 53602, train loss = 4.3867
Iteration 8431, time = 0.95s, wps = 53844, train loss = 4.3690
Iteration 8451, time = 0.96s, wps = 53197, train loss = 4.3226
Iteration 8471, time = 0.97s, wps = 52682, train loss = 4.3631
Iteration 8491, time = 0.96s, wps = 53146, train loss = 4.2487
Iteration 8511, time = 0.97s, wps = 53006, train loss = 4.1940
Iteration 8531, time = 0.96s, wps = 53075, train loss = 4.2272
Iteration 8551, time = 0.97s, wps = 52671, train loss = 4.3050
Iteration 8571, time = 0.94s, wps = 54261, train loss = 4.3108
Iteration 8591, time = 0.94s, wps = 54350, train loss = 4.3498
Iteration 8611, time = 0.95s, wps = 53921, train loss = 4.2782
Iteration 8631, time = 0.96s, wps = 53513, train loss = 4.3024
Iteration 8651, time = 0.95s, wps = 53697, train loss = 4.3437
Iteration 8671, time = 0.96s, wps = 53102, train loss = 4.3527
Iteration 8691, time = 0.96s, wps = 53497, train loss = 4.2810
Iteration 8711, time = 0.96s, wps = 53142, train loss = 4.2302
Iteration 8731, time = 0.97s, wps = 52883, train loss = 4.2494
Iteration 8751, time = 0.95s, wps = 53720, train loss = 4.4378
Iteration 8771, time = 0.97s, wps = 53010, train loss = 4.2579
Iteration 8791, time = 0.98s, wps = 52473, train loss = 4.3892
Iteration 8811, time = 0.96s, wps = 53520, train loss = 4.2255
Iteration 8831, time = 0.97s, wps = 52889, train loss = 4.2083
Iteration 8851, time = 0.97s, wps = 52755, train loss = 4.2327
Iteration 8871, time = 0.97s, wps = 52734, train loss = 4.3187
Iteration 8891, time = 0.97s, wps = 52813, train loss = 4.3506
Iteration 8911, time = 0.96s, wps = 53548, train loss = 4.1944
Iteration 8931, time = 0.95s, wps = 53963, train loss = 4.3758
Iteration 8951, time = 0.97s, wps = 52864, train loss = 4.2385
Iteration 8971, time = 0.98s, wps = 52123, train loss = 4.3097
Iteration 8991, time = 0.98s, wps = 52455, train loss = 4.2555
Iteration 9011, time = 0.97s, wps = 53032, train loss = 4.2439
Iteration 9031, time = 0.98s, wps = 52213, train loss = 4.2559
Iteration 9051, time = 0.97s, wps = 52611, train loss = 4.2330
Iteration 9071, time = 0.99s, wps = 51868, train loss = 4.2320
Iteration 9091, time = 1.01s, wps = 50770, train loss = 4.3600
Iteration 9111, time = 0.98s, wps = 52201, train loss = 4.2218
Iteration 9131, time = 0.98s, wps = 52144, train loss = 4.1430
Iteration 9151, time = 0.98s, wps = 52427, train loss = 4.2160
Iteration 9171, time = 0.99s, wps = 51621, train loss = 4.3801
Iteration 9191, time = 0.98s, wps = 52131, train loss = 4.2914
Iteration 9211, time = 0.98s, wps = 52019, train loss = 4.2416
Iteration 9231, time = 0.98s, wps = 52442, train loss = 4.4088
Iteration 9251, time = 0.99s, wps = 51632, train loss = 4.3585
Iteration 9271, time = 1.00s, wps = 51339, train loss = 4.4309
Iteration 9291, time = 1.00s, wps = 51368, train loss = 4.2377
Iteration 9311, time = 0.98s, wps = 52050, train loss = 4.1174
Iteration 9331, time = 0.99s, wps = 51473, train loss = 4.1457
Iteration 9351, time = 0.99s, wps = 51718, train loss = 4.3641
Iteration 9371, time = 1.01s, wps = 50607, train loss = 4.3306
Iteration 9391, time = 1.00s, wps = 51406, train loss = 4.1563
Iteration 9411, time = 1.01s, wps = 50892, train loss = 4.2648
Iteration 9431, time = 1.01s, wps = 50616, train loss = 4.2346
Iteration 9451, time = 0.99s, wps = 51716, train loss = 4.2373
Iteration 9471, time = 1.01s, wps = 50889, train loss = 4.3476
Iteration 9491, time = 0.99s, wps = 51697, train loss = 4.3584
Iteration 9511, time = 1.02s, wps = 50248, train loss = 4.1187
Iteration 9531, time = 1.01s, wps = 50704, train loss = 4.2623
Iteration 9551, time = 1.01s, wps = 50789, train loss = 4.3241
Iteration 9571, time = 1.02s, wps = 50287, train loss = 4.2409
Iteration 9591, time = 1.01s, wps = 50728, train loss = 4.3529
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
Finished processing!
Iteration 9611, time = 2.61s, wps = 19604, train loss = 4.2227
Iteration 9631, time = 1.00s, wps = 51345, train loss = 4.2694
Iteration 9651, time = 1.01s, wps = 50742, train loss = 4.3026
Iteration 9671, time = 1.02s, wps = 50086, train loss = 4.1872
Iteration 9691, time = 1.01s, wps = 50611, train loss = 4.2419
Iteration 9711, time = 1.00s, wps = 51160, train loss = 4.4209
Iteration 9731, time = 1.02s, wps = 50228, train loss = 4.1479
Iteration 9751, time = 1.01s, wps = 50804, train loss = 4.2426
Iteration 9771, time = 1.03s, wps = 49861, train loss = 4.2500
Iteration 9791, time = 1.02s, wps = 50172, train loss = 4.1588
Iteration 9811, time = 1.02s, wps = 50078, train loss = 4.1868
Iteration 9831, time = 1.03s, wps = 49744, train loss = 4.3365
Iteration 9851, time = 1.03s, wps = 49611, train loss = 4.1265
Iteration 9871, time = 1.00s, wps = 51171, train loss = 4.2198
Iteration 9891, time = 1.05s, wps = 48747, train loss = 4.2099
Iteration 9911, time = 1.04s, wps = 49195, train loss = 4.2511
Iteration 9931, time = 1.01s, wps = 50694, train loss = 4.2018
/usr/local/lib/python3.5/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened.
  warnings.warn("Attempting to use a closed FileWriter. "

real    3m8.768s
user    9m16.236s
sys     2m55.672s
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# cat /etc/os-release
NAME="Ubuntu"
VERSION="16.04.6 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.6 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Fri_Feb__8_19:08:17_PST_2019
Cuda compilation tools, release 10.1, V10.1.105
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm# cd data
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm/data# ls
1-billion-word-language-modeling-benchmark-r13output
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm/data# cd 1-billion-word-language-modeling-benchmark-r13output
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm/data/1-billion-word-language-modeling-benchmark-r13output# ls
1b_word_vocab.txt  heldout-monolingual.tokenized.shuffled
README             training-monolingual.tokenized.shuffled
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm/data/1-billion-word-language-modeling-benchmark-r13output# cd training-monolingual.tokenized.shuffled
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm/data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled# ls
news.en-00001-of-00100  news.en-00034-of-00100  news.en-00067-of-00100
news.en-00002-of-00100  news.en-00035-of-00100  news.en-00068-of-00100
news.en-00003-of-00100  news.en-00036-of-00100  news.en-00069-of-00100
news.en-00004-of-00100  news.en-00037-of-00100  news.en-00070-of-00100
news.en-00005-of-00100  news.en-00038-of-00100  news.en-00071-of-00100
news.en-00006-of-00100  news.en-00039-of-00100  news.en-00072-of-00100
news.en-00007-of-00100  news.en-00040-of-00100  news.en-00073-of-00100
news.en-00008-of-00100  news.en-00041-of-00100  news.en-00074-of-00100
news.en-00009-of-00100  news.en-00042-of-00100  news.en-00075-of-00100
news.en-00010-of-00100  news.en-00043-of-00100  news.en-00076-of-00100
news.en-00011-of-00100  news.en-00044-of-00100  news.en-00077-of-00100
news.en-00012-of-00100  news.en-00045-of-00100  news.en-00078-of-00100
news.en-00013-of-00100  news.en-00046-of-00100  news.en-00079-of-00100
news.en-00014-of-00100  news.en-00047-of-00100  news.en-00080-of-00100
news.en-00015-of-00100  news.en-00048-of-00100  news.en-00081-of-00100
news.en-00016-of-00100  news.en-00049-of-00100  news.en-00082-of-00100
news.en-00017-of-00100  news.en-00050-of-00100  news.en-00083-of-00100
news.en-00018-of-00100  news.en-00051-of-00100  news.en-00084-of-00100
news.en-00019-of-00100  news.en-00052-of-00100  news.en-00085-of-00100
news.en-00020-of-00100  news.en-00053-of-00100  news.en-00086-of-00100
news.en-00021-of-00100  news.en-00054-of-00100  news.en-00087-of-00100
news.en-00022-of-00100  news.en-00055-of-00100  news.en-00088-of-00100
news.en-00023-of-00100  news.en-00056-of-00100  news.en-00089-of-00100
news.en-00024-of-00100  news.en-00057-of-00100  news.en-00090-of-00100
news.en-00025-of-00100  news.en-00058-of-00100  news.en-00091-of-00100
news.en-00026-of-00100  news.en-00059-of-00100  news.en-00092-of-00100
news.en-00027-of-00100  news.en-00060-of-00100  news.en-00093-of-00100
news.en-00028-of-00100  news.en-00061-of-00100  news.en-00094-of-00100
news.en-00029-of-00100  news.en-00062-of-00100  news.en-00095-of-00100
news.en-00030-of-00100  news.en-00063-of-00100  news.en-00096-of-00100
news.en-00031-of-00100  news.en-00064-of-00100  news.en-00097-of-00100
news.en-00032-of-00100  news.en-00065-of-00100  news.en-00098-of-00100
news.en-00033-of-00100  news.en-00066-of-00100  news.en-00099-of-00100
root@06a81cbaf92c:/workspace/nvidia-examples/big_lstm/data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled# exit
exit
[chibi@centos8 ~]$ cat /etc/os-release
NAME="CentOS Linux"
VERSION="8 (Core)"
ID="centos"
ID_LIKE="rhel fedora"
VERSION_ID="8"
PLATFORM_ID="platform:el8"
PRETTY_NAME="CentOS Linux 8 (Core)"
ANSI_COLOR="0;31"
CPE_NAME="cpe:/o:centos:centos:8"
HOME_URL="https://www.centos.org/"
BUG_REPORT_URL="https://bugs.centos.org/"

CENTOS_MANTISBT_PROJECT="CentOS-8"
CENTOS_MANTISBT_PROJECT_VERSION="8"
REDHAT_SUPPORT_PRODUCT="centos"
REDHAT_SUPPORT_PRODUCT_VERSION="8"

[chibi@centos8 ~]$ cat /etc/redhat-release
CentOS Linux release 8.2.2004 (Core)
[chibi@centos8 ~]$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_May__6_19:09:25_PDT_2020
Cuda compilation tools, release 11.0, V11.0.167
Build cuda_11.0_bu.TC445_37.28358933_0
[chibi@centos8 ~]$ lsmem
RANGE                                  SIZE  STATE REMOVABLE     BLOCK
0x0000000000000000-0x000000000fffffff  256M online        no       0-1
0x0000000010000000-0x0000000017ffffff  128M online       yes         2
0x0000000018000000-0x000000003fffffff  640M online        no       3-7
0x0000000100000000-0x000000010fffffff  256M online        no     32-33
0x0000000110000000-0x000000122fffffff 68.5G online       yes    34-581
0x0000001230000000-0x0000001237ffffff  128M online        no       582
0x0000001238000000-0x0000001247ffffff  256M online       yes   583-584
0x0000001248000000-0x000000124fffffff  128M online        no       585
0x0000001250000000-0x0000001257ffffff  128M online       yes       586
0x0000001258000000-0x0000001267ffffff  256M online        no   587-588
0x0000001268000000-0x000000129fffffff  896M online       yes   589-595
0x00000012a0000000-0x00000012a7ffffff  128M online        no       596
0x00000012a8000000-0x00000013b7ffffff  4.3G online       yes   597-630
0x00000013b8000000-0x00000013bfffffff  128M online        no       631
0x00000013c0000000-0x00000014d7ffffff  4.4G online       yes   632-666
0x00000014d8000000-0x00000014dfffffff  128M online        no       667
0x00000014e0000000-0x00000014efffffff  256M online       yes   668-669
0x00000014f0000000-0x0000001507ffffff  384M online        no   670-672
0x0000001508000000-0x000000153fffffff  896M online       yes   673-679
0x0000001540000000-0x0000001547ffffff  128M online        no       680
0x0000001548000000-0x00000015a7ffffff  1.5G online       yes   681-692
0x00000015a8000000-0x00000015afffffff  128M online        no       693
0x00000015b0000000-0x00000015b7ffffff  128M online       yes       694
0x00000015b8000000-0x00000015bfffffff  128M online        no       695
0x00000015c0000000-0x00000015cfffffff  256M online       yes   696-697
0x00000015d0000000-0x00000015d7ffffff  128M online        no       698
0x00000015d8000000-0x00000015efffffff  384M online       yes   699-701
0x00000015f0000000-0x00000015f7ffffff  128M online        no       702
0x00000015f8000000-0x00000015ffffffff  128M online       yes       703
0x0000001600000000-0x0000001607ffffff  128M online        no       704
0x0000001608000000-0x000000161fffffff  384M online       yes   705-707
0x0000001620000000-0x0000001647ffffff  640M online        no   708-712
0x0000001648000000-0x000000164fffffff  128M online       yes       713
0x0000001650000000-0x0000001687ffffff  896M online        no   714-720
0x0000001688000000-0x00000016afffffff  640M online       yes   721-725
0x00000016b0000000-0x00000016c7ffffff  384M online        no   726-728
0x00000016c8000000-0x00000016cfffffff  128M online       yes       729
0x00000016d0000000-0x00000016d7ffffff  128M online        no       730
0x00000016d8000000-0x000000171fffffff  1.1G online       yes   731-739
0x0000001720000000-0x0000001727ffffff  128M online        no       740
0x0000001728000000-0x000000172fffffff  128M online       yes       741
0x0000001730000000-0x0000001747ffffff  384M online        no   742-744
0x0000001748000000-0x0000001757ffffff  256M online       yes   745-746
0x0000001758000000-0x000000175fffffff  128M online        no       747
0x0000001760000000-0x0000001777ffffff  384M online       yes   748-750
0x0000001778000000-0x000000177fffffff  128M online        no       751
0x0000001780000000-0x00000018a7ffffff  4.6G online       yes   752-788
0x00000018a8000000-0x00000018afffffff  128M online        no       789
0x00000018b0000000-0x00000018cfffffff  512M online       yes   790-793
0x00000018d0000000-0x00000018dfffffff  256M online        no   794-795
0x00000018e0000000-0x0000001947ffffff  1.6G online       yes   796-808
0x0000001948000000-0x0000001957ffffff  256M online        no   809-810
0x0000001958000000-0x000000197fffffff  640M online       yes   811-815
0x0000001980000000-0x0000001987ffffff  128M online        no       816
0x0000001988000000-0x000000198fffffff  128M online       yes       817
0x0000001990000000-0x0000001997ffffff  128M online        no       818
0x0000001998000000-0x00000019a7ffffff  256M online       yes   819-820
0x00000019a8000000-0x00000019b7ffffff  256M online        no   821-822
0x00000019b8000000-0x00000019bfffffff  128M online       yes       823
0x00000019c0000000-0x00000019c7ffffff  128M online        no       824
0x00000019c8000000-0x0000001a27ffffff  1.5G online       yes   825-836
0x0000001a28000000-0x0000001a37ffffff  256M online        no   837-838
0x0000001a38000000-0x0000001a4fffffff  384M online       yes   839-841
0x0000001a50000000-0x0000001a6fffffff  512M online        no   842-845
0x0000001a70000000-0x0000001a77ffffff  128M online       yes       846
0x0000001a78000000-0x0000001a7fffffff  128M online        no       847
0x0000001a80000000-0x0000001a8fffffff  256M online       yes   848-849
0x0000001a90000000-0x0000001aa7ffffff  384M online        no   850-852
0x0000001aa8000000-0x0000001b3fffffff  2.4G online       yes   853-871
0x0000001b40000000-0x0000001b47ffffff  128M online        no       872
0x0000001b48000000-0x0000001b7fffffff  896M online       yes   873-879
0x0000001b80000000-0x0000001b87ffffff  128M online        no       880
0x0000001b88000000-0x0000001b9fffffff  384M online       yes   881-883
0x0000001ba0000000-0x0000001ba7ffffff  128M online        no       884
0x0000001ba8000000-0x0000001bc7ffffff  512M online       yes   885-888
0x0000001bc8000000-0x0000001bcfffffff  128M online        no       889
0x0000001bd0000000-0x0000001bf7ffffff  640M online       yes   890-894
0x0000001bf8000000-0x0000001c17ffffff  512M online        no   895-898
0x0000001c18000000-0x0000001c27ffffff  256M online       yes   899-900
0x0000001c28000000-0x0000001c37ffffff  256M online        no   901-902
0x0000001c38000000-0x0000001c3fffffff  128M online       yes       903
0x0000001c40000000-0x0000001c4fffffff  256M online        no   904-905
0x0000001c50000000-0x0000001c5fffffff  256M online       yes   906-907
0x0000001c60000000-0x0000001c6fffffff  256M online        no   908-909
0x0000001c70000000-0x0000001cc7ffffff  1.4G online       yes   910-920
0x0000001cc8000000-0x0000001ccfffffff  128M online        no       921
0x0000001cd0000000-0x0000001d77ffffff  2.6G online       yes   922-942
0x0000001d78000000-0x0000001dafffffff  896M online        no   943-949
0x0000001db0000000-0x0000001dbfffffff  256M online       yes   950-951
0x0000001dc0000000-0x0000001ddfffffff  512M online        no   952-955
0x0000001de0000000-0x0000001e07ffffff  640M online       yes   956-960
0x0000001e08000000-0x0000001e17ffffff  256M online        no   961-962
0x0000001e18000000-0x0000001e1fffffff  128M online       yes       963
0x0000001e20000000-0x0000001e27ffffff  128M online        no       964
0x0000001e28000000-0x0000001e2fffffff  128M online       yes       965
0x0000001e30000000-0x0000001e57ffffff  640M online        no   966-970
0x0000001e58000000-0x0000001e6fffffff  384M online       yes   971-973
0x0000001e70000000-0x0000001e77ffffff  128M online        no       974
0x0000001e78000000-0x0000001e7fffffff  128M online       yes       975
0x0000001e80000000-0x0000001e97ffffff  384M online        no   976-978
0x0000001e98000000-0x0000001eafffffff  384M online       yes   979-981
0x0000001eb0000000-0x0000001ec7ffffff  384M online        no   982-984
0x0000001ec8000000-0x0000001ecfffffff  128M online       yes       985
0x0000001ed0000000-0x0000001ed7ffffff  128M online        no       986
0x0000001ed8000000-0x0000001f87ffffff  2.8G online       yes  987-1008
0x0000001f88000000-0x0000001f8fffffff  128M online        no      1009
0x0000001f90000000-0x0000001f9fffffff  256M online       yes 1010-1011
0x0000001fa0000000-0x00000020bfffffff  4.5G online        no 1012-1047

メモリブロックサイズ  128M
Total online memory:     128G
Total offline memory:      0B
[chibi@centos8 ~]$ cat /proc/meminfo
MemTotal:       131596828 kB
MemFree:        121646404 kB
MemAvailable:   128431440 kB
Buffers:            1060 kB
Cached:          7639972 kB
SwapCached:            0 kB
Active:          1171816 kB
Inactive:        6959632 kB
Active(anon):     462944 kB
Inactive(anon):    19020 kB
Active(file):     708872 kB
Inactive(file):  6940612 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:                 0 kB
Writeback:             0 kB
AnonPages:        484464 kB
Mapped:           301544 kB
Shmem:             21000 kB
KReclaimable:     384588 kB
Slab:            1278028 kB
SReclaimable:     384588 kB
SUnreclaim:       893440 kB
KernelStack:       27136 kB
PageTables:        25488 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:    65798412 kB
Committed_AS:    3636988 kB
VmallocTotal:   34359738367 kB
VmallocUsed:           0 kB
VmallocChunk:          0 kB
Percpu:            68608 kB
HardwareCorrupted:     0 kB
AnonHugePages:    202752 kB
ShmemHugePages:        0 kB
ShmemPmdMapped:        0 kB
HugePages_Total:       0
HugePages_Free:        0
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:       2048 kB
Hugetlb:               0 kB
DirectMap4k:     1690044 kB
DirectMap2M:    24385536 kB
DirectMap1G:    108003328 kB
[chibi@centos8 ~]$ lscpu
アーキテクチャ:                      x86_64
CPU 操作モード:                      32-bit, 64-bit
バイト順序:                          Little Endian
CPU:                                 128
オンラインになっている CPU のリスト: 0-127
コアあたりのスレッド数:              2
ソケットあたりのコア数:              64
ソケット数:                          1
NUMA ノード数:                       1
ベンダー ID:                         AuthenticAMD
CPU ファミリー:                      23
モデル:                              49
モデル名:                            AMD Ryzen Threadripper 3990X 64-Core Processor
ステッピング:                        0
CPU MHz:                             3570.508
CPU 最大 MHz:                        2900.0000
CPU 最小 MHz:                        2200.0000
BogoMIPS:                            5799.93
仮想化:                              AMD-V
L1d キャッシュ:                      32K
L1i キャッシュ:                      32K
L2 キャッシュ:                       512K
L3 キャッシュ:                       16384K
NUMA ノード 0 CPU:                   0-127
フラグ:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate sme ssbd mba sev ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip rdpid overflow_recov succor smca
[chibi@centos8 ~]$ lstopo
Machine (126GB)
  Package L#0
    L3 L#0 (16MB)
      L2 L#0 (512KB) + L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
        PU L#0 (P#0)
        PU L#1 (P#64)
      L2 L#1 (512KB) + L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
        PU L#2 (P#1)
        PU L#3 (P#65)
      L2 L#2 (512KB) + L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2
        PU L#4 (P#2)
        PU L#5 (P#66)
      L2 L#3 (512KB) + L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3
        PU L#6 (P#3)
        PU L#7 (P#67)
    L3 L#1 (16MB)
      L2 L#4 (512KB) + L1d L#4 (32KB) + L1i L#4 (32KB) + Core L#4
        PU L#8 (P#4)
        PU L#9 (P#68)
      L2 L#5 (512KB) + L1d L#5 (32KB) + L1i L#5 (32KB) + Core L#5
        PU L#10 (P#5)
        PU L#11 (P#69)
      L2 L#6 (512KB) + L1d L#6 (32KB) + L1i L#6 (32KB) + Core L#6
        PU L#12 (P#6)
        PU L#13 (P#70)
      L2 L#7 (512KB) + L1d L#7 (32KB) + L1i L#7 (32KB) + Core L#7
        PU L#14 (P#7)
        PU L#15 (P#71)
    L3 L#2 (16MB)
      L2 L#8 (512KB) + L1d L#8 (32KB) + L1i L#8 (32KB) + Core L#8
        PU L#16 (P#8)
        PU L#17 (P#72)
      L2 L#9 (512KB) + L1d L#9 (32KB) + L1i L#9 (32KB) + Core L#9
        PU L#18 (P#9)
        PU L#19 (P#73)
      L2 L#10 (512KB) + L1d L#10 (32KB) + L1i L#10 (32KB) + Core L#10
        PU L#20 (P#10)
        PU L#21 (P#74)
      L2 L#11 (512KB) + L1d L#11 (32KB) + L1i L#11 (32KB) + Core L#11
        PU L#22 (P#11)
        PU L#23 (P#75)
    L3 L#3 (16MB)
      L2 L#12 (512KB) + L1d L#12 (32KB) + L1i L#12 (32KB) + Core L#12
        PU L#24 (P#12)
        PU L#25 (P#76)
      L2 L#13 (512KB) + L1d L#13 (32KB) + L1i L#13 (32KB) + Core L#13
        PU L#26 (P#13)
        PU L#27 (P#77)
      L2 L#14 (512KB) + L1d L#14 (32KB) + L1i L#14 (32KB) + Core L#14
        PU L#28 (P#14)
        PU L#29 (P#78)
      L2 L#15 (512KB) + L1d L#15 (32KB) + L1i L#15 (32KB) + Core L#15
        PU L#30 (P#15)
        PU L#31 (P#79)
    L3 L#4 (16MB)
      L2 L#16 (512KB) + L1d L#16 (32KB) + L1i L#16 (32KB) + Core L#16
        PU L#32 (P#16)
        PU L#33 (P#80)
      L2 L#17 (512KB) + L1d L#17 (32KB) + L1i L#17 (32KB) + Core L#17
        PU L#34 (P#17)
        PU L#35 (P#81)
      L2 L#18 (512KB) + L1d L#18 (32KB) + L1i L#18 (32KB) + Core L#18
        PU L#36 (P#18)
        PU L#37 (P#82)
      L2 L#19 (512KB) + L1d L#19 (32KB) + L1i L#19 (32KB) + Core L#19
        PU L#38 (P#19)
        PU L#39 (P#83)
    L3 L#5 (16MB)
      L2 L#20 (512KB) + L1d L#20 (32KB) + L1i L#20 (32KB) + Core L#20
        PU L#40 (P#20)
        PU L#41 (P#84)
      L2 L#21 (512KB) + L1d L#21 (32KB) + L1i L#21 (32KB) + Core L#21
        PU L#42 (P#21)
        PU L#43 (P#85)
      L2 L#22 (512KB) + L1d L#22 (32KB) + L1i L#22 (32KB) + Core L#22
        PU L#44 (P#22)
        PU L#45 (P#86)
      L2 L#23 (512KB) + L1d L#23 (32KB) + L1i L#23 (32KB) + Core L#23
        PU L#46 (P#23)
        PU L#47 (P#87)
    L3 L#6 (16MB)
      L2 L#24 (512KB) + L1d L#24 (32KB) + L1i L#24 (32KB) + Core L#24
        PU L#48 (P#24)
        PU L#49 (P#88)
      L2 L#25 (512KB) + L1d L#25 (32KB) + L1i L#25 (32KB) + Core L#25
        PU L#50 (P#25)
        PU L#51 (P#89)
      L2 L#26 (512KB) + L1d L#26 (32KB) + L1i L#26 (32KB) + Core L#26
        PU L#52 (P#26)
        PU L#53 (P#90)
      L2 L#27 (512KB) + L1d L#27 (32KB) + L1i L#27 (32KB) + Core L#27
        PU L#54 (P#27)
        PU L#55 (P#91)
    L3 L#7 (16MB)
      L2 L#28 (512KB) + L1d L#28 (32KB) + L1i L#28 (32KB) + Core L#28
        PU L#56 (P#28)
        PU L#57 (P#92)
      L2 L#29 (512KB) + L1d L#29 (32KB) + L1i L#29 (32KB) + Core L#29
        PU L#58 (P#29)
        PU L#59 (P#93)
      L2 L#30 (512KB) + L1d L#30 (32KB) + L1i L#30 (32KB) + Core L#30
        PU L#60 (P#30)
        PU L#61 (P#94)
      L2 L#31 (512KB) + L1d L#31 (32KB) + L1i L#31 (32KB) + Core L#31
        PU L#62 (P#31)
        PU L#63 (P#95)
    L3 L#8 (16MB)
      L2 L#32 (512KB) + L1d L#32 (32KB) + L1i L#32 (32KB) + Core L#32
        PU L#64 (P#32)
        PU L#65 (P#96)
      L2 L#33 (512KB) + L1d L#33 (32KB) + L1i L#33 (32KB) + Core L#33
        PU L#66 (P#33)
        PU L#67 (P#97)
      L2 L#34 (512KB) + L1d L#34 (32KB) + L1i L#34 (32KB) + Core L#34
        PU L#68 (P#34)
        PU L#69 (P#98)
      L2 L#35 (512KB) + L1d L#35 (32KB) + L1i L#35 (32KB) + Core L#35
        PU L#70 (P#35)
        PU L#71 (P#99)
    L3 L#9 (16MB)
      L2 L#36 (512KB) + L1d L#36 (32KB) + L1i L#36 (32KB) + Core L#36
        PU L#72 (P#36)
        PU L#73 (P#100)
      L2 L#37 (512KB) + L1d L#37 (32KB) + L1i L#37 (32KB) + Core L#37
        PU L#74 (P#37)
        PU L#75 (P#101)
      L2 L#38 (512KB) + L1d L#38 (32KB) + L1i L#38 (32KB) + Core L#38
        PU L#76 (P#38)
        PU L#77 (P#102)
      L2 L#39 (512KB) + L1d L#39 (32KB) + L1i L#39 (32KB) + Core L#39
        PU L#78 (P#39)
        PU L#79 (P#103)
    L3 L#10 (16MB)
      L2 L#40 (512KB) + L1d L#40 (32KB) + L1i L#40 (32KB) + Core L#40
        PU L#80 (P#40)
        PU L#81 (P#104)
      L2 L#41 (512KB) + L1d L#41 (32KB) + L1i L#41 (32KB) + Core L#41
        PU L#82 (P#41)
        PU L#83 (P#105)
      L2 L#42 (512KB) + L1d L#42 (32KB) + L1i L#42 (32KB) + Core L#42
        PU L#84 (P#42)
        PU L#85 (P#106)
      L2 L#43 (512KB) + L1d L#43 (32KB) + L1i L#43 (32KB) + Core L#43
        PU L#86 (P#43)
        PU L#87 (P#107)
    L3 L#11 (16MB)
      L2 L#44 (512KB) + L1d L#44 (32KB) + L1i L#44 (32KB) + Core L#44
        PU L#88 (P#44)
        PU L#89 (P#108)
      L2 L#45 (512KB) + L1d L#45 (32KB) + L1i L#45 (32KB) + Core L#45
        PU L#90 (P#45)
        PU L#91 (P#109)
      L2 L#46 (512KB) + L1d L#46 (32KB) + L1i L#46 (32KB) + Core L#46
        PU L#92 (P#46)
        PU L#93 (P#110)
      L2 L#47 (512KB) + L1d L#47 (32KB) + L1i L#47 (32KB) + Core L#47
        PU L#94 (P#47)
        PU L#95 (P#111)
    L3 L#12 (16MB)
      L2 L#48 (512KB) + L1d L#48 (32KB) + L1i L#48 (32KB) + Core L#48
        PU L#96 (P#48)
        PU L#97 (P#112)
      L2 L#49 (512KB) + L1d L#49 (32KB) + L1i L#49 (32KB) + Core L#49
        PU L#98 (P#49)
        PU L#99 (P#113)
      L2 L#50 (512KB) + L1d L#50 (32KB) + L1i L#50 (32KB) + Core L#50
        PU L#100 (P#50)
        PU L#101 (P#114)
      L2 L#51 (512KB) + L1d L#51 (32KB) + L1i L#51 (32KB) + Core L#51
        PU L#102 (P#51)
        PU L#103 (P#115)
    L3 L#13 (16MB)
      L2 L#52 (512KB) + L1d L#52 (32KB) + L1i L#52 (32KB) + Core L#52
        PU L#104 (P#52)
        PU L#105 (P#116)
      L2 L#53 (512KB) + L1d L#53 (32KB) + L1i L#53 (32KB) + Core L#53
        PU L#106 (P#53)
        PU L#107 (P#117)
      L2 L#54 (512KB) + L1d L#54 (32KB) + L1i L#54 (32KB) + Core L#54
        PU L#108 (P#54)
        PU L#109 (P#118)
      L2 L#55 (512KB) + L1d L#55 (32KB) + L1i L#55 (32KB) + Core L#55
        PU L#110 (P#55)
        PU L#111 (P#119)
    L3 L#14 (16MB)
      L2 L#56 (512KB) + L1d L#56 (32KB) + L1i L#56 (32KB) + Core L#56
        PU L#112 (P#56)
        PU L#113 (P#120)
      L2 L#57 (512KB) + L1d L#57 (32KB) + L1i L#57 (32KB) + Core L#57
        PU L#114 (P#57)
        PU L#115 (P#121)
      L2 L#58 (512KB) + L1d L#58 (32KB) + L1i L#58 (32KB) + Core L#58
        PU L#116 (P#58)
        PU L#117 (P#122)
      L2 L#59 (512KB) + L1d L#59 (32KB) + L1i L#59 (32KB) + Core L#59
        PU L#118 (P#59)
        PU L#119 (P#123)
    L3 L#15 (16MB)
      L2 L#60 (512KB) + L1d L#60 (32KB) + L1i L#60 (32KB) + Core L#60
        PU L#120 (P#60)
        PU L#121 (P#124)
      L2 L#61 (512KB) + L1d L#61 (32KB) + L1i L#61 (32KB) + Core L#61
        PU L#122 (P#61)
        PU L#123 (P#125)
      L2 L#62 (512KB) + L1d L#62 (32KB) + L1i L#62 (32KB) + Core L#62
        PU L#124 (P#62)
        PU L#125 (P#126)
      L2 L#63 (512KB) + L1d L#63 (32KB) + L1i L#63 (32KB) + Core L#63
        PU L#126 (P#63)
        PU L#127 (P#127)
  HostBridge L#0
    PCIBridge
      PCI 10de:1e02
        GPU L#0 "renderD128"
        GPU L#1 "card0"
  HostBridge L#2
    PCIBridge
      PCI 10de:1e02
        GPU L#2 "card1"
        GPU L#3 "renderD129"
  HostBridge L#4
    PCIBridge
      PCIBridge
        PCIBridge
          PCI 1d6a:07b1
            Net L#4 "eth0"
        PCIBridge
          PCI 8086:2723
            Net L#5 "wlan1"
        PCIBridge
          PCI 10ec:8125
        PCIBridge
          PCI 1022:7901
            Block(Disk) L#6 "sda"
            Block(Other) L#7 "sr0"
        PCIBridge
          PCI 1022:7901
    PCIBridge
      PCI 10de:1e07
        GPU L#8 "renderD130"
        GPU L#9 "card2"
    PCIBridge
      PCI 10de:1e07
        GPU L#10 "card3"
        GPU L#11 "renderD131"
[chibi@centos8 ~]$ free
              total        used        free      shared  buff/cache   available
Mem:      131596828     1926040   121639248       21000     8031540   128429668
Swap:             0           0           0
[chibi@centos8 ~]$ sensors
iwlwifi-virtual-0
Adapter: Virtual device
temp1:        +32.0°C

eth0-pci-4400
Adapter: PCI adapter
PHY Temperature:  +48.1°C

k10temp-pci-00c3
Adapter: PCI adapter
Tdie:         +41.2°C  (high = +70.0°C)
Tctl:         +41.2°C

[chibi@centos8 ~]$ sudo hddtemp /dev/sda
[sudo] chibi のパスワード:
/dev/sda: TS128GSSD370S: 25°C
[chibi@centos8 ~]$ nvidia-smi nvlink -c
GPU 0: TITAN RTX (UUID: GPU-5a71d61e-f130-637a-b33d-4df555b0ed88)
GPU 1: TITAN RTX (UUID: GPU-7fb51c1d-c1e7-35cc-aad7-66971f05ddb7)
GPU 2: GeForce RTX 2080 Ti (UUID: GPU-13277ce5-e1e9-0cb1-8cee-6c9e6618e774)
GPU 3: GeForce RTX 2080 Ti (UUID: GPU-1ac935c2-557f-282e-14e5-3f749ffd63ac)
[chibi@centos8 ~]$ sensors
iwlwifi-virtual-0
Adapter: Virtual device
temp1:        +32.0°C

eth0-pci-4400
Adapter: PCI adapter
PHY Temperature:  +47.5°C

k10temp-pci-00c3
Adapter: PCI adapter
Tdie:         +40.2°C  (high = +70.0°C)
Tctl:         +40.2°C

[chibi@centos8 ~]$