[chibi@centos8 ~]$ sudo nvidia-docker run --rm -ti nvcr.io/nvidia/tensorflow:19.04-py3
Unable to find image 'nvcr.io/nvidia/tensorflow:19.04-py3' locally
19.04-py3: Pulling from nvidia/tensorflow
34667c7e4631: Pulling fs layer
d18d76a881a4: Pulling fs layer
119c7358fbfc: Pulling fs layer
2aaf13f3eff0: Pulling fs layer
202fa0f8874b: Pulling fs layer
3b700a61ede6: Waiting
2aaf13f3eff0: Waiting
a1e76dce1aec: Pulling fs layer
9b91fa2f9276: Pulling fs layer
b5877a9add73: Pulling fs layer
b5877a9add73: Waiting
534bbf505504: Pulling fs layer
4956bf3bbbb9: Pulling fs layer
f4371944c97d: Pulling fs layer
4615a735431d: Pulling fs layer
5db2639932b5: Pulling fs layer
629d5c9d75a4: Pulling fs layer
8071b94b5429: Pulling fs layer
6eb8eba2ad5a: Waiting
e32e86c15b8b: Waiting
08db5b51b243: Waiting
f71ce95fb406: Waiting
3498ed8c5685: Waiting
bab74df105f1: Waiting
34bc85bf8bef: Pulling fs layer
534bbf505504: Waiting
41bc2d0a4d4d: Pulling fs layer
a2ceadc61854: Pulling fs layer
4956bf3bbbb9: Waiting
a531832992b8: Pulling fs layer
f4371944c97d: Waiting
4615a735431d: Waiting
e5cafe011f22: Pull complete
eca19a329cd4: Pull complete
65ee50af0bcc: Pull complete
5f60ec8c32f4: Pull complete
d7dcb657fa13: Pull complete
1f6ef6575fbe: Pull complete
d1ef346a3015: Pull complete
4ef9cb404fd5: Pull complete
f6797f45a018: Pull complete
1d4380527325: Pull complete
965f2629db02: Pull complete
5debff4c8c0a: Pull complete
b3a3a9d82be6: Pull complete
eac05f20b729: Pull complete
3ce0a7f80167: Pull complete
2a21e34a5784: Pull complete
c1ccf19e258e: Pull complete
0b6ea9d0652b: Pull complete
307bc8c3f024: Pull complete
ca75fd593a79: Pull complete
0cd3cdca1af7: Pull complete
48e857e9d372: Pull complete
3264ea403ca9: Pull complete
Digest: sha256:aaebc136d5d50937362675c77afd908bd96cded68846f39163050a023c8a9851
Status: Downloaded newer image for nvcr.io/nvidia/tensorflow:19.04-py3

================
== TensorFlow ==
================

NVIDIA Release 19.04 (build 6132408)
TensorFlow Version 1.13.1

Container image Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
Copyright 2017-2019 The TensorFlow Authors. All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION. All rights reserved.
NVIDIA modifications are covered by the license terms that apply to the underlying project or file.

NOTE: MOFED driver for multi-node communication was not detected.
      Multi-node communication performance may be reduced.

NOTE: The SHMEM allocation limit is set to the default of 64MB. This may be
      insufficient for TensorFlow. NVIDIA recommends the use of the following flags:
      nvidia-docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 ...
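The SHMEM note above is worth acting on for longer jobs: the container starts with the 64 MB default, which NVIDIA warns may be insufficient for TensorFlow. A minimal re-launch with the recommended flags (same image tag; the flags are taken verbatim from the note) would look like:

[chibi@centos8 ~]$ sudo nvidia-docker run --rm -ti --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorflow:19.04-py3

The short benchmark runs below proceed with the defaults and show no shared-memory errors.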
root@ad8c883cdd71:/workspace# ls
README.md  docker-examples  nvidia-examples
root@ad8c883cdd71:/workspace# cd nvidia-examples
root@ad8c883cdd71:/workspace/nvidia-examples# ls
NCF              bert                 cnn           ssdv1.2
OpenSeq2Seq      big_lstm             gnmt_v2       tensorrt
UNet_Industrial  build_imagenet_data  resnet50v1.5
root@ad8c883cdd71:/workspace/nvidia-examples# cd big_lstm
root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm# ls
1b_word_vocab.txt  data_utils_test.py         language_model_test.py
README.md          download_1b_words_data.sh  model_utils.py
__init__.py        hparams.py                 run_utils.py
common.py          hparams_test.py            single_lm_train.py
data_utils.py      language_model.py          testdata
root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm# ./download_1b_words_data.sh
Please specify root of dataset directory: data
Success: dataset root dir validated
--2020-06-22 19:35:48--  http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1792209805 (1.7G) [application/x-gzip]
Saving to: ‘1-billion-word-language-modeling-benchmark-r13output.tar.gz’

1-billion-word-lang 100%[===================>]   1.67G   546KB/s    in 53m 21s

2020-06-22 20:29:10 (547 KB/s) - ‘1-billion-word-language-modeling-benchmark-r13output.tar.gz’ saved [1792209805/1792209805]

1-billion-word-language-modeling-benchmark-r13output/
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00050-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
1-billion-word-language-modeling-benchmark-r13output/.svn/
1-billion-word-language-modeling-benchmark-r13output/.svn/tmp/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/de/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/de/de102cd0c91cd19e6612f0840e68a2f20ba8134c.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/de/deed1b75d3bd5cc36ae6aeb85d56680b892b7948.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/86/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/86/86c58db52fbf362c5bc329afc33b8805085fcb0d.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9f/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9f/9f2882e21f860a83ad6ea8898ebab140974ed301.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/bc/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/bc/bcdbc523ee7488dc438cab869b6d5e236578dbfa.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d2/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d2/d2718bc26d0ee0a213d7d4add99a304cb5b39ede.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c5/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c5/c5b24f61479da923123d0394a188da922ea0359c.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/11/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/11/116d6ea61730d8199127596b072e981338597779.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/b0/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/b0/b0e26559cfe641245584a9400b35ba28d64f1411.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d3/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/d3/d3ae508e3bcb0e696dd70aecd052410f1f7afc1d.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9e/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/9e/9e148bd766e8805e0eb97eeae250433ec7a2e996.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/31/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/31/31b645a482e0b81fda3c567cada307c6fcf7ec80.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/da/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/da/da39a3ee5e6b4b0d3255bfef95601890afd80709.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c1/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/c1/c1ed42c415ec884e591fb5c70d373da640a383b5.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/e3/
1-billion-word-language-modeling-benchmark-r13output/.svn/pristine/e3/e37ba0f85e94073ccaced1eed7e4f5d737a25f49.svn-base
1-billion-word-language-modeling-benchmark-r13output/.svn/entries
1-billion-word-language-modeling-benchmark-r13output/.svn/format
1-billion-word-language-modeling-benchmark-r13output/.svn/wc.db
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00015-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00031-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00027-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00010-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00033-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00042-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00046-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00037-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00029-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00013-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00002-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00048-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00006-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00030-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00025-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00039-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00008-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00020-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00001-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00034-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00044-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00045-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00016-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00004-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00035-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00038-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00009-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00024-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00022-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00021-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00032-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00011-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00049-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00041-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00019-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00023-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00040-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00014-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00007-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00017-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00012-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00018-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00003-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00028-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en-00000-of-00100
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00043-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00005-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00036-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00026-of-00050
1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en.heldout-00047-of-00050
1-billion-word-language-modeling-benchmark-r13output/README
Success! One billion words dataset ready at:
data/1-billion-word-language-modeling-benchmark-r13output/
Please pass this dir to single_lm_train.py via the --datadir option.
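Each training run below stops itself after about three minutes: the printed hyperparameters include 'max_time': 180, and the example exits (closing its FileWriter) once that wall-clock budget is spent. The upstream big_lstm code exposes an --hpconfig flag for overriding hyperparameters; assuming the NVIDIA copy keeps it (check `python single_lm_train.py --help`), a longer measurement could be requested with something like:

root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm# python single_lm_train.py --mode=train --logdir=./logs --num_gpus=4 --datadir=./data/1-billion-word-language-modeling-benchmark-r13output --hpconfig max_time=600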
root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm# time python single_lm_train.py --mode=train --logdir=./logs --num_gpus=4 --datadir=./data/1-billion-word-language-modeling-benchmark-r13output

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

*****HYPER PARAMETERS*****
{'num_gpus': 4, 'emb_size': 512, 'num_sampled': 8192, 'num_delayed_steps': 150, 'average_params': True, 'batch_size': 128, 'num_shards': 8, 'num_layers': 1, 'optimizer': 0, 'do_summaries': False, 'max_time': 180, 'projected_size': 512, 'state_size': 2048, 'num_steps': 20, 'vocab_size': 793470, 'max_grad_norm': 10.0, 'run_profiler': False, 'keep_prob': 0.9, 'learning_rate': 0.2}
**************************
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/model_utils.py:33: UniformUnitScaling.__init__ (from tensorflow.python.ops.init_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:107: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_impl.py:1444: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Current time: 1592858675.3814945
ALL VARIABLES
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:18: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
model/emb_0:0 (99184, 512) /gpu:0
model/emb_1:0 (99184, 512) /gpu:0
model/emb_2:0 (99184, 512) /gpu:0
model/emb_3:0 (99184, 512) /gpu:0
model/emb_4:0 (99184, 512) /gpu:0
model/emb_5:0 (99184, 512) /gpu:0
model/emb_6:0 (99184, 512) /gpu:0
model/emb_7:0 (99184, 512) /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) /gpu:0
model/softmax_w_0:0 (99184, 512) /gpu:0
model/softmax_w_1:0 (99184, 512) /gpu:0
model/softmax_w_2:0 (99184, 512) /gpu:0
model/softmax_w_3:0 (99184, 512) /gpu:0
model/softmax_w_4:0 (99184, 512) /gpu:0
model/softmax_w_5:0 (99184, 512) /gpu:0
model/softmax_w_6:0 (99184, 512) /gpu:0
model/softmax_w_7:0 (99184, 512) /gpu:0
model/softmax_b:0 (793470,) /gpu:0
model/global_step:0 ()
model/model/emb_0/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_1/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_2/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_3/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_4/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_5/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_6/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_7/Adagrad:0 (99184, 512) /gpu:0
model/model/lstm_0/LSTMCell/W_0/Adagrad:0 (1024, 8192) /gpu:0
model/model/lstm_0/LSTMCell/B/Adagrad:0 (8192,) /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/Adagrad:0 (2048, 512) /gpu:0
model/model/softmax_w_0/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_1/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_2/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_3/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_4/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_5/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_6/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_7/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_b/Adagrad:0 (793470,) /gpu:0
model/model/lstm_0/LSTMCell/W_0/ExponentialMovingAverage:0 (1024, 8192) /gpu:0
model/model/lstm_0/LSTMCell/B/ExponentialMovingAverage:0 (8192,) /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/ExponentialMovingAverage:0 (2048, 512) /gpu:0
TRAINABLE VARIABLES
model/emb_0:0 (99184, 512) /gpu:0
model/emb_1:0 (99184, 512) /gpu:0
model/emb_2:0 (99184, 512) /gpu:0
model/emb_3:0 (99184, 512) /gpu:0
model/emb_4:0 (99184, 512) /gpu:0
model/emb_5:0 (99184, 512) /gpu:0
model/emb_6:0 (99184, 512) /gpu:0
model/emb_7:0 (99184, 512) /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) /gpu:0
model/softmax_w_0:0 (99184, 512) /gpu:0
model/softmax_w_1:0 (99184, 512) /gpu:0
model/softmax_w_2:0 (99184, 512) /gpu:0
model/softmax_w_3:0 (99184, 512) /gpu:0
model/softmax_w_4:0 (99184, 512) /gpu:0
model/softmax_w_5:0 (99184, 512) /gpu:0
model/softmax_w_6:0 (99184, 512) /gpu:0
model/softmax_w_7:0 (99184, 512) /gpu:0
model/softmax_b:0 (793470,) /gpu:0
LOCAL VARIABLES
model/model/state_0_0:0 (128, 2560) /gpu:0
model/model_1/state_1_0:0 (128, 2560) /gpu:1
model/model_2/state_2_0:0 (128, 2560) /gpu:2
model/model_3/state_3_0:0 (128, 2560) /gpu:3
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:32: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2020-06-22 20:44:36.053499: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2900005000 Hz
2020-06-22 20:44:36.060095: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xa58d3e0 executing computations on platform Host. Devices:
2020-06-22 20:44:36.060141: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): <undefined>, <undefined>
2020-06-22 20:44:36.509671: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-06-22 20:44:36.552030: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-06-22 20:44:36.558916: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-06-22 20:44:36.559821: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xa58c820 executing computations on platform CUDA. Devices:
2020-06-22 20:44:36.559845: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): TITAN RTX, Compute Capability 7.5
2020-06-22 20:44:36.559851: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (1): TITAN RTX, Compute Capability 7.5
2020-06-22 20:44:36.559856: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (2): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-06-22 20:44:36.559862: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (3): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-06-22 20:44:36.560861: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:01:00.0
totalMemory: 23.65GiB freeMemory: 23.22GiB
2020-06-22 20:44:36.560891: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 1 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:21:00.0
totalMemory: 23.65GiB freeMemory: 23.49GiB
2020-06-22 20:44:36.560915: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 2 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4a:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-06-22 20:44:36.560938: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 3 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4b:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-06-22 20:44:36.560961: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3
2020-06-22 20:44:37.198656: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-06-22 20:44:37.198696: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990]      0 1 2 3
2020-06-22 20:44:37.198703: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0:   N N N N
2020-06-22 20:44:37.198709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1:   N N N N
2020-06-22 20:44:37.198713: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2:   N N N N
2020-06-22 20:44:37.198717: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3:   N N N N
2020-06-22 20:44:37.198860: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 22507 MB memory) -> physical GPU (device: 0, name: TITAN RTX, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-06-22 20:44:37.199213: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 22765 MB memory) -> physical GPU (device: 1, name: TITAN RTX, pci bus id: 0000:21:00.0, compute capability: 7.5)
2020-06-22 20:44:37.199467: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 10231 MB memory) -> physical GPU (device: 2, name: GeForce RTX 2080 Ti, pci bus id: 0000:4a:00.0, compute capability: 7.5)
2020-06-22 20:44:37.199720: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 10231 MB memory) -> physical GPU (device: 3, name: GeForce RTX 2080 Ti, pci bus id: 0000:4b:00.0, compute capability: 7.5)
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
Finished processing!
2020-06-22 20:44:56.619489: I tensorflow/stream_executor/dso_loader.cc:153] successfully opened CUDA library libcublas.so.10 locally
Iteration 1, time = 9.96s, wps = 1029, train loss = 12.9931
Iteration 2, time = 7.91s, wps = 1295, train loss = 12.9280
Iteration 3, time = 0.10s, wps = 106180, train loss = 12.8353
Iteration 4, time = 0.10s, wps = 100420, train loss = 12.5591
Iteration 5, time = 0.11s, wps = 95389, train loss = 12.6864
Iteration 6, time = 0.09s, wps = 115991, train loss = 11.2054
Iteration 7, time = 0.09s, wps = 107874, train loss = 87.8683
Iteration 8, time = 0.10s, wps = 106682, train loss = 34.8211
Iteration 9, time = 0.09s, wps = 110045, train loss = 17.4595
Iteration 20, time = 1.02s, wps = 110158, train loss = 11.8758
Iteration 40, time = 1.86s, wps = 109985, train loss = 9.6805
Iteration 60, time = 1.88s, wps = 108716, train loss = 8.9143
Iteration 80, time = 1.87s, wps = 109548, train loss = 8.2828
Iteration 100, time = 1.87s, wps = 109529, train loss = 8.0193
Iteration 120, time = 1.86s, wps = 110272, train loss = 7.4370
Iteration 140, time = 1.88s, wps = 109169, train loss = 7.1279
Iteration 160, time = 1.87s, wps = 109411, train loss = 6.7670
Iteration 180, time = 1.86s, wps = 109982, train loss = 6.7183
Iteration 200, time = 1.86s, wps = 110370, train loss = 6.6514
Iteration 220, time = 1.87s, wps = 109502, train loss = 6.4099
Iteration 240, time = 1.86s, wps = 110098, train loss = 6.2664
Iteration 260, time = 1.86s, wps = 110132, train loss = 6.1636
Iteration 280, time = 1.87s, wps = 109686, train loss = 6.1795
Iteration 300, time = 1.86s, wps = 109937, train loss = 6.0657
Iteration 320, time = 1.88s, wps = 108929, train loss = 6.1214
Iteration 340, time = 1.89s, wps = 108405, train loss = 6.0549
Iteration 360, time = 1.88s, wps = 108855, train loss = 5.8912
Iteration 380, time = 1.88s, wps = 109214, train loss = 5.9306
Iteration 400, time = 1.87s, wps = 109328, train loss = 5.8835
Iteration 420, time = 1.87s, wps = 109767, train loss = 5.8815
Iteration 440, time = 1.87s, wps = 109707, train loss = 5.8202
Iteration 460, time = 1.88s, wps = 108652, train loss = 5.7747
Iteration 480, time = 1.88s, wps = 108739, train loss = 5.6940
Iteration 500, time = 1.88s, wps = 109216, train loss = 5.7500
Iteration 520, time = 1.88s, wps = 109034, train loss = 5.7221
Iteration 540, time = 1.88s, wps = 109052, train loss = 5.6832
Iteration 560, time = 1.86s, wps = 110105, train loss = 5.6650
Iteration 580, time = 1.87s, wps = 109631, train loss = 5.6015
Iteration 600, time = 1.87s, wps = 109583, train loss = 5.6614
Iteration 620, time = 1.87s, wps = 109373, train loss = 5.5329
Iteration 640, time = 1.89s, wps = 108317, train loss = 5.6230
Iteration 660, time = 1.88s, wps = 108983, train loss = 5.5817
Iteration 680, time = 1.87s, wps = 109627, train loss = 5.4870
Iteration 700, time = 1.88s, wps = 108809, train loss = 5.4730
Iteration 720, time = 1.88s, wps = 109179, train loss = 5.4817
Iteration 740, time = 1.86s, wps = 109938, train loss = 5.5010
Iteration 760, time = 1.87s, wps = 109242, train loss = 5.4541
Iteration 780, time = 1.89s, wps = 108316, train loss = 5.4185
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
Finished processing!
Iteration 800, time = 3.49s, wps = 58721, train loss = 5.3786
Iteration 820, time = 1.89s, wps = 108554, train loss = 5.3659
Iteration 840, time = 1.87s, wps = 109442, train loss = 5.4025
Iteration 860, time = 1.87s, wps = 109757, train loss = 5.3293
Iteration 880, time = 1.88s, wps = 108754, train loss = 5.3402
Iteration 900, time = 1.88s, wps = 108764, train loss = 5.2727
Iteration 920, time = 1.89s, wps = 108172, train loss = 5.3284
Iteration 940, time = 1.87s, wps = 109346, train loss = 5.3269
Iteration 960, time = 1.88s, wps = 108839, train loss = 5.2533
Iteration 980, time = 1.89s, wps = 108438, train loss = 5.2419
Iteration 1000, time = 1.88s, wps = 108855, train loss = 5.1476
Iteration 1020, time = 1.89s, wps = 108092, train loss = 5.1798
Iteration 1040, time = 1.87s, wps = 109395, train loss = 5.2383
Iteration 1060, time = 1.89s, wps = 108439, train loss = 5.2095
Iteration 1080, time = 1.88s, wps = 108740, train loss = 5.1208
Iteration 1100, time = 1.87s, wps = 109351, train loss = 5.1614
Iteration 1120, time = 1.88s, wps = 108766, train loss = 5.1756
Iteration 1140, time = 1.88s, wps = 108789, train loss = 5.1507
Iteration 1160, time = 1.88s, wps = 109103, train loss = 5.1384
Iteration 1180, time = 1.89s, wps = 108446, train loss = 5.0934
Iteration 1200, time = 1.91s, wps = 107505, train loss = 5.1221
Iteration 1220, time = 1.86s, wps = 109864, train loss = 5.1332
Iteration 1240, time = 1.87s, wps = 109642, train loss = 5.0789
Iteration 1260, time = 1.87s, wps = 109535, train loss = 5.0189
Iteration 1280, time = 1.91s, wps = 107327, train loss = 5.0716
Iteration 1300, time = 1.88s, wps = 108908, train loss = 5.0577
Iteration 1320, time = 1.87s, wps = 109568, train loss = 5.0419
Iteration 1340, time = 1.90s, wps = 107571, train loss = 5.0607
Iteration 1360, time = 1.90s, wps = 107672, train loss = 5.0227
Iteration 1380, time = 1.90s, wps = 107579, train loss = 4.9981
Iteration 1400, time = 1.88s, wps = 108895, train loss = 4.9326
Iteration 1420, time = 1.89s, wps = 108510, train loss = 4.9529
Iteration 1440, time = 1.88s, wps = 108675, train loss = 4.9727
Iteration 1460, time = 1.88s, wps = 109052, train loss = 4.9476
Iteration 1480, time = 1.87s, wps = 109450, train loss = 4.8972
Iteration 1500, time = 1.88s, wps = 109098, train loss = 4.9608
Iteration 1520, time = 1.89s, wps = 108374, train loss = 4.9122
Iteration 1540, time = 1.89s, wps = 108577, train loss = 4.9157
Iteration 1560, time = 1.89s, wps = 108511, train loss = 4.9409
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
Finished processing!
/usr/local/lib/python3.5/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened.
  warnings.warn("Attempting to use a closed FileWriter. "

real    3m14.852s
user    23m39.660s
sys     4m50.203s
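The run above stopped on its 180-second budget, so words per second (wps) is the figure to compare across GPU counts; by eye, the four-GPU run settles around 109,000-110,000 wps. Note also that the three-GPU run below resumes from the checkpoint left in ./logs, which is why its iteration counter continues from 1580. A small sketch for averaging steady-state wps, assuming the console output of a run was captured to a file such as train_4gpu.log (hypothetical name, e.g. via `|& tee train_4gpu.log`); the `> 10000` guard simply skips the slow warm-up and restart iterations:

root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm# awk -F'wps = ' '/^Iteration/ { split($2, a, ","); if (a[1] + 0 > 10000) { s += a[1]; n++ } } END { if (n) printf "mean wps over %d iterations: %.0f\n", n, s / n }' train_4gpu.log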
root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm# time python single_lm_train.py --mode=train --logdir=./logs --num_gpus=3 --datadir=./data/1-billion-word-language-modeling-benchmark-r13output

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

*****HYPER PARAMETERS*****
{'num_steps': 20, 'state_size': 2048, 'average_params': True, 'emb_size': 512, 'run_profiler': False, 'num_layers': 1, 'optimizer': 0, 'batch_size': 128, 'do_summaries': False, 'learning_rate': 0.2, 'num_delayed_steps': 150, 'num_sampled': 8192, 'num_gpus': 3, 'max_grad_norm': 10.0, 'max_time': 180, 'vocab_size': 793470, 'projected_size': 512, 'keep_prob': 0.9, 'num_shards': 8}
**************************
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/model_utils.py:33: UniformUnitScaling.__init__ (from tensorflow.python.ops.init_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:107: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_impl.py:1444: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Current time: 1592859945.2676558
ALL VARIABLES
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:18: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
model/emb_0:0 (99184, 512) /gpu:0
model/emb_1:0 (99184, 512) /gpu:0
model/emb_2:0 (99184, 512) /gpu:0
model/emb_3:0 (99184, 512) /gpu:0
model/emb_4:0 (99184, 512) /gpu:0
model/emb_5:0 (99184, 512) /gpu:0
model/emb_6:0 (99184, 512) /gpu:0
model/emb_7:0 (99184, 512) /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) /gpu:0
model/softmax_w_0:0 (99184, 512) /gpu:0
model/softmax_w_1:0 (99184, 512) /gpu:0
model/softmax_w_2:0 (99184, 512) /gpu:0
model/softmax_w_3:0 (99184, 512) /gpu:0
model/softmax_w_4:0 (99184, 512) /gpu:0
model/softmax_w_5:0 (99184, 512) /gpu:0
model/softmax_w_6:0 (99184, 512) /gpu:0
model/softmax_w_7:0 (99184, 512) /gpu:0
model/softmax_b:0 (793470,) /gpu:0
model/global_step:0 ()
model/model/emb_0/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_1/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_2/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_3/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_4/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_5/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_6/Adagrad:0 (99184, 512) /gpu:0
model/model/emb_7/Adagrad:0 (99184, 512) /gpu:0
model/model/lstm_0/LSTMCell/W_0/Adagrad:0 (1024, 8192) /gpu:0
model/model/lstm_0/LSTMCell/B/Adagrad:0 (8192,) /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/Adagrad:0 (2048, 512) /gpu:0
model/model/softmax_w_0/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_1/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_2/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_3/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_4/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_5/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_6/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_w_7/Adagrad:0 (99184, 512) /gpu:0
model/model/softmax_b/Adagrad:0 (793470,) /gpu:0
model/model/lstm_0/LSTMCell/W_0/ExponentialMovingAverage:0 (1024, 8192) /gpu:0
model/model/lstm_0/LSTMCell/B/ExponentialMovingAverage:0 (8192,) /gpu:0
model/model/lstm_0/LSTMCell/W_P_0/ExponentialMovingAverage:0 (2048, 512) /gpu:0
TRAINABLE VARIABLES
model/emb_0:0 (99184, 512) /gpu:0
model/emb_1:0 (99184, 512) /gpu:0
model/emb_2:0 (99184, 512) /gpu:0
model/emb_3:0 (99184, 512) /gpu:0
model/emb_4:0 (99184, 512) /gpu:0
model/emb_5:0 (99184, 512) /gpu:0
model/emb_6:0 (99184, 512) /gpu:0
model/emb_7:0 (99184, 512) /gpu:0
model/lstm_0/LSTMCell/W_0:0 (1024, 8192) /gpu:0
model/lstm_0/LSTMCell/B:0 (8192,) /gpu:0
model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) /gpu:0
model/softmax_w_0:0 (99184, 512) /gpu:0
model/softmax_w_1:0 (99184, 512) /gpu:0
model/softmax_w_2:0 (99184, 512) /gpu:0
model/softmax_w_3:0 (99184, 512) /gpu:0
model/softmax_w_4:0 (99184, 512) /gpu:0
model/softmax_w_5:0 (99184, 512) /gpu:0
model/softmax_w_6:0 (99184, 512) /gpu:0
model/softmax_w_7:0 (99184, 512) /gpu:0
model/softmax_b:0 (793470,) /gpu:0
LOCAL VARIABLES
model/model/state_0_0:0 (128, 2560) /gpu:0
model/model_1/state_1_0:0 (128, 2560) /gpu:1
model/model_2/state_2_0:0 (128, 2560) /gpu:2
WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:32: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2020-06-22 21:05:45.803511: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2900005000 Hz
2020-06-22 21:05:45.809886: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xab676f0 executing computations on platform Host. Devices:
2020-06-22 21:05:45.809930: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): <undefined>, <undefined>
2020-06-22 21:05:46.250343: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-06-22 21:05:46.285195: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-06-22 21:05:46.292171: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-06-22 21:05:46.293073: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0xab67110 executing computations on platform CUDA. Devices:
2020-06-22 21:05:46.293105: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (0): TITAN RTX, Compute Capability 7.5
2020-06-22 21:05:46.293112: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (1): TITAN RTX, Compute Capability 7.5
2020-06-22 21:05:46.293120: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (2): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-06-22 21:05:46.293126: I tensorflow/compiler/xla/service/service.cc:168]   StreamExecutor device (3): GeForce RTX 2080 Ti, Compute Capability 7.5
2020-06-22 21:05:46.294163: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:01:00.0
totalMemory: 23.65GiB freeMemory: 23.22GiB
2020-06-22 21:05:46.294192: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 1 with properties:
name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77
pciBusID: 0000:21:00.0
totalMemory: 23.65GiB freeMemory: 23.49GiB
2020-06-22 21:05:46.294215: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 2 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4a:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-06-22 21:05:46.294237: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 3 with properties:
name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635
pciBusID: 0000:4b:00.0
totalMemory: 10.76GiB freeMemory: 10.61GiB
2020-06-22 21:05:46.294262: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3
2020-06-22 21:05:46.931236: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-06-22 21:05:46.931282: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990]      0 1 2 3
2020-06-22 21:05:46.931287: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0:   N N N N
2020-06-22 21:05:46.931293: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1:   N N N N
2020-06-22 21:05:46.931297: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2:   N N N N
2020-06-22 21:05:46.931302: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3:   N N N N
2020-06-22 21:05:46.931443: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 22507 MB memory) -> physical GPU (device: 0, name: TITAN RTX, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-06-22 21:05:46.931739: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 22765 MB memory) -> physical GPU (device: 1, name: TITAN RTX, pci bus id: 0000:21:00.0, compute capability: 7.5)
2020-06-22 21:05:46.931965: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 10231 MB memory) -> physical GPU (device: 2, name: GeForce RTX 2080 Ti, pci bus id: 0000:4a:00.0, compute capability: 7.5)
2020-06-22 21:05:46.932107: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 10231 MB memory) -> physical GPU (device: 3, name: GeForce RTX 2080 Ti, pci bus id: 0000:4b:00.0, compute capability: 7.5)
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1070: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
Finished processing!
2020-06-22 21:05:59.497913: I tensorflow/stream_executor/dso_loader.cc:153] successfully opened CUDA library libcublas.so.10 locally
Iteration 1580, time = 7.88s, wps = 974, train loss = 5.4991
Iteration 1581, time = 5.90s, wps = 1302, train loss = 5.1368
Iteration 1582, time = 0.09s, wps = 90350, train loss = 4.9570
Iteration 1583, time = 0.08s, wps = 97957, train loss = 4.9906
Iteration 1584, time = 0.08s, wps = 99009, train loss = 4.9420
Iteration 1585, time = 0.08s, wps = 101475, train loss = 4.9719
Iteration 1586, time = 0.08s, wps = 99416, train loss = 4.9348
Iteration 1587, time = 0.08s, wps = 102049, train loss = 4.8371
Iteration 1588, time = 0.08s, wps = 101529, train loss = 4.9271
Iteration 1599, time = 0.84s, wps = 101104, train loss = 4.9587
Iteration 1619, time = 1.51s, wps = 101559, train loss = 4.9857
Iteration 1639, time = 1.51s, wps = 101449, train loss = 4.8869
Iteration 1659, time = 1.52s, wps = 101230, train loss = 4.8786
Iteration 1679, time = 1.51s, wps = 101720, train loss = 4.8949
Iteration 1699, time = 1.51s, wps = 101549, train loss = 4.9194
Iteration 1719, time = 1.50s, wps = 102633, train loss = 4.8692
Iteration 1739, time = 1.51s, wps = 101728, train loss = 4.8893
Iteration 1759, time = 1.51s, wps = 101647, train loss = 4.8610
Iteration 1779, time = 1.51s, wps = 101929, train loss = 4.8247
Iteration 1799, time = 1.52s, wps = 101266, train loss = 4.8701
Iteration 1819, time = 1.51s, wps = 101606, train loss = 4.8917
Iteration 1839, time = 1.53s, wps = 100658, train loss = 4.8450
Iteration 1859, time = 1.53s, wps = 100624, train loss = 4.8385
Iteration 1879, time = 1.50s, wps = 102144, train loss = 4.7924
Iteration 1899, time = 1.51s, wps = 101985, train loss = 4.8256
Iteration 1919, time = 1.52s, wps = 101212, train loss = 4.7768
Iteration 1939, time = 1.51s, wps = 101475, train loss = 4.7890
Iteration 1959, time = 1.52s, wps = 100870, train loss = 4.7915
Iteration 1979, time = 1.51s, wps = 101553, train loss = 4.8773
Iteration 1999, time = 1.51s, wps = 101557, train loss = 4.8638
Iteration 2019, time = 1.52s, wps = 100938, train loss = 4.8264
Iteration 2039, time = 1.52s, wps = 100864, train loss = 4.7708
Iteration 2059, time = 1.51s, wps = 101997, train loss = 4.7646
Iteration 2079, time = 1.51s, wps = 101818, train loss = 4.8554
Iteration 2099, time = 1.51s, wps = 101652, train loss = 4.7646
Iteration 2119, time = 1.53s, wps = 100442, train loss = 4.8192
Iteration 2139, time = 1.51s, wps = 101778, train loss = 4.6941
Iteration 2159, time = 1.51s, wps = 101460, train loss = 4.8339
Iteration 2179, time = 1.50s, wps = 102089, train loss = 4.8029
Iteration 2199, time = 1.52s, wps = 101199, train loss = 4.8248
Iteration 2219, time = 1.52s, wps = 101091, train loss = 4.7834
Iteration 2239, time = 1.53s, wps = 100274, train loss = 4.8315
Iteration 2259, time = 1.50s, wps = 102150, train loss = 4.6937
Iteration 2279, time = 1.51s, wps = 101680, train loss = 4.7867
Iteration 2299, time = 1.51s, wps = 101512, train loss = 4.6648
Iteration 2319, time = 1.53s, wps = 100686, train loss = 4.7590
Iteration 2339, time = 1.52s, wps = 101009, train loss = 4.6849
Iteration 2359, time = 1.53s, wps = 100550, train loss = 4.7180
Iteration 2379, time = 1.53s, wps = 100579, train loss = 4.6819
Iteration 2399, time = 1.52s, wps = 101248, train loss = 4.7601
Iteration 2419, time = 1.52s, wps = 100726, train loss = 4.7014
Iteration 2439, time = 1.51s, wps = 101541, train loss = 4.7163
Iteration 2459, time = 1.51s, wps = 101454, train loss = 4.7191
Iteration 2479, time = 1.52s, wps = 101194, train loss = 4.6500
Iteration 2499, time = 1.51s, wps = 101579, train loss = 4.6663
Iteration 2519, time = 1.52s, wps = 100788, train loss = 4.7096
Iteration 2539, time = 1.51s, wps = 101581, train loss = 4.6999
Iteration 2559, time = 1.52s, wps = 101188, train loss = 4.6865
Iteration 2579, time = 1.52s, wps = 101091, train loss = 4.7258
Iteration 2599, time = 1.55s, wps = 99125, train loss = 4.6827
Iteration 2619, time = 1.52s, wps = 100868, train loss = 4.6680
Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
Finished processing!
Iteration 2639, time = 3.14s, wps = 48923, train loss = 4.6573
Iteration 2659, time = 1.51s, wps = 101583, train loss = 4.7355
Iteration 2679, time = 1.51s, wps = 101655, train loss = 4.7174
Iteration 2699, time = 1.52s, wps = 101240, train loss = 4.6277
Iteration 2719, time = 1.52s, wps = 100797, train loss = 4.5637
Iteration 2739, time = 1.51s, wps = 101554, train loss = 4.6590
Iteration 2759, time = 1.53s, wps = 100533, train loss = 4.6591
Iteration 2779, time = 1.53s, wps = 100404, train loss = 4.6163
Iteration 2799, time = 1.52s, wps = 101124, train loss = 4.5883
Iteration 2819, time = 1.53s, wps = 100553, train loss = 4.6736
Iteration 2839, time = 1.52s, wps = 100844, train loss = 4.6715
Iteration 2859, time = 1.52s, wps = 100923, train loss = 4.7346
Iteration 2879, time = 1.51s, wps = 101705, train loss = 4.6450
Iteration 2899, time = 1.53s, wps = 100379, train loss = 4.6676
Iteration 2919, time = 1.52s, wps = 100811, train loss = 4.6360
Iteration 2939, time = 1.52s, wps = 101277, train loss = 4.6021
Iteration 2959, time = 1.53s, wps = 100574, train loss = 4.6119
Iteration 2979, time = 1.52s, wps = 100973, train loss = 4.5833
Iteration 2999, time = 1.51s, wps = 101412, train loss = 4.5993
Iteration 3019, time = 1.52s, wps = 101280, train loss = 4.5699
Iteration 3039, time = 1.52s, wps = 101382, train loss = 4.6009
Iteration 3059, time = 1.54s, wps = 99939, train loss = 4.4993
Iteration 3079, time = 1.53s, wps = 100659, train loss = 4.5225
Iteration 3099, time = 1.52s, wps = 100850, train loss = 4.5541
Iteration 3119, time = 1.52s, wps = 101027, train loss = 4.5700
Iteration 3139, time = 1.52s, wps = 100825, train loss = 4.6511
Iteration 3159, time = 1.53s, wps = 100257, train loss = 4.6142
Iteration 3179, time = 1.52s, wps = 101106, train loss = 4.5483
Iteration 3199, time = 1.51s, wps = 101506, train loss = 4.5391
Iteration 3219, time = 1.51s, wps = 101755, train loss = 4.5190
Iteration 3239, time = 1.52s, wps = 101005, train loss = 4.5179
Iteration 3259, time = 1.52s, wps = 101343, train loss = 4.5012
Iteration 3279, time = 1.53s, wps = 100613, train loss = 4.6233
Iteration 3299, time = 1.52s, wps = 101218, train loss = 4.5606
Iteration 3319, time = 1.52s, wps = 101104, train loss = 4.5684
Iteration 3339, time = 1.53s, wps = 100421, train loss = 4.5752
Iteration 3359, time = 1.53s, wps = 100628, train loss = 4.5861
Iteration 3379, time = 1.52s, wps = 100763, train loss = 4.5619
Iteration 3399, time = 1.52s, wps = 101148, train loss = 4.5600
Iteration 3419, time = 1.52s, wps = 101033, train loss = 4.6020
Iteration 3439, time = 1.52s, wps = 100783, train loss = 4.6110
Iteration 3459, time = 1.52s, wps = 100747, train loss = 4.4795
Iteration 3479, time = 1.52s, wps = 100813, train loss = 4.5527
Iteration 3499, time = 1.54s, wps = 100054, train loss = 4.6057
Iteration 3519, time = 1.52s, wps = 100864, train loss = 4.5134
Iteration 3539, time = 1.52s, wps = 101060, train loss = 4.5286
Iteration 3559, time = 1.51s, wps = 101680, train loss = 4.5713
Iteration 3579, time = 1.53s, wps = 100465, train loss = 4.5151
Iteration 3599, time = 1.51s, wps = 101563, train loss = 4.5071
Iteration 3619, time = 1.52s, wps = 100816, train loss = 4.5112
Iteration 3639, time = 1.53s, wps = 100647, train loss = 4.4841
/usr/local/lib/python3.5/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened.
  warnings.warn("Attempting to use a closed FileWriter. "
" real 3m11.902s user 20m40.978s sys 4m36.557s root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm# time python single_lm_train.py --mode=train --logdir=./logs --num_gpus=2 --datadir=./data/1-billion-word- language-modeling-benchmark-r13output WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0. For more information, please see: * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md * https://github.com/tensorflow/addons If you depend on functionality not listed there, please file an issue. *****HYPER PARAMETERS***** {'learning_rate': 0.2, 'vocab_size': 793470, 'num_delayed_steps': 150, 'max_time': 180, 'run_profiler': False, 'num_steps': 20, 'optimizer': 0, 'num_gpus': 2, 'state_size': 2048, 'average_params': True, 'keep_prob': 0.9, 'num_sampled': 8192, 'num_layers': 1, 'projected_size': 512, 'batch_size': 128, 'max_grad_norm': 10.0, 'do_summaries': False, 'num_shards': 8, 'emb_size': 512} ************************** WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version. Instructions for updating: Colocations handled automatically by placer. WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/model_utils.py:33: UniformUnitScaling.__init__ (from tensorflow.python.ops.init_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior. WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version. Instructions for updating: Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`. WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/language_model.py:107: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.cast instead. WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_impl.py:1444: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version. Instructions for updating: Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead. WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.cast instead. Current time: 1592862145.2891047 ALL VARIABLES WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:18: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02. Instructions for updating: Please use tf.global_variables instead. 
model/emb_0:0 (99184, 512) /gpu:0 model/emb_1:0 (99184, 512) /gpu:0 model/emb_2:0 (99184, 512) /gpu:0 model/emb_3:0 (99184, 512) /gpu:0 model/emb_4:0 (99184, 512) /gpu:0 model/emb_5:0 (99184, 512) /gpu:0 model/emb_6:0 (99184, 512) /gpu:0 model/emb_7:0 (99184, 512) /gpu:0 model/lstm_0/LSTMCell/W_0:0 (1024, 8192) /gpu:0 model/lstm_0/LSTMCell/B:0 (8192,) /gpu:0 model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) /gpu:0 model/softmax_w_0:0 (99184, 512) /gpu:0 model/softmax_w_1:0 (99184, 512) /gpu:0 model/softmax_w_2:0 (99184, 512) /gpu:0 model/softmax_w_3:0 (99184, 512) /gpu:0 model/softmax_w_4:0 (99184, 512) /gpu:0 model/softmax_w_5:0 (99184, 512) /gpu:0 model/softmax_w_6:0 (99184, 512) /gpu:0 model/softmax_w_7:0 (99184, 512) /gpu:0 model/softmax_b:0 (793470,) /gpu:0 model/global_step:0 () model/model/emb_0/Adagrad:0 (99184, 512) /gpu:0 model/model/emb_1/Adagrad:0 (99184, 512) /gpu:0 model/model/emb_2/Adagrad:0 (99184, 512) /gpu:0 model/model/emb_3/Adagrad:0 (99184, 512) /gpu:0 model/model/emb_4/Adagrad:0 (99184, 512) /gpu:0 model/model/emb_5/Adagrad:0 (99184, 512) /gpu:0 model/model/emb_6/Adagrad:0 (99184, 512) /gpu:0 model/model/emb_7/Adagrad:0 (99184, 512) /gpu:0 model/model/lstm_0/LSTMCell/W_0/Adagrad:0 (1024, 8192) /gpu:0 model/model/lstm_0/LSTMCell/B/Adagrad:0 (8192,) /gpu:0 model/model/lstm_0/LSTMCell/W_P_0/Adagrad:0 (2048, 512) /gpu:0 model/model/softmax_w_0/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_w_1/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_w_2/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_w_3/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_w_4/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_w_5/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_w_6/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_w_7/Adagrad:0 (99184, 512) /gpu:0 model/model/softmax_b/Adagrad:0 (793470,) /gpu:0 model/model/lstm_0/LSTMCell/W_0/ExponentialMovingAverage:0 (1024, 8192) /gpu:0 model/model/lstm_0/LSTMCell/B/ExponentialMovingAverage:0 (8192,) /gpu:0 model/model/lstm_0/LSTMCell/W_P_0/ExponentialMovingAverage:0 (2048, 512) /gpu:0 TRAINABLE VARIABLES model/emb_0:0 (99184, 512) /gpu:0 model/emb_1:0 (99184, 512) /gpu:0 model/emb_2:0 (99184, 512) /gpu:0 model/emb_3:0 (99184, 512) /gpu:0 model/emb_4:0 (99184, 512) /gpu:0 model/emb_5:0 (99184, 512) /gpu:0 model/emb_6:0 (99184, 512) /gpu:0 model/emb_7:0 (99184, 512) /gpu:0 model/lstm_0/LSTMCell/W_0:0 (1024, 8192) /gpu:0 model/lstm_0/LSTMCell/B:0 (8192,) /gpu:0 model/lstm_0/LSTMCell/W_P_0:0 (2048, 512) /gpu:0 model/softmax_w_0:0 (99184, 512) /gpu:0 model/softmax_w_1:0 (99184, 512) /gpu:0 model/softmax_w_2:0 (99184, 512) /gpu:0 model/softmax_w_3:0 (99184, 512) /gpu:0 model/softmax_w_4:0 (99184, 512) /gpu:0 model/softmax_w_5:0 (99184, 512) /gpu:0 model/softmax_w_6:0 (99184, 512) /gpu:0 model/softmax_w_7:0 (99184, 512) /gpu:0 model/softmax_b:0 (793470,) /gpu:0 LOCAL VARIABLES model/model/state_0_0:0 (128, 2560) /gpu:0 model/model_1/state_1_0:0 (128, 2560) /gpu:1 WARNING:tensorflow:From /opt/tensorflow/nvidia-examples/big_lstm/run_utils.py:32: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. Instructions for updating: Please switch to tf.train.MonitoredTrainingSession 2020-06-22 21:42:25.699497: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2900005000 Hz 2020-06-22 21:42:25.705835: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0x845f100 executing computations on platform Host. 
Devices: 2020-06-22 21:42:25.705882: I tensorflow/compiler/xla/service/service.cc:168] StreamExecutor device (0): , 2020-06-22 21:42:26.140630: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-06-22 21:42:26.176070: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-06-22 21:42:26.182930: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2020-06-22 21:42:26.183838: I tensorflow/compiler/xla/service/service.cc:161] XLA service 0x845eb20 executing computations on platform CUDA. Devices: 2020-06-22 21:42:26.183856: I tensorflow/compiler/xla/service/service.cc:168] StreamExecutor device (0): TITAN RTX, Compute Capability 7.5 2020-06-22 21:42:26.183861: I tensorflow/compiler/xla/service/service.cc:168] StreamExecutor device (1): TITAN RTX, Compute Capability 7.5 2020-06-22 21:42:26.183866: I tensorflow/compiler/xla/service/service.cc:168] StreamExecutor device (2): GeForce RTX 2080 Ti, Compute Capability 7.5 2020-06-22 21:42:26.183873: I tensorflow/compiler/xla/service/service.cc:168] StreamExecutor device (3): GeForce RTX 2080 Ti, Compute Capability 7.5 2020-06-22 21:42:26.184923: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties: name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77 pciBusID: 0000:01:00.0 totalMemory: 23.65GiB freeMemory: 23.23GiB 2020-06-22 21:42:26.184952: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 1 with properties: name: TITAN RTX major: 7 minor: 5 memoryClockRate(GHz): 1.77 pciBusID: 0000:21:00.0 totalMemory: 23.65GiB freeMemory: 23.49GiB 2020-06-22 21:42:26.184974: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 2 with properties: name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635 pciBusID: 0000:4a:00.0 totalMemory: 10.76GiB freeMemory: 10.61GiB 2020-06-22 21:42:26.184997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 3 with properties: name: GeForce RTX 2080 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.635 pciBusID: 0000:4b:00.0 totalMemory: 10.76GiB freeMemory: 10.61GiB 2020-06-22 21:42:26.185020: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3 2020-06-22 21:42:26.820163: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix: 2020-06-22 21:42:26.820209: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0 1 2 3 2020-06-22 21:42:26.820214: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0: N N N N 2020-06-22 21:42:26.820219: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1: N N N N 2020-06-22 21:42:26.820223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2: N N N N 2020-06-22 21:42:26.820228: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3: N N N N 2020-06-22 21:42:26.820364: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 22508 MB memory) -> physical GPU (device: 0, name: TITAN RTX, pci bus id: 0000:01:00.0, compute capability: 7.5) 2020-06-22 21:42:26.820814: I 
tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 22765 MB memory) -> physical GPU (device: 1, name: TITAN RTX, pci bus id: 0000:21:00.0, compute capability: 7.5) 2020-06-22 21:42:26.820961: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 10231 MB memory) -> physical GPU (device: 2, name: GeForce RTX 2080 Ti, pci bus id: 0000:4a:00.0, compute capability: 7.5) 2020-06-22 21:42:26.821115: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 10231 MB memory) -> physical GPU (device: 3, name: GeForce RTX 2080 Ti, pci bus id: 0000:4b:00.0, compute capability: 7.5) WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version. Instructions for updating: Use standard file APIs to check for files with this prefix. WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1070: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version. Instructions for updating: Use standard file utilities to get mtimes. Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00067-of-00100 Finished processing! 2020-06-22 21:42:36.122353: I tensorflow/stream_executor/dso_loader.cc:153] successfully opened CUDA library libcublas.so.10 locally Iteration 3658, time = 5.34s, wps = 960, train loss = 4.7358 Iteration 3659, time = 3.50s, wps = 1464, train loss = 4.5549 Iteration 3660, time = 0.07s, wps = 77548, train loss = 4.5335 Iteration 3661, time = 0.06s, wps = 80497, train loss = 4.5285 Iteration 3662, time = 0.07s, wps = 76497, train loss = 4.5351 Iteration 3663, time = 0.06s, wps = 85630, train loss = 4.5974 Iteration 3664, time = 0.06s, wps = 90494, train loss = 4.5375 Iteration 3665, time = 0.06s, wps = 90567, train loss = 4.5138 Iteration 3666, time = 0.06s, wps = 89600, train loss = 4.5633 Iteration 3677, time = 0.63s, wps = 89399, train loss = 4.5439 Iteration 3697, time = 1.16s, wps = 87982, train loss = 4.4645 Iteration 3717, time = 1.16s, wps = 88284, train loss = 4.5093 Iteration 3737, time = 1.15s, wps = 89319, train loss = 4.4569 Iteration 3757, time = 1.15s, wps = 89017, train loss = 4.4696 Iteration 3777, time = 1.16s, wps = 88361, train loss = 4.5421 Iteration 3797, time = 1.16s, wps = 88158, train loss = 4.4886 Iteration 3817, time = 1.16s, wps = 88561, train loss = 4.4171 Iteration 3837, time = 1.16s, wps = 88070, train loss = 4.5340 Iteration 3857, time = 1.15s, wps = 88728, train loss = 4.3578 Iteration 3877, time = 1.16s, wps = 88414, train loss = 4.4961 Iteration 3897, time = 1.15s, wps = 89270, train loss = 4.5462 Iteration 3917, time = 1.15s, wps = 89122, train loss = 4.5059 Iteration 3937, time = 1.17s, wps = 87264, train loss = 4.4497 Iteration 3957, time = 1.16s, wps = 87943, train loss = 4.4848 Iteration 3977, time = 1.15s, wps = 89310, train loss = 4.4501 Iteration 3997, time = 1.16s, wps = 88282, train loss = 4.5294 Iteration 4017, time = 1.16s, wps = 88370, train loss = 4.5675 Iteration 4037, time = 1.16s, wps = 88058, train loss = 4.5166 Iteration 4057, time = 1.16s, wps = 88011, train loss = 
4.4421 Iteration 4077, time = 1.17s, wps = 87533, train loss = 4.5431 Iteration 4097, time = 1.16s, wps = 88640, train loss = 4.4803 Iteration 4117, time = 1.17s, wps = 87853, train loss = 4.4799 Iteration 4137, time = 1.16s, wps = 88094, train loss = 4.4656 Iteration 4157, time = 1.18s, wps = 86649, train loss = 4.4530 Iteration 4177, time = 1.16s, wps = 87931, train loss = 4.4792 Iteration 4197, time = 1.14s, wps = 89898, train loss = 4.4070 Iteration 4217, time = 1.16s, wps = 87907, train loss = 4.4931 Iteration 4237, time = 1.15s, wps = 88932, train loss = 4.4289 Iteration 4257, time = 1.15s, wps = 88669, train loss = 4.4894 Iteration 4277, time = 1.15s, wps = 89122, train loss = 4.4667 Iteration 4297, time = 1.16s, wps = 88232, train loss = 4.4666 Iteration 4317, time = 1.15s, wps = 89057, train loss = 4.4657 Iteration 4337, time = 1.16s, wps = 88202, train loss = 4.4594 Iteration 4357, time = 1.15s, wps = 88754, train loss = 4.4386 Iteration 4377, time = 1.16s, wps = 87988, train loss = 4.4455 Iteration 4397, time = 1.16s, wps = 87938, train loss = 4.4639 Iteration 4417, time = 1.16s, wps = 88277, train loss = 4.5071 Iteration 4437, time = 1.17s, wps = 87723, train loss = 4.4559 Iteration 4457, time = 1.15s, wps = 88832, train loss = 4.4303 Iteration 4477, time = 1.16s, wps = 88032, train loss = 4.5125 Iteration 4497, time = 1.16s, wps = 88305, train loss = 4.4824 Iteration 4517, time = 1.17s, wps = 87469, train loss = 4.4651 Iteration 4537, time = 1.16s, wps = 88488, train loss = 4.5037 Iteration 4557, time = 1.16s, wps = 88011, train loss = 4.4218 Iteration 4577, time = 1.17s, wps = 87878, train loss = 4.4039 Iteration 4597, time = 1.16s, wps = 87935, train loss = 4.4503 Iteration 4617, time = 1.16s, wps = 88341, train loss = 4.4050 Iteration 4637, time = 1.17s, wps = 87502, train loss = 4.4536 Iteration 4657, time = 1.16s, wps = 88440, train loss = 4.3186 Iteration 4677, time = 1.16s, wps = 88126, train loss = 4.4561 Iteration 4697, time = 1.17s, wps = 87683, train loss = 4.5155 Iteration 4717, time = 1.18s, wps = 86834, train loss = 4.4506 Iteration 4737, time = 1.17s, wps = 87761, train loss = 4.4375 Iteration 4757, time = 1.17s, wps = 87546, train loss = 4.4300 Iteration 4777, time = 1.14s, wps = 89626, train loss = 4.3914 Iteration 4797, time = 1.17s, wps = 87809, train loss = 4.3948 Iteration 4817, time = 1.17s, wps = 87816, train loss = 4.4060 Iteration 4837, time = 1.15s, wps = 89179, train loss = 4.3492 Iteration 4857, time = 1.16s, wps = 88631, train loss = 4.3638 Iteration 4877, time = 1.17s, wps = 87817, train loss = 4.4797 Iteration 4897, time = 1.16s, wps = 88257, train loss = 4.4217 Iteration 4917, time = 1.17s, wps = 87703, train loss = 4.5081 Iteration 4937, time = 1.16s, wps = 88305, train loss = 4.4509 Iteration 4957, time = 1.15s, wps = 88776, train loss = 4.3529 Iteration 4977, time = 1.16s, wps = 88281, train loss = 4.4510 Iteration 4997, time = 1.17s, wps = 87528, train loss = 4.3825 Iteration 5017, time = 1.15s, wps = 88712, train loss = 4.4250 Iteration 5037, time = 1.16s, wps = 88215, train loss = 4.4139 Iteration 5057, time = 1.15s, wps = 88665, train loss = 4.4242 Iteration 5077, time = 1.17s, wps = 87547, train loss = 4.3887 Iteration 5097, time = 1.16s, wps = 87905, train loss = 4.4527 Iteration 5117, time = 1.17s, wps = 87407, train loss = 4.3447 Iteration 5137, time = 1.17s, wps = 87617, train loss = 4.3689 Iteration 5157, time = 1.17s, wps = 87840, train loss = 4.4057 Iteration 5177, time = 1.18s, wps = 86983, train loss = 4.4083 Iteration 5197, 
time = 1.17s, wps = 87729, train loss = 4.4001 Iteration 5217, time = 1.16s, wps = 88419, train loss = 4.4692 Processing file: ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00069-of-00100 Finished processing! Iteration 5237, time = 2.76s, wps = 37139, train loss = 4.3477 Iteration 5257, time = 1.16s, wps = 88320, train loss = 4.4138 Iteration 5277, time = 1.16s, wps = 88335, train loss = 4.3759 Iteration 5297, time = 1.16s, wps = 87941, train loss = 4.3920 Iteration 5317, time = 1.17s, wps = 87310, train loss = 4.3592 Iteration 5337, time = 1.18s, wps = 87001, train loss = 4.3884 Iteration 5357, time = 1.16s, wps = 88199, train loss = 4.2886 Iteration 5377, time = 1.18s, wps = 86568, train loss = 4.2593 Iteration 5397, time = 1.16s, wps = 88151, train loss = 4.3982 Iteration 5417, time = 1.16s, wps = 88082, train loss = 4.3290 Iteration 5437, time = 1.16s, wps = 88283, train loss = 4.4078 Iteration 5457, time = 1.15s, wps = 89037, train loss = 4.3923 Iteration 5477, time = 1.17s, wps = 87604, train loss = 4.3585 Iteration 5497, time = 1.17s, wps = 87412, train loss = 4.3822 Iteration 5517, time = 1.17s, wps = 87880, train loss = 4.3700 Iteration 5537, time = 1.17s, wps = 87466, train loss = 4.3854 Iteration 5557, time = 1.17s, wps = 87563, train loss = 4.3435 Iteration 5577, time = 1.16s, wps = 88611, train loss = 4.4302 Iteration 5597, time = 1.17s, wps = 87533, train loss = 4.4254 Iteration 5617, time = 1.16s, wps = 88059, train loss = 4.4021 Iteration 5637, time = 1.18s, wps = 86865, train loss = 4.3857 Iteration 5657, time = 1.16s, wps = 88038, train loss = 4.3356 Iteration 5677, time = 1.17s, wps = 87600, train loss = 4.3608 Iteration 5697, time = 1.17s, wps = 87632, train loss = 4.2943 Iteration 5717, time = 1.17s, wps = 87689, train loss = 4.3630 Iteration 5737, time = 1.17s, wps = 87372, train loss = 4.4060 Iteration 5757, time = 1.18s, wps = 87117, train loss = 4.3234 Iteration 5777, time = 1.17s, wps = 87797, train loss = 4.3807 Iteration 5797, time = 1.17s, wps = 87598, train loss = 4.3461 Iteration 5817, time = 1.17s, wps = 87559, train loss = 4.3783 Iteration 5837, time = 1.16s, wps = 87962, train loss = 4.2885 Iteration 5857, time = 1.18s, wps = 87110, train loss = 4.2751 Iteration 5877, time = 1.16s, wps = 87945, train loss = 4.3109 Iteration 5897, time = 1.17s, wps = 87283, train loss = 4.3415 Iteration 5917, time = 1.17s, wps = 87793, train loss = 4.3002 Iteration 5937, time = 1.17s, wps = 87384, train loss = 4.3556 Iteration 5957, time = 1.17s, wps = 87844, train loss = 4.3848 Iteration 5977, time = 1.17s, wps = 87341, train loss = 4.3601 Iteration 5997, time = 1.17s, wps = 87594, train loss = 4.2737 Iteration 6017, time = 1.17s, wps = 87800, train loss = 4.3120 Iteration 6037, time = 1.17s, wps = 87617, train loss = 4.3750 Iteration 6057, time = 1.16s, wps = 88074, train loss = 4.4251 Iteration 6077, time = 1.18s, wps = 86668, train loss = 4.4217 Iteration 6097, time = 1.17s, wps = 87423, train loss = 4.2958 Iteration 6117, time = 1.16s, wps = 88266, train loss = 4.3821 Iteration 6137, time = 1.19s, wps = 86154, train loss = 4.3400 Iteration 6157, time = 1.17s, wps = 87346, train loss = 4.3108 Iteration 6177, time = 1.16s, wps = 87923, train loss = 4.3585 Iteration 6197, time = 1.19s, wps = 86357, train loss = 4.2869 Iteration 6217, time = 1.16s, wps = 87933, train loss = 4.2981 Iteration 6237, time = 1.17s, wps = 87297, train loss = 4.3816 Iteration 6257, time = 1.17s, wps = 87561, train loss = 4.3323 
Iteration 6277, time = 1.17s, wps = 87437, train loss = 4.3185 Iteration 6297, time = 1.16s, wps = 87938, train loss = 4.2814 Iteration 6317, time = 1.17s, wps = 87608, train loss = 4.3211 Iteration 6337, time = 1.17s, wps = 87315, train loss = 4.2836 Iteration 6357, time = 1.18s, wps = 87082, train loss = 4.3642 Iteration 6377, time = 1.17s, wps = 87324, train loss = 4.2665 Iteration 6397, time = 1.17s, wps = 87501, train loss = 4.3511 Iteration 6417, time = 1.17s, wps = 87804, train loss = 4.3519 Iteration 6437, time = 1.18s, wps = 87117, train loss = 4.3000 Iteration 6457, time = 1.18s, wps = 86896, train loss = 4.3261 /usr/local/lib/python3.5/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened. warnings.warn("Attempting to use a closed FileWriter. " real 3m10.313s user 16m30.401s sys 4m13.081s root@ad8c883cdd71:/workspace/nvidia-examples/big_lstm#
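Both runs stop after nearly identical wall-clock time (real 3m11.902s vs 3m10.313s) because the hyperparameter dump shows 'max_time': 180, i.e. training is capped at 180 seconds plus startup and data-processing overhead; the difference between runs therefore shows up in throughput, not duration. Note also that the device interconnect matrix printed above is all "N" (no direct DMA path between any pair of GPUs), so multi-GPU exchange presumably goes through host memory; nvidia-smi topo -m on the host should report the same topology.

The logged wps is consistent with each iteration consuming batch_size * num_steps tokens per GPU, with each printed time apparently covering the 20 iterations between prints. A quick sanity check of the two-GPU run; the formula is an inference from the logged numbers, not taken from the example's source:

    # Estimate words/sec for the 2-GPU run from the hyperparameters above.
    batch_size, num_steps, num_gpus = 128, 20, 2   # from the HYPER PARAMETERS dump
    iters_per_print, printed_time = 20, 1.16       # e.g. "Iteration 5257, time = 1.16s"

    words_per_iter = batch_size * num_steps * num_gpus
    wps = words_per_iter * iters_per_print / printed_time
    print("estimated wps: %.0f" % wps)             # ~88276, vs. the logged 88320

By the same formula, the earlier run's ~101,500 wps at ~1.51 s per print would correspond to num_gpus=3; that run's command line precedes this excerpt. Finally, the shapes printed under TRAINABLE VARIABLES permit a rough parameter count (the shard counts and shapes below are copied verbatim from the log):

    # Rough trainable-parameter count from the shapes logged above.
    emb       = 8 * 99184 * 512                   # model/emb_0..7
    lstm      = 1024 * 8192 + 8192 + 2048 * 512   # LSTMCell W_0, B, W_P_0
    softmax_w = 8 * 99184 * 512                   # model/softmax_w_0..7
    softmax_b = 793470                            # model/softmax_b
    total = emb + lstm + softmax_w + softmax_b
    print("trainable parameters: %.2f billion" % (total / 1e9))  # ~0.82 billion

So this single-layer configuration carries roughly 0.82 billion trainable parameters, most of them in the sharded embedding and softmax matrices.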