pip install tensorflow==1.14.0
pip install tensorflow-gpu==1.14
import tensorflow as tf
tf.test.is_gpu_available(
cuda_only=False,
min_cuda_compute_capability=None
)
False
# imports needed by this snippet (GPUtil imported as GPU)
import os, platform
import psutil, humanize
import GPUtil as GPU

GPUs = GPU.getGPUs()
gpu = GPUs[0]

def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("# of CPU: {0}".format(psutil.cpu_count()))
    print("CPU type: {0}".format(platform.uname()))
    print("GPU Type: {0}".format(gpu.name))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

printm()
Gen RAM Free: 65.8 GB | Proc size: 294.4 MB
# of CPU: 8
CPU type: uname_result(system='Linux', node='lian-2', release='5.3.0-53-generic', version='#47~18.04.1-Ubuntu SMP Thu May 7 13:10:50 UTC 2020', machine='x86_64', processor='x86_64')
GPU Type: GeForce RTX 2080 Ti
GPU RAM Free: 10992MB | Used: 26MB | Util 0% | Total 11018MB
I reinstalled CUDA 10:
Mon Jun 8 15:48:44 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82 Driver Version: 440.82 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce RTX 208... Off | 00000000:01:00.0 Off | N/A |
| 24% 28C P8 2W / 260W | 26MiB / 11018MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 1805 G /usr/lib/xorg/Xorg 9MiB |
| 0 2192 G /usr/bin/gnome-shell 14MiB |
+-----------------------------------------------------------------------------+
#define CUDNN_MAJOR 7
#define CUDNN_MINOR 4
#define CUDNN_PATCHLEVEL 2
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
I'm trying to train this from a Jupyter notebook (Ubuntu 18.04, over SSH):
# imports implied by this snippet (assuming the standalone Keras API; tf.keras.layers works the same way)
import datetime
from keras.models import Sequential
from keras import layers

model = Sequential()
model.add(layers.Conv1D(16, 9, activation='relu', input_shape=(800, X_train.shape[-1])))
model.add(layers.MaxPooling1D(2))
model.add(layers.Bidirectional(layers.CuDNNGRU(8, return_sequences=True)))
model.add(layers.Bidirectional(layers.CuDNNGRU(8)))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer= 'Adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()
train_to_epoch = 150
start_epoch = 3
t1 = datetime.datetime.now()
print('Training start time = %s' % t1)
history = model.fit(X_train, y_train,
batch_size=128, epochs=train_to_epoch, verbose=0,
validation_data=(X_val, y_val))
print('\nTraining Duration = %s' % (datetime.datetime.now()-t1))
InvalidArgumentError: No OpKernel was registered to support Op 'CudnnRNN' used by {{node bidirectional_1/CudnnRNN}}with these attrs: [input_mode="linear_input", T=DT_FLOAT, direction="unidirectional", rnn_mode="gru", seed2=0, is_training=true, seed=87654321, dropout=0]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
[[bidirectional_1/CudnnRNN]]
Maybe another version of CUDA is needed?
Yes, it is the CUDA version: you need the CUDA 10.1 version.
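Once a CUDA toolkit matching the TensorFlow build is installed, a quick sanity check with the TF 1.x test API (a minimal sketch) is:
import tensorflow as tf

print(tf.test.is_built_with_cuda())   # True for the tensorflow-gpu wheel
print(tf.test.gpu_device_name())      # e.g. '/device:GPU:0' once the GPU is visible
print(tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))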
I am trying to query data from my PostgreSQL database using pd.read_sql. There are timestamptz columns containing a mix of dates with a DST offset and dates without one. The dates with a DST offset are read in correctly, but those without are read as NaT.
All columns shown are of the same type, timestamptz
Example A - smart_departure has no DST offset
Queried using DBeaver
 wtt_arrival | wtt_pass | wtt_departure                 | smart_arrival | smart_pass | smart_departure
-------------+----------+-------------------------------+---------------+------------+-------------------------------
 NULL        | NULL     | 1970-01-01 08:23:00.000 +0100 | NULL          | NULL       | 2022-01-14 08:23:03.000 +0000
Queried using pd.read_sql (for example pd.read_sql('select * from database', con=engine))
 wtt_arrival | wtt_pass | wtt_departure             | smart_arrival | smart_pass | smart_departure
-------------+----------+---------------------------+---------------+------------+-----------------
 None        | None     | 1970-01-01 07:23:00+00:00 | None          | None       | NaT
Query by retrieving value directly (suggested by Gord Thompson below)
(None, None, datetime.datetime(1970, 1, 1, 7, 23, tzinfo=datetime.timezone.utc), None, None, None)
Queried using psql (suggested by Adrian Klaver)
 wtt_arrival | wtt_pass | wtt_departure          | smart_arrival | smart_pass | smart_departure
-------------+----------+------------------------+---------------+------------+-----------------
             |          | 1970-01-01 07:23:00+00 |               |            |
Example B - smart_arrival has DST offset
Queried using DBeaver
 wtt_arrival                   | wtt_pass | wtt_departure | smart_arrival                 | smart_pass | smart_departure
-------------------------------+----------+---------------+-------------------------------+------------+-----------------
 1970-01-01 15:49:00.000 +0100 | NULL     | NULL          | 2022-10-14 15:47:27.000 +0100 | NULL       | NULL
Queried using pd.read_sql
 wtt_arrival               | wtt_pass | wtt_departure | smart_arrival             | smart_pass | smart_departure
---------------------------+----------+---------------+---------------------------+------------+-----------------
 1970-01-01 14:49:00+00:00 | None     | None          | 2022-10-14 14:47:27+00:00 | None       | None
Query by retrieving value directly (suggested by Gord Thompson below)
(datetime.datetime(1970, 1, 1, 14, 49, tzinfo=datetime.timezone.utc), None, None, datetime.datetime(2022, 10, 14, 14, 47, 27, tzinfo=datetime.timezone.utc), None, None)
Queried using psql (suggested by Adrian Klaver)
 wtt_arrival            | wtt_pass | wtt_departure | smart_arrival          | smart_pass | smart_departure
------------------------+----------+---------------+------------------------+------------+-----------------
 1970-01-01 14:49:00+00 |          |               | 2022-10-14 14:47:27+00 |            |
Results of running show search_path and \d *.scheduled_and_actual_timings
timetabletooldev=> \d *.scheduled_and_actual_timings
View "public.scheduled_and_actual_timings"
Column | Type | Collation | Nullable | Default
----------------------+--------------------------+-----------+----------+---------
wtt_arrival | timestamp with time zone | | |
wtt_pass | timestamp with time zone | | |
wtt_departure | timestamp with time zone | | |
smart_arrival | timestamp with time zone | | |
smart_pass | timestamp with time zone | | |
smart_departure | timestamp with time zone | | |
timetabletooldev=> show search_path;
search_path
-----------------
"$user", public
(1 row)
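One way to confirm which time zone the SQLAlchemy session is actually using (a diagnostic sketch; DATABASE_URL stands for the real connection string) is:
from sqlalchemy import create_engine, text

engine = create_engine(DATABASE_URL)
with engine.connect() as conn:
    # compare this with the session time zone shown in DBeaver/psql
    print(conn.execute(text("show timezone")).scalar())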
The time zone in the DBeaver session and in the psql/SQLAlchemy sessions were different. When the correct time zone is used, the same data is shown in both DBeaver and SQLAlchemy.
Defining the engine like so worked for me:
engine = create_engine(DATABASE_URL, connect_args={"options": "-c timezone=Europe/London"})
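A minimal sketch of reading the view with that engine (DATABASE_URL as above; the view name is taken from the \d output earlier):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine(DATABASE_URL, connect_args={"options": "-c timezone=Europe/London"})

df = pd.read_sql("select * from scheduled_and_actual_timings", con=engine)
# the previously-NaT values should now come back as tz-aware timestamps
print(df[["wtt_departure", "smart_departure"]].head())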
My TensorFlow model makes heavy use of data preprocessing that should be done on the CPU to leave the GPU open for training.
top - 09:57:54 up 16:23, 1 user, load average: 3,67, 1,57, 0,67
Tasks: 400 total, 1 running, 399 sleeping, 0 stopped, 0 zombie
%Cpu(s): 19,1 us, 2,8 sy, 0,0 ni, 78,1 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st
MiB Mem : 32049,7 total, 314,6 free, 5162,9 used, 26572,2 buff/cache
MiB Swap: 6779,0 total, 6556,0 free, 223,0 used. 25716,1 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
17604 joro 20 0 22,1g 2,3g 704896 S 331,2 7,2 4:39.33 python
This is what top shows me. I would like this Python process to use at least 90% of the available CPU across all cores. How can this be achieved?
GPU utilization is better, at around 90%, though I don't know why it is not at 100%.
Mon Aug 10 10:00:13 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100 Driver Version: 440.100 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce RTX 208... Off | 00000000:01:00.0 On | N/A |
| 35% 41C P2 90W / 260W | 10515MiB / 11016MiB | 11% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 1128 G /usr/lib/xorg/Xorg 102MiB |
| 0 1648 G /usr/lib/xorg/Xorg 380MiB |
| 0 1848 G /usr/bin/gnome-shell 279MiB |
| 0 10633 G ...uest-channel-token=1206236727 266MiB |
| 0 13794 G /usr/lib/firefox/firefox 6MiB |
| 0 17604 C python 9457MiB |
+-----------------------------------------------------------------------------+
All I found was a solution for TensorFlow 1.x:
sess = tf.Session(config=tf.ConfigProto(
intra_op_parallelism_threads=NUM_THREADS))
I have an Intel i9-9900K and an RTX 2080 Ti, and I use Ubuntu 20.04.
Edit: When I add the following code at the top, it uses one core at 100%:
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)
But increasing this number to 16 again only utilizes all cores at about 30%.
Just setting set_intra_op_parallelism_threads and set_inter_op_parallelism_threads wasn't working for me. In case someone else is in the same place: after a lot of struggle with the same issue, the piece of code below worked for me in limiting TensorFlow's CPU usage to below 500%:
import os
import tensorflow as tf

num_threads = 5

# Cap the OpenMP and TensorFlow intra-/inter-op thread pools so the process
# stays at roughly num_threads cores (set the environment variables as early
# as possible, before TensorFlow initializes its thread pools).
os.environ["OMP_NUM_THREADS"] = "5"
os.environ["TF_NUM_INTRAOP_THREADS"] = "5"
os.environ["TF_NUM_INTEROP_THREADS"] = "5"

tf.config.threading.set_inter_op_parallelism_threads(num_threads)
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
tf.config.set_soft_device_placement(True)
There can be many causes for this; I solved it the following way:
Set both

tf.config.threading.set_intra_op_parallelism_threads(<Your_Physical_Core_Count>)
tf.config.threading.set_inter_op_parallelism_threads(<Your_Physical_Core_Count>)

to your physical core count (see the sketch below). You do not want hyper-threading for highly vectorized operations, as you cannot benefit from parallelized operations when there aren't any execution gaps to fill.
"With a high level of vectorization, the number of execution gaps is
very small and there is possibly insufficient opportunity to make up
any penalty due to increased contention in HT."
From: Saini et al., NASA Advanced Supercomputing Division, 2011: "The Impact of Hyper-Threading on Processor Resource Utilization in Production Applications".
EDIT: I am no longer sure whether one of the two has to be set to 1, but one of them definitely needs to be set to the physical core count.
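As a concrete example, a minimal sketch that sets both values to the physical core count (psutil is used here only to query that count):
import psutil
import tensorflow as tf

physical_cores = psutil.cpu_count(logical=False)  # e.g. 8 on an i9-9900K

tf.config.threading.set_intra_op_parallelism_threads(physical_cores)
tf.config.threading.set_inter_op_parallelism_threads(physical_cores)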
How do I get my BeautifulSoup results to display in a tabular format?
Something like this:
Topic | Views | Replies
---------------------------------------
XPS 7590 problems | 557 | 8
SSD not working | 76 | 3
My code is:
import requests, re
from bs4 import BeautifulSoup
import pandas as pd
r = requests.get("https://www.dell.com/community/XPS/bd-p/XPS")
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class": "lia-component-messages-column-thread-info"})
for item in g_data:
    print(item.find_all("h2", {"class": "message-subject"})[0].text)
    print(item.find_all("span", {"class": "lia-message-stats-count"})[0].text)  # replies
    print(item.find_all("span", {"class": "lia-message-stats-count"})[1].text)  # views
Just construct a DataFrame by initializing an empty one and appending each "row" to it:
import requests, re
from bs4 import BeautifulSoup
import pandas as pd
r = requests.get("https://www.dell.com/community/XPS/bd-p/XPS")
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class": "lia-component-messages-column-thread-info"})
df = pd.DataFrame()
for item in g_data:
    topic = item.find_all("h2", {"class": "message-subject"})[0].text.strip()
    replies = item.find_all("span", {"class": "lia-message-stats-count"})[0].text.strip()  # replies
    views = item.find_all("span", {"class": "lia-message-stats-count"})[1].text.strip()    # views
    df = df.append(pd.DataFrame([[topic, views, replies]], columns=['Topic', 'Views', 'Replies']), sort=False).reset_index(drop=True)
Output:
print (df)
Topic Views Replies
0 FAQ Modern Standby 1057 0
1 FAQ XPS Laptops 4315 0
2 Where is the Precision Laptops Forum board? 624 0
3 XPS 15-9570, color banding issue 5880 192
4 XPS 7590 problems.. 565 9
5 XPS 13 7390 2-in-1 Display and Touchscreen issues 17 2
6 Dell XPS 9570 I7-8750H video display issues 9 0
7 XPS 9360 Fn lock for PgUp PgDn 12 0
8 Dell XPS DPC Latency Fix 1724 4
9 XPS 13 7390 2-in-1, Realtek drivers lead to fr... 253 11
10 XPS 12 9q23 Touch screen firmware update fix 36 1
11 Dell XPS 15 9570 when HDMI plugged in, screen ... 17 0
12 XPS 13 7390 2 in 1 bluetooth keyboard and mous... 259 10
13 xps15 7590 wifi problem 46 1
14 Unable to update Windows from 1803 to 1909 - X... 52 5
15 Dell XPS 9300 - Thunderbolt 3 Power Delivery I... 28 0
16 Dell XPS 15 9560, right arrow key or right of ... 26 0
17 XPS 13 2020 (9300) Ubuntu sudden shut down 24 0
18 Dell XPS 15 9750 won’t login 26 0
19 XPS 13 9360 Windows Hello Face - reconfigurati... 29 2
20 Enclosure for Dell XPS 13 9360 512 GB pcie nvm... 181 7
21 XPS 13 7390 Firmware 1.3.1 Issue - Bluetooth /... 119 2
22 SSD Onboard? 77 3
23 XPS 13 9350 only turns on when charger connected 4090 11
24 Integrated webcam not working 45 1
25 Docking station for XPS 15 9570, Dell TB16 not... 53 4
26 Dell XPS 13 9370 34 1
27 XPS 13 9380 overheat while charging 602 3
28 DELL XPS 13 (9300) REALTEK AUDIO DRIVER PROBLEM 214 2
29 XPS 15 9570 freezing Windows 10 222 6
30 XPS 13 (9300) - Speaker Vibration 40 2
31 Dell XPS 15 9570 Fingerprint reader not workin... 158 2
32 XPS 9570 Intel 9260 No Bluetooth 34 0
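As a side note, appending to a DataFrame inside a loop is relatively slow (and DataFrame.append is deprecated in newer pandas); a sketch of the same scrape that collects plain rows first and builds the DataFrame once:
rows = []
for item in g_data:
    topic = item.find_all("h2", {"class": "message-subject"})[0].text.strip()
    replies = item.find_all("span", {"class": "lia-message-stats-count"})[0].text.strip()
    views = item.find_all("span", {"class": "lia-message-stats-count"})[1].text.strip()
    rows.append([topic, views, replies])

df = pd.DataFrame(rows, columns=['Topic', 'Views', 'Replies'])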
I want to use NetworkManager to control wwan0, but I only see ttyUSB2 in the NetworkManager device list.
I used the "cat" command to put ttyUSB2 and ttyUSB3 in use, but it doesn't work.
Below is the ModemManager info:
root@imx6qsabresd:~# mmcli -m 0
/org/freedesktop/ModemManager1/Modem/0 (device id '74d82342f71de5945dddcc381409e231f5ed5df1')
-------------------------
Hardware | manufacturer: 'SIMCOM INCORPORATED'
| model: 'SIMCOM_SIM7600JC-H'
| revision: 'LE11B01SIM7600JC-H'
| supported: 'gsm-umts, lte'
| current: 'gsm-umts, lte'
| equipment id: '861478030131862'
-------------------------
System | device: '/sys/devices/soc0/soc/2100000.aips-bus/2184200.usb/ci_hdrc.1/usb1/1-1/1-1.1'
| drivers: 'option1, simcom_wwan'
| plugin: 'SimTech'
| primary port: 'ttyUSB2'
| ports: 'ttyUSB0 (qcdm), ttyUSB2 (at), ttyUSB3 (at), wwan0 (net)'
-------------------------
Numbers | own : 'unknown'
-------------------------
Status | lock: 'none'
| unlock retries: 'unknown'
| state: 'registered'
| power state: 'on'
| access tech: 'unknown'
| signal quality: '80' (recent)
-------------------------
Modes | supported: 'allowed: 2g; preferred: none
| allowed: 3g; preferred: none
| allowed: 2g, 3g; preferred: none
| allowed: 2g, 3g; preferred: 2g
| allowed: 2g, 3g; preferred: 3g
| allowed: 2g, 3g, 4g; preferred: none'
| current: 'allowed: any; preferred: none'
-------------------------
Bands | supported: 'unknown'
| current: 'unknown'
-------------------------
IP | supported: 'ipv4, ipv6, ipv4v6'
-------------------------
3GPP | imei: '861478030131862'
| enabled locks: 'none'
| operator id: '46692'
| operator name: 'Chunghwa Telecom'
| subscription: 'unknown'
| registration: 'home'
-------------------------
SIM | path: '/org/freedesktop/ModemManager1/SIM/0'
-------------------------
Bearers | paths: 'none'
Does anyone know how to make wwan0 appear in NetworkManager device list?
You see ttyUSB2 in NetworkManager because that is the "primary port" reported by ModemManager in the modem details.
In this specific case, ModemManager doesn't know how to use the WWAN port of the device (exposed by the non-standard simcom_wwan kernel driver), and therefore it will fall back to using PPP over a TTY port.
If you want to use the WWAN port with NM/MM, you must not install the simcom_wwan kernel driver; instead, you should just use the standard qmi_wwan kernel driver. Once you do that, you'll get a cdc-wdm port (that speaks QMI) and an associated WWAN port (in raw-ip mode), and those are used nicely by ModemManager/NetworkManager.
If you ask me, there should be no reason for normal users to use the simcom_wwan kernel driver. If your qmi_wwan driver doesn't expose the QMI ports of the modem, you may just need a newer kernel.
Using Keras with the TensorFlow backend, I am trying to train an LSTM network, and it is taking much longer to run on a GPU than on a CPU.
I am training the LSTM network using the fit_generator function. An epoch takes ~250 seconds on the CPU versus ~900 seconds on the GPU. The packages in my GPU environment include:
keras-applications 1.0.8 py_0 anaconda
keras-base 2.2.4 py36_0 anaconda
keras-gpu 2.2.4 0 anaconda
keras-preprocessing 1.1.0 py_1 anaconda
...
tensorflow 1.13.1 gpu_py36h3991807_0 anaconda
tensorflow-base 1.13.1 gpu_py36h8d69cac_0 anaconda
tensorflow-estimator 1.13.0 py_0 anaconda
tensorflow-gpu 1.13.1 pypi_0 pypi
My CUDA compilation tools are version 9.1.85, and my CUDA and driver versions are:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce RTX 2080 On | 00000000:0A:00.0 Off | N/A |
| 0% 39C P8 5W / 225W | 7740MiB / 7952MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 GeForce RTX 2080 On | 00000000:42:00.0 Off | N/A |
| 0% 33C P8 19W / 225W | 142MiB / 7951MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 49251 C .../whsu014/.conda/envs/whsuphd/bin/python 7729MiB |
| 1 1354 G /usr/lib/xorg/Xorg 16MiB |
| 1 49251 C .../whsu014/.conda/envs/whsuphd/bin/python 113MiB |
+-----------------------------------------------------------------------------+
When I insert this line of code:
tf.Session(config=tf.ConfigProto(log_device_placement=True))
I see the following in my terminal:
...
ining_1/Adam/Const_10: (Const)/job:localhost/replica:0/task:0/device:GPU:0
training_1/Adam/Const_11: (Const): /job:localhost/replica:0/task:0/device:GPU:0
2019-06-25 11:27:31.720653: I tensorflow/core/common_runtime/placer.cc:1059] training_1/Adam/Const_11: (Const)/job:localhost/replica:0/task:0/device:GPU:0
training_1/Adam/add_15/y: (Const): /job:localhost/replica:0/task:0/device:GPU:0
2019-06-25 11:27:31.720666: I tensorflow/core/common_runtime/placer.cc:1059] training_1/Adam/add_15/y: (Const)/job:localhost/replica:0/task:0/device:GPU:0
...
So it seems that TensorFlow is using the GPU.
When I profile the code, these are the first 10 lines on the GPU:
10852017 function calls (10524203 primitive calls) in 184.768 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
16200 173.827 0.011 173.827 0.011 {built-in method _pywrap_tensorflow_internal.TF_SessionRunCallable}
6 0.926 0.154 0.926 0.154 {built-in method _pywrap_tensorflow_internal.TF_SessionMakeCallable}
62 0.813 0.013 0.813 0.013 {built-in method _pywrap_tensorflow_internal.TF_SessionRun_wrapper}
156954 0.414 0.000 0.415 0.000 {built-in method numpy.array}
16200 0.379 0.000 1.042 0.000 training.py:643(_standardize_user_data)
24300 0.338 0.000 0.338 0.000 {method 'partition' of 'numpy.ndarray' objects}
68 0.301 0.004 0.301 0.004 {built-in method _pywrap_tensorflow_internal.ExtendSession}
32458 0.223 0.000 2.122 0.000 tensorflow_backend.py:156(get_session)
3206 0.212 0.000 0.238 0.000 tf_stack.py:31(extract_stack)
76024 0.210 0.000 0.702 0.000 ops.py:5246(get_controller)
...
and these are the first 10 lines on the CPU:
22123473 function calls (21647174 primitive calls) in 60.173 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
16269 42.491 0.003 42.491 0.003 {built-in method tensorflow.python._pywrap_tensorflow_internal.TF_Run}
16269 0.568 0.000 48.964 0.003 session.py:1042(_run)
56 0.532 0.010 0.532 0.010 {built-in method time.sleep}
153641 0.458 0.000 0.460 0.000 {built-in method numpy.core.multiarray.array}
183148/125354 0.447 0.000 1.316 0.000 python_message.py:469(init)
1226659 0.362 0.000 0.364 0.000 {built-in method builtins.getattr}
2302110/2301986 0.339 0.000 0.358 0.000 {built-in method builtins.isinstance}
8 0.285 0.036 0.285 0.036 {built-in method tensorflow.python._pywrap_tensorflow_internal.TF_ExtendGraph}
12150 0.267 0.000 0.271 0.000 callbacks.py:211(on_batch_end)
147026/49078 0.264 0.000 1.429 0.000 python_message.py:1008(ByteSize)
...
This is my code.
# imports implied by the snippet (standalone Keras 2.2.x with the TF 1.13 backend)
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import ModelCheckpoint
from matplotlib import pyplot

def train_generator(x_list, y_list):
    # 0.1 validation split
    train_length = (len(x_list) // 10) * 9
    while True:
        for i in range(train_length):
            # yields one sample at a time (batch size 1)
            train_x = np.array([x_list[i]])
            train_y = np.array([y_list[i]])
            yield train_x, train_y

def val_generator(x_list, y_list):
    # 0.1 validation split
    val_length = len(x_list) // 10
    while True:
        for i in range(-val_length, 0, 1):
            val_x = np.array([x_list[i]])
            val_y = np.array([y_list[i]])
            yield val_x, val_y

with tf.Session(config=tf.ConfigProto(log_device_placement=True)):
    model = Sequential()
    model.add(LSTM(64, return_sequences=False,
                   input_shape=(None, 24)))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')

    checkpointer = ModelCheckpoint(filepath="weights.hdf5",
                                   monitor='val_loss', verbose=1,
                                   save_best_only=True)

    history = model.fit_generator(generator=train_generator(train_x, train_y),
                                  steps_per_epoch=(len(train_x) // 10) * 9,
                                  epochs=5,
                                  validation_data=val_generator(train_x, train_y),
                                  validation_steps=len(train_x) // 10,
                                  callbacks=[checkpointer],
                                  verbose=2, shuffle=False)

    # plot history
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='validation')
    pyplot.legend()
    pyplot.show()
I expect a significant speed-up when using the GPU for training. How can I fix this? Can someone help me understand what is causing the slowdown? Thank you.
A couple of observations:
Use CuDNNLSTM instead of LSTM to train on the GPU; you will see a considerable increase in speed (see the sketch after these observations).
Sometimes, for very small networks, the overhead of transferring data between CPU and GPU outweighs the parallel computation done on the GPU; in other words, more time is lost moving the data than is gained by training on the GPU.
GPUs should be used for highly intensive tasks and computations (very big LSTM / heavy CNN networks). For very small MLPs and even small LSTMs you might observe that the network trains equally fast on CPU and GPU, or that in some particular cases the CPU is even faster (very particular cases with super-small networks).
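For the first observation, a minimal sketch applied to the model above (standalone Keras 2.2.x; CuDNNLSTM requires a CUDA-enabled TensorFlow build and keeps the default tanh/sigmoid activations):
from keras.models import Sequential
from keras.layers import CuDNNLSTM, Dense

model = Sequential()
model.add(CuDNNLSTM(64, return_sequences=False, input_shape=(None, 24)))  # drop-in replacement for the LSTM layer
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')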
UPDATE FOR TENSORFLOW >= 2.0
The stock LSTM/GRU layers default to the CuDNN implementation when a compatible GPU is detected, so there is no need to import CuDNNLSTM/CuDNNGRU explicitly.
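For illustration, a minimal tf.keras sketch for TF >= 2.0 (the built-in LSTM layer picks the cuDNN kernel automatically when a compatible GPU is available and the default activations are used):
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(None, 24)),  # uses the cuDNN implementation on GPU when possible
    tf.keras.layers.Dense(1),
])
model.compile(loss="mae", optimizer="adam")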