PyTorch detach() function failed to execute on a different GPU server

Recently, our lab bought a new server with 9 GPUs, and I want to run my program on this machine. However, without changing my working code, I got an unexpected error like the following.
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1535491974311/work/aten/src/THC/THCGeneral.cpp line=663 error=11 : invalid argument
Traceback (most recent call last):
File "main.py", line 166, in <module>
p_img.copy_(netG(p_z).detach())
File "/usr/local/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/szhangcj/python/GBGAN/celebA_attention/sagan_models.py", line 100, in forward
out,p1 = self.attn1(out)
File "/usr/local/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/szhangcj/python/GBGAN/celebA_attention/sagan_models.py", line 32, in forward
energy = torch.bmm(proj_query,proj_key) # transpose check
RuntimeError: cublas runtime error : the GPU program failed to execute at /opt/conda/conda-bld/pytorch_1535491974311/work/aten/src/THC/THCBlas.cu:411
However, I can run the program successfully on the old machine with 4 GPUs. I am not sure what the problem is; it seems that the error is caused by the detach() function. My code is as follows:
z_b = torch.FloatTensor(opt.batch_size, opt.z_dim).to(device)
img_b = torch.FloatTensor(opt.batch_size, 3, 64, 64).to(device)
img_a = torch.FloatTensor(opt.batch_size, 3, 64, 64).to(device)
p_z = torch.FloatTensor(pool_size, opt.z_dim).to(device)
p_img = torch.FloatTensor(pool_size, 3, 64, 64).to(device)
## evaluation placeholder
show_z_b = torch.FloatTensor(100, opt.z_dim).to(device)
eval_z_b = torch.FloatTensor(250, opt.z_dim).to(device) # 250/batch * 120 --> 300000
optim_D = optim.Adam(netD.parameters(), lr=opt.lr_d) # other param?
optim_G = optim.Adam(netG.parameters(), lr=opt.lr_g) #?suitable
criterion_G = nn.MSELoss()
eta = 1
loss_GD = []
pre_loss = 0
cur_loss = 0
G_epoch = 1
for epoch in range(start_epoch, start_epoch + opt.num_epoch):
    print('Start epoch: %d' % epoch)
    ## input_pool: [pool_size, opt.z_dim] -> [pool_size, 32, 32]
    netD.train()
    netG.eval()
    p_z.normal_()
    # print(netG(p_z).detach().size())
    p_img.copy_(netG(p_z).detach())
    for t in range(opt.period):
        for _ in range(opt.dsteps):
            t = time.time()  # note: this shadows the loop variable t
            ### Update D
            netD.zero_grad()
            ## real
            real_img, _ = next(iter(dataloader))  # [batch_size, 1, 32, 32]
            img_b.copy_(real_img.squeeze().to(device))
            real_D_err = torch.log(1 + torch.exp(-netD(img_b))).mean()
            print("D real loss", netD(img_b).mean())
            # real_D_err.backward()
            ## fake
            z_b_idx = random.sample(range(pool_size), opt.batch_size)
            img_a.copy_(p_img[z_b_idx])
            fake_D_err = torch.log(1 + torch.exp(netD(img_a))).mean()  # torch scalar []
            loss_gp = calc_gradient_penalty(netD, img_b, img_a)
            total_loss = real_D_err + fake_D_err + loss_gp
            print("D fake loss", netD(img_a).mean())
            total_loss.backward()
            optim_D.step()
        ## update input pool
        p_img_t = p_img.clone().to(device)
        p_img_t.requires_grad_(True)
        if p_img_t.grad is not None:
            p_img_t.grad.zero_()
        fake_D_score = netD(p_img_t)
        fake_D_score.backward(torch.ones(len(p_img_t)).to(device))
        p_img = img_truncate(p_img + eta * p_img_t.grad)
        print("The mean of gradient", torch.mean(p_img_t.grad))

The error turned out to be caused by a version mismatch between the new RTX GPU cards and the CUDA version the installed PyTorch build was compiled against; it has nothing to do with detach() itself.
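For anyone hitting the same crash, a quick sanity check is to compare the CUDA version the PyTorch binary was built with against what the new cards support, and to reproduce the failing op in isolation; note the traceback fails inside torch.bmm, not detach(). A minimal diagnostic sketch (the version numbers in the comments are assumptions for this particular build):

import torch

# CUDA/cuDNN versions this PyTorch binary was compiled with; RTX (Turing)
# cards need a CUDA 10+ build, while mid-2018 binaries targeted CUDA 8/9.
print(torch.version.cuda)
print(torch.backends.cudnn.version())

# Card name and compute capability as seen by PyTorch.
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_capability(0))

# Minimal cuBLAS smoke test: isolates the torch.bmm call from the traceback.
a = torch.randn(2, 3, 4, device='cuda')
b = torch.randn(2, 4, 5, device='cuda')
print(torch.bmm(a, b).shape)  # fails with the same cublas error on a mismatched install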

Related

Using BatchedPyEnvironment in tf_agents

I am trying to create a batched environment version of a SAC agent example from the TensorFlow Agents library; the original code can be found here. I am also using a custom environment.
I am pursuing a batched environment setup to better leverage GPU resources and speed up training. My understanding is that by passing batches of trajectories to the GPU, there is less overhead incurred when passing data from the host (CPU) to the device (GPU).
My custom environment is called SacEnv, and I attempt to create a batched environment like so:
py_envs = [SacEnv() for _ in range(0, batch_size)]
batched_env = batched_py_environment.BatchedPyEnvironment(envs=py_envs)
tf_env = tf_py_environment.TFPyEnvironment(batched_env)
My hope is that this will create a batched environment consisting of a 'batch' of non-batched environments. However, I am receiving the following error when running the code:
ValueError: Cannot assign value to variable ' Accumulator:0': Shape mismatch.The variable shape (1,), and the assigned value shape (32,) are incompatible.
with the stack trace:
Traceback (most recent call last):
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 370, in <module>
app.run(main)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 366, in main
train_eval(FLAGS.root_dir)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1605, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1582, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 274, in train_eval
results = metric_utils.eager_compute(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1605, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1582, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/eval/metric_utils.py", line 163, in eager_compute
common.function(driver.run)(time_step, policy_state)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 211, in run
return self._run_fn(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/utils/common.py", line 188, in with_check_resource_vars
return fn(*fn_args, **fn_kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 238, in _run
tf.while_loop(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 154, in loop_body
observer_ops = [observer(traj) for observer in self._observers]
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 154, in <listcomp>
observer_ops = [observer(traj) for observer in self._observers]
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metric.py", line 93, in __call__
return self._update_state(*args, **kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metric.py", line 81, in _update_state
return self.call(*arg, **kwargs)
ValueError: in user code:
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metrics.py", line 176, in call *
self._return_accumulator.assign(
ValueError: Cannot assign value to variable ' Accumulator:0': Shape mismatch.The variable shape (1,), and the assigned value shape (32,) are incompatible.
In call to configurable 'eager_compute' (<function eager_compute at 0x7fa4d6e5e040>)
In call to configurable 'train_eval' (<function train_eval at 0x7fa4c8622dc0>)
I have dug through the tf_metric.py code to try to understand the error, but I have been unsuccessful. A related issue was solved by adding the batch size (32) to the initializer for the AverageReturnMetric instance, and this issue seems related.
The full code is:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python2, python3
r"""Train and Eval SAC.
All hyperparameters come from the SAC paper
https://arxiv.org/pdf/1812.05905.pdf
To run:
```bash
tensorboard --logdir $HOME/tmp/sac/gym/HalfCheetah-v2/ --port 2223 &
python tf_agents/agents/sac/examples/v2/train_eval.py \
--root_dir=$HOME/tmp/sac/gym/HalfCheetah-v2/ \
--alsologtostderr
\```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sac_env import SacEnv
import os
import time
from absl import app
from absl import flags
from absl import logging
import gin
from six.moves import range
import tensorflow as tf # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.agents.sac import tanh_normal_projection_network
from tf_agents.drivers import dynamic_step_driver
#from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment
from tf_agents.environments import batched_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
from tf_agents.train.utils import strategy_utils
flags.DEFINE_string('root_dir', os.getenv('TEST_UNDECLARED_OUTPUTS_DIR'),
                    'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_multi_string('gin_file', None, 'Path to the trainer config files.')
flags.DEFINE_multi_string('gin_param', None, 'Gin binding to pass through.')

FLAGS = flags.FLAGS

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    print(e)
@gin.configurable
def train_eval(
    root_dir,
    env_name='SacEnv',
    # The SAC paper reported:
    # Hopper and Cartpole results up to 1000000 iters,
    # Humanoid results up to 10000000 iters,
    # Other mujoco tasks up to 3000000 iters.
    num_iterations=3000000,
    actor_fc_layers=(256, 256),
    critic_obs_fc_layers=None,
    critic_action_fc_layers=None,
    critic_joint_fc_layers=(256, 256),
    # Params for collect
    # Follow https://github.com/haarnoja/sac/blob/master/examples/variants.py
    # HalfCheetah and Ant take 10000 initial collection steps.
    # Other mujoco tasks take 1000.
    # Different choices roughly keep the initial episodes about the same.
    #initial_collect_steps=10000,
    initial_collect_steps=2000,
    collect_steps_per_iteration=1,
    replay_buffer_capacity=31250,  # 1000000 / 32
    # Params for target update
    target_update_tau=0.005,
    target_update_period=1,
    # Params for train
    train_steps_per_iteration=1,
    #batch_size=256,
    batch_size=32,
    actor_learning_rate=3e-4,
    critic_learning_rate=3e-4,
    alpha_learning_rate=3e-4,
    td_errors_loss_fn=tf.math.squared_difference,
    gamma=0.99,
    reward_scale_factor=0.1,
    gradient_clipping=None,
    use_tf_functions=True,
    # Params for eval
    num_eval_episodes=30,
    eval_interval=10000,
    # Params for summaries and logging
    train_checkpoint_interval=50000,
    policy_checkpoint_interval=50000,
    rb_checkpoint_interval=50000,
    log_interval=1000,
    summary_interval=1000,
    summaries_flush_secs=10,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None):
"""A simple train and eval for SAC."""
root_dir = os.path.expanduser(root_dir)
train_dir = os.path.join(root_dir, 'train')
eval_dir = os.path.join(root_dir, 'eval')
train_summary_writer = tf.compat.v2.summary.create_file_writer(
train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()
eval_summary_writer = tf.compat.v2.summary.create_file_writer(
eval_dir, flush_millis=summaries_flush_secs * 1000)
eval_metrics = [
tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
]
global_step = tf.compat.v1.train.get_or_create_global_step()
with tf.compat.v2.summary.record_if(
lambda: tf.math.equal(global_step % summary_interval, 0)):
py_envs = [SacEnv() for _ in range(0, batch_size)]
batched_env = batched_py_environment.BatchedPyEnvironment(envs=py_envs)
tf_env = tf_py_environment.TFPyEnvironment(batched_env)
eval_py_envs = [SacEnv() for _ in range(0, batch_size)]
eval_batched_env = batched_py_environment.BatchedPyEnvironment(envs=eval_py_envs)
eval_tf_env = tf_py_environment.TFPyEnvironment(eval_batched_env)
time_step_spec = tf_env.time_step_spec()
observation_spec = time_step_spec.observation
action_spec = tf_env.action_spec()
strategy = strategy_utils.get_strategy(tpu=False, use_gpu=True)
with strategy.scope():
actor_net = actor_distribution_network.ActorDistributionNetwork(
observation_spec,
action_spec,
fc_layer_params=actor_fc_layers,
continuous_projection_net=tanh_normal_projection_network
.TanhNormalProjectionNetwork)
critic_net = critic_network.CriticNetwork(
(observation_spec, action_spec),
observation_fc_layer_params=critic_obs_fc_layers,
action_fc_layer_params=critic_action_fc_layers,
joint_fc_layer_params=critic_joint_fc_layers,
kernel_initializer='glorot_uniform',
last_kernel_initializer='glorot_uniform')
tf_agent = sac_agent.SacAgent(
time_step_spec,
action_spec,
actor_network=actor_net,
critic_network=critic_net,
actor_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=actor_learning_rate),
critic_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=critic_learning_rate),
alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=alpha_learning_rate),
target_update_tau=target_update_tau,
target_update_period=target_update_period,
td_errors_loss_fn=td_errors_loss_fn,
gamma=gamma,
reward_scale_factor=reward_scale_factor,
gradient_clipping=gradient_clipping,
debug_summaries=debug_summaries,
summarize_grads_and_vars=summarize_grads_and_vars,
train_step_counter=global_step)
tf_agent.initialize()
    # Make the replay buffer.
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=batch_size,
        max_length=replay_buffer_capacity,
        device="/device:GPU:0")
    replay_observer = [replay_buffer.add_batch]

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(
            buffer_size=num_eval_episodes, batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(
            buffer_size=num_eval_episodes, batch_size=tf_env.batch_size),
    ]

    eval_policy = greedy_policy.GreedyPolicy(tf_agent.policy)
    initial_collect_policy = random_tf_policy.RandomTFPolicy(
        tf_env.time_step_spec(), tf_env.action_spec())
    collect_policy = tf_agent.collect_policy

    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=eval_policy,
        global_step=global_step)
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
        max_to_keep=1,
        replay_buffer=replay_buffer)

    train_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()

    initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=replay_observer + train_metrics,
        num_steps=initial_collect_steps)
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=replay_observer + train_metrics,
        num_steps=collect_steps_per_iteration)

    if use_tf_functions:
      initial_collect_driver.run = common.function(initial_collect_driver.run)
      collect_driver.run = common.function(collect_driver.run)
      tf_agent.train = common.function(tf_agent.train)

    if replay_buffer.num_frames() == 0:
      # Collect initial replay data.
      logging.info(
          'Initializing replay buffer by collecting experience for %d steps '
          'with a random policy.', initial_collect_steps)
      initial_collect_driver.run()
    results = metric_utils.eager_compute(
        eval_metrics,
        eval_tf_env,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )
    if eval_metrics_callback is not None:
      eval_metrics_callback(results, global_step.numpy())
    metric_utils.log_metrics(eval_metrics)

    time_step = None
    policy_state = collect_policy.get_initial_state(tf_env.batch_size)

    timed_at_step = global_step.numpy()
    time_acc = 0

    # Prepare replay buffer as dataset with invalid transitions filtered.
    def _filter_invalid_transition(trajectories, unused_arg1):
      return ~trajectories.is_boundary()[0]

    dataset = replay_buffer.as_dataset(
        sample_batch_size=batch_size,
        num_steps=2).unbatch().filter(
            _filter_invalid_transition).batch(batch_size).prefetch(5)
    # Dataset generates trajectories with shape [Bx2x...]
    iterator = iter(dataset)

    def train_step():
      experience, _ = next(iterator)
      return tf_agent.train(experience)

    if use_tf_functions:
      train_step = common.function(train_step)

    global_step_val = global_step.numpy()
    while global_step_val < num_iterations:
      start_time = time.time()
      time_step, policy_state = collect_driver.run(
          time_step=time_step,
          policy_state=policy_state,
      )
      for _ in range(train_steps_per_iteration):
        train_loss = train_step()
      time_acc += time.time() - start_time

      global_step_val = global_step.numpy()

      if global_step_val % log_interval == 0:
        logging.info('step = %d, loss = %f', global_step_val,
                     train_loss.loss)
        steps_per_sec = (global_step_val - timed_at_step) / time_acc
        logging.info('%.3f steps/sec', steps_per_sec)
        tf.compat.v2.summary.scalar(
            name='global_steps_per_sec', data=steps_per_sec, step=global_step)
        timed_at_step = global_step_val
        time_acc = 0

      for train_metric in train_metrics:
        train_metric.tf_summaries(
            train_step=global_step, step_metrics=train_metrics[:2])

      if global_step_val % eval_interval == 0:
        results = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        if eval_metrics_callback is not None:
          eval_metrics_callback(results, global_step_val)
        metric_utils.log_metrics(eval_metrics)

      if global_step_val % train_checkpoint_interval == 0:
        train_checkpointer.save(global_step=global_step_val)

      if global_step_val % policy_checkpoint_interval == 0:
        policy_checkpointer.save(global_step=global_step_val)

      if global_step_val % rb_checkpoint_interval == 0:
        rb_checkpointer.save(global_step=global_step_val)

    return train_loss
def main(_):
  tf.compat.v1.enable_v2_behavior()
  logging.set_verbosity(logging.INFO)
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
  train_eval(FLAGS.root_dir)


if __name__ == '__main__':
  flags.mark_flag_as_required('root_dir')
  app.run(main)
What is the appropriate way to create a batched environment from a custom, non-batched environment? I can share my custom environment, but I don't believe the issue lies there, as the code works fine with a batch size of 1.
Also, any tips on increasing GPU utilization in reinforcement learning scenarios would be greatly appreciated. I have examined examples of using the TensorBoard profiler to measure GPU utilization, but those seem to require callbacks and a fit function, which don't appear applicable to RL use cases.
It turns out I neglected to pass batch_size when initializing the AverageReturnMetric and AverageEpisodeLengthMetric instances.
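For reference, the corrected metric construction looks like the following (a sketch of just the changed lines; batch_size is the same value used to build the BatchedPyEnvironment):

eval_metrics = [
    tf_metrics.AverageReturnMetric(
        buffer_size=num_eval_episodes, batch_size=batch_size),
    tf_metrics.AverageEpisodeLengthMetric(
        buffer_size=num_eval_episodes, batch_size=batch_size),
]

This mirrors the train_metrics in the code above, which already pass batch_size=tf_env.batch_size; without it, the eval metrics default to a batch size of 1, producing the (1,) vs (32,) shape mismatch in the error.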

skopt gp_minimize error while looking for the optimal point

I have a problem with gp_minimize from skopt. I'm doing hyperparameter tuning for transfer learning (base model VGG19). Everything works fine until gp_minimize starts to evaluate the optimal point (evaluating random points works fine). I get the error below; I have spent a few days on this and still don't know what to do:
Traceback (most recent call last):
File "C:/Users/mea/Train_models/04_VGG19_train_model.py", line 144, in <module>
search_result = gp_minimize(func=fitness,
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\optimizer\gp.py", line 259, in gp_minimize
return base_minimize(
File "C:\Users\Wme\anaconda3\envs\Train_models\lib\site-packages\skopt\optimizer\base.py", line 300, in base_minimize
result = optimizer.tell(next_x, next_y)
File "C:\Users\mea\anaconda3\envs\Train_models\lib\site-packages\skopt\optimizer\optimizer.py", line 493, in tell
return self._tell(x, y, fit=fit)
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\optimizer\optimizer.py", line 552, in _tell
X = self.space.transform(self.space.rvs(
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\space\space.py", line 900, in rvs
columns.append(dim.rvs(n_samples=n_samples, random_state=rng))
File "C:\Users\Weronika Gramacka\anaconda3\envs\Train_models\lib\site-packages\skopt\space\space.py", line 698, in rvs
return self.inverse_transform(list(choices))
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\space\space.py", line 685, in inverse_transform
inv_transform = super(Categorical, self).inverse_transform(Xt)
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\space\space.py", line 168, in inverse_transform
return self.transformer.inverse_transform(Xt)
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\space\transformers.py", line 309, in inverse_transform
X = transformer.inverse_transform(X)
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\space\transformers.py", line 216, in inverse_transform
return [
File "C:\Users\me\anaconda3\envs\Train_models\lib\site-packages\skopt\space\transformers.py", line 217, in <listcomp>
self.inverse_mapping_[int(np.round(i))] for i in Xt
KeyError: 6
Process finished with exit code 1
The code is from a tutorial and looks like this (only the important parts):
dim_num_dense_layers = Integer(low=1, high=3, name='num_dense_layers')
dim_num_dense_nodes = Integer(low=60, high=1500, name='num_dense_nodes')
dim_activation = Categorical(categories=['sigmoid', 'tanh', 'relu', 'softmax'],
                             name='activation')
dim_dropout = Real(low=0.01, high=0.4, name='dropout')
dim_init = Categorical(categories=['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'], name='kernel_initializer')
dim_loss = Categorical(categories=['categorical_crossentropy', 'categorical_hinge', 'mean_squared_error', 'huber_loss'], name='loss')

dimensions = [dim_num_dense_layers,
              dim_num_dense_nodes,
              dim_activation,
              dim_dropout,
              dim_init,
              dim_loss]
default_parameters = [2, 600, 'relu', 0.2, 'uniform', 'huber_loss']

@use_named_args(dimensions=dimensions)
def fitness(num_dense_layers, num_dense_nodes, activation, dropout, kernel_initializer, loss):
    # Print the hyper-parameters.
    keras.backend.clear_session()
    model = create_model(num_dense_layers, num_dense_nodes, activation, dropout, kernel_initializer, loss)
    log_dir = "Tensor_board/04/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(
        log_dir=log_dir,  # not the string literal 'log_dir'
        histogram_freq=1,
        write_graph=True,
        write_grads=False,
        write_images=False)
    history = model.fit(train_data,
                        epochs=1,
                        batch_size=32,
                        validation_data=val_data,
                        callbacks=[tensorboard_callback])
    accuracy = max(history.history['val_accuracy'])
    global best_accuracy
    if accuracy > best_accuracy:
        model.save("Models/vgg19_flat.h5")
        best_accuracy = accuracy
    del model
    gc.collect()
    keras.backend.clear_session()
    return -accuracy

checkpoint_saver = CheckpointSaver("./checkpoint.pkl", compress=9)
search_result = gp_minimize(func=fitness,
                            dimensions=dimensions,
                            acq_func='EI',
                            n_calls=30,
                            n_initial_points=1,
                            x0=default_parameters, verbose=True, callback=[checkpoint_saver])
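For anyone unfamiliar with the decorator used above: use_named_args converts the single list of values that gp_minimize passes to the objective into keyword arguments matching each dimension's name=. A minimal self-contained sketch of that wiring (a toy objective, not the VGG19 setup):

from skopt import gp_minimize
from skopt.space import Categorical, Integer
from skopt.utils import use_named_args

dimensions = [
    Integer(low=1, high=3, name='num_layers'),
    Categorical(categories=['relu', 'tanh'], name='activation'),
]

@use_named_args(dimensions=dimensions)
def objective(num_layers, activation):
    # gp_minimize always calls this with a single list of values;
    # the decorator unpacks it into the named keyword arguments.
    return num_layers * (1.0 if activation == 'relu' else 2.0)

res = gp_minimize(objective, dimensions, n_calls=12, random_state=0)
print(res.x, res.fun)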

TensorFlow: ModelAverageOptimizer problem: got multiple values for keyword argument 'dtype'

I used ModelAverageOptimizer and some problems occurred.
The code is used much like model_average_optimizer_test.py.
I found that the error occurs when ModelAverageOptimizer's initializer creates local_step, and the problem most likely lies with the device_setter.
But I don't know why. Could someone help me?
Have I written custom code: Yes
OS Platform and Distribution: Linux n10-044-067 4.4.0-33.bm.1-amd64 #1 SMP Thu, 22 Jun 2017 11:19:55 +0800 x86_64 GNU/Linux
TensorFlow installed from: Anaconda2.5, pip install tensorflow_gpu
TensorFlow version: 1.8.0
Bazel version: N/A
CUDA/cuDNN version: CUDA-9.0, cuDNN-7.1.2(installed in anaconda2.5 also)
GPU model and memory: GeForce GTX 1080
Exact command to reproduce:
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=worker --task_id=0
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=worker --task_id=1
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=server --task_id=0
You may notice that the error below occurs when one worker initializes the ModelAverageOptimizer class.
Traceback (most recent call last):
File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 112, in tf.app.run()
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 126, in run
_sys.exit(main(argv))
File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 109, in main
test()
File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 84, in test
interval_steps=3)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/contrib/opt/python/training/model_average_optimizer.py", line 139, in init name="local_step")
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1317, in get_variable
constraint=constraint)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1079, in get_variable
constraint=constraint)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 417, in get_variable
return custom_getter(**custom_getter_kwargs)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/contrib/opt/python/training/model_average_optimizer.py", line 92, in call
return getter(name, trainable, collections, *args, **kwargs)
TypeError: _true_getter() got multiple values for keyword argument 'dtype'
The code is below:
import os
import time
import json
import copy
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.framework import constant_op  # provides constant_op.constant used in Model()
from tensorflow.contrib.opt.python.training import model_average_optimizer

flags = tf.flags
flags.DEFINE_string("server_hosts", "", "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", "", "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("job_name", "", "Either 'server' or 'worker'")
flags.DEFINE_integer("task_id", 0, "Task Id for Each workers")
FLAGS = flags.FLAGS
tf.logging.set_verbosity(tf.logging.INFO)

def workers_ps_creator(args):
    ps_hosts = args.server_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    num_workers = len(worker_hosts)
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    gpu_options = tf.GPUOptions(allocator_type='BFC', allow_growth=True)
    if args.job_name == "server":
        server_def = tf.train.ServerDef(cluster=cluster.as_cluster_def(),
                                        job_name='ps',
                                        task_index=args.task_id,
                                        default_session_config=tf.ConfigProto(gpu_options=gpu_options, device_count={"GPU": 0}),
                                        protocol="grpc")
    elif args.job_name == "worker":
        server_def = tf.train.ServerDef(cluster=cluster.as_cluster_def(),
                                        job_name="worker",
                                        task_index=args.task_id,
                                        default_session_config=tf.ConfigProto(gpu_options=gpu_options),
                                        protocol="grpc")
    server = tf.train.Server(server_def)
    return server, cluster, num_workers, gpu_options

def Model(opt):
    if FLAGS.task_id == 0:
        var_0 = tf.get_variable(initializer=0.0, name='v0')
        var_1 = tf.get_variable(initializer=1.0, name='v1')
        grads_0 = constant_op.constant(-1.0)
        grads_1 = constant_op.constant(-1.0)
    else:
        var_0 = tf.get_variable(initializer=7.0, name='v0')
        var_1 = tf.get_variable(initializer=8.0, name='v1')
        grads_0 = constant_op.constant(-2.0)
        grads_1 = constant_op.constant(-2.0)
    train_op = opt.apply_gradients([[grads_0, var_0], [grads_1, var_1]],
                                   global_step=tf.train.get_or_create_global_step())
    return train_op

def test():
    server, cluster, num_workers, gpu_options = workers_ps_creator(FLAGS)
    if FLAGS.job_name == "server":
        server.join()
    elif FLAGS.job_name == "worker":
        is_chief = (FLAGS.task_id == 0)
        # Between-graph replication
        worker_device = "/job:worker/task:%d" % (FLAGS.task_id)
        ma_custom = model_average_optimizer.ModelAverageCustomGetter(worker_device=worker_device)
        from tensorflow.python.training import device_setter
        with tf.device(
                device_setter.replica_device_setter(
                    cluster=cluster,
                    worker_device=worker_device,
                    ps_device="/job:ps")), \
                tf.variable_scope("", custom_getter=ma_custom):
            # create model
            lr = tf.Variable(1, trainable=False)
            opt = tf.train.GradientDescentOptimizer(lr)
            sync_opt = model_average_optimizer.ModelAverageOptimizer(
                opt=opt,
                num_worker=num_workers,
                ma_custom_getter=ma_custom,
                is_chief=is_chief,
                interval_steps=3)
            tf.logging.info('model start')
            train_op = Model(sync_opt)  # named train_op so sess.run(train_op) below resolves
            tf.logging.info('model end')
            ma_hook = sync_opt.make_session_run_hook()
        sess_config = tf.ConfigProto(gpu_options=gpu_options)
        sess_config.log_device_placement = False
        sess_config.allow_soft_placement = True
        all_hooks = [ma_hook]
        tf.logging.info('Start Sess')
        with tf.train.MonitoredTrainingSession(master=server.target,
                                               is_chief=is_chief,
                                               hooks=all_hooks) as sess:
            tf.logging.info("is chief: %s, len: %s", is_chief, num_workers)
            for i in range(4):
                sess.run(train_op)
                pp1 = sess.run(tf.get_default_graph().get_tensor_by_name('v0:0'))
                pp2 = sess.run(tf.get_default_graph().get_tensor_by_name('v1:0'))
                tf.logging.info("%d %.2f %.2f" % (FLAGS.task_id, pp1, pp2))
        tf.logging.info("done")

No op named GatherTree when using BeamSearchDecoder

I'm implementing a Seq2Seq model with TensorFlow. My code works with the greedy decoder, but when I switched to BeamSearchDecoder to improve performance, I encountered this error:
Traceback (most recent call last):
File "/Users/MichaelChen/Projects/CN-isA-Relation-Extraction/isa_seq2seq/predict.py", line 83, in <module>
out_file='result/result_wc_4.out', checkpoint=checkpoint)
File "/Users/MichaelChen/Projects/CN-isA-Relation-Extraction/isa_seq2seq/predict.py", line 48, in predict
loader = tf.train.import_meta_graph(checkpoint + '.meta')
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1686, in import_meta_graph
**kwargs)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/meta_graph.py", line 504, in import_scoped_meta_graph
producer_op_list=producer_op_list)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/importer.py", line 283, in import_graph_def
raise ValueError('No op named %s in defined operations.' % node.op)
ValueError: No op named GatherTree in defined operations.
This error occurred when I used the infer module to generate outputs:
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    src_seq_len = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    tgt_seq_len = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    for i in range(len(text)):
        if len(text[i].strip()) < 1:
            continue
        text_seq = src2seq_word(text[i], True)
        answer_logits = sess.run(logits, {input_data: [text_seq] * batch_size,
                                          tgt_seq_len: [len(text_seq)] * batch_size,
                                          src_seq_len: [len(text_seq)] * batch_size}
                                 )[0]
        pred_res = "".join([pp.id2c[i] for i in answer_logits if i != pad and i != eos])
The program fails at loader = tf.train.import_meta_graph(checkpoint + '.meta').
I don't know whether I'm handling the decoder outputs correctly, so here is the corresponding code:
# 5. Predicting decoder
# Share params with Training Decoder
tiled_dec_start_state = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(encoder_outputs, beam_width)
tiled_src_seq_len = tf.contrib.seq2seq.tile_batch(src_seq_len, beam_width)
with tf.variable_scope('decode', reuse=True):
    batch_size_tensor = tf.constant(batch_size)
    beam_decoder_cell = get_decoder_cell(tiled_encoder_outputs, tiled_src_seq_len, 2 * num_units)
    beam_initial_state = beam_decoder_cell.zero_state(batch_size_tensor * beam_width, tf.float32)
    beam_initial_state = beam_initial_state.clone(cell_state=tiled_dec_start_state)
    start_tokens = tf.tile(tf.constant([c2id['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')
    predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
        cell=beam_decoder_cell,
        embedding=decoder_embeddings,
        start_tokens=start_tokens,
        end_token=c2id['<EOS>'],
        initial_state=beam_initial_state,
        beam_width=beam_width,
        output_layer=output_layer
    )
    predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=predicting_decoder, maximum_iterations=max_tgt_seq_len)
Handling outputs:
training_decoder_output, predicting_decoder_output = seq2seq_model(params...)
training_logits = tf.identity(training_decoder_output.rnn_output, name='logits')
predicting_logits = tf.identity(predicting_decoder_output.predicted_ids[:,:,0], name='predictions')
Also, I found the following in the nmt model at https://github.com/tensorflow/nmt/blob/77e6c55052ba31a8d733c94bb820d091c8156d35/nmt/model.py (line 391):
if beam_width > 0:
    logits = tf.no_op()
    sample_id = outputs.predicted_ids
else:
    logits = outputs.rnn_output
    sample_id = outputs.sample_id
Does this have something to do with my error?
Thanks in advance!
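One thing worth checking (an educated guess from the ValueError, not something visible in the posted code): GatherTree is an op registered by the tf.contrib.seq2seq package, and contrib ops are only added to the op registry once that module has been imported. If the prediction script never touches tf.contrib.seq2seq, import_meta_graph has no definition for the op. A minimal sketch of the inference-side change:

import tensorflow as tf
# Importing the contrib beam-search ops registers GatherTree
# before the meta graph is parsed.
from tensorflow.contrib.seq2seq.python.ops import beam_search_ops  # noqa: F401

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)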

Export Model for Serving but Tensor Type Mismatch

What related GitHub issues or StackOverflow threads have you found by searching the web for your problem?
I am trying to export the model for serving, but it reports a type error about the input tensors.
Yet in the export and predict parts, the inputs are given the same type.
If possible, provide a minimal reproducible example (We usually don't have time to read hundreds of lines of your code)
Here is sample code for my export:
```
named_graph_signature = {
    'inputs': exporter.generic_signature({
        'sparse_index': tf.placeholder(tf.int64, name="feature_index"),
        'sparse_ids': tf.placeholder(tf.int64, name="feature_ids"),
        'sparse_values': tf.placeholder(tf.int64, name="feature_values"),
        'sparse_shape': tf.placeholder(tf.int64, name="feature_shape")
    }),
    'outputs': exporter.generic_signature({
        'prob': inference_softmax
    })
}
model_exporter.init(
    sess.graph.as_graph_def(),
    #default_graph_signature=named_graph_signature,
    named_graph_signatures=named_graph_signature,
    init_op=init_op)
model_exporter.export(export_path, tf.constant(export_version), sess)
print('Done exporting!')
```
Here is my code for predicting:
```
ins = "0 142635:1 250810:1 335229:1 375278:1 392970:1 506983:1 554566:1 631968:1 647823:1 658803:1 733446:1 856305:1 868202:1"
FEATURE_SIZE = 1000000
tokens = ins.split(" ")
feature_num = 0
feature_ids = []
feature_values = []
feature_index = []
for feature in tokens[1:]:
    feature_id, feature_value = feature.split(":")
    feature_ids.append(int(feature_id))
    feature_values.append(float(feature_value))
    feature_index.append([1, feature_num])
    feature_num += 1
feature_shape = [1, FEATURE_SIZE]
sparse_index = tf.contrib.util.make_tensor_proto(numpy.asarray(feature_index), dtype=tf.int64)
sparse_ids = tf.contrib.util.make_tensor_proto(numpy.asarray(feature_ids), dtype=tf.int64)
sparse_values = tf.contrib.util.make_tensor_proto(numpy.asarray(feature_values), dtype=tf.float32)
sparse_shape = tf.contrib.util.make_tensor_proto(numpy.asarray(feature_shape), dtype=tf.int64)
channel = implementations.insecure_channel(host, port)
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = model_name
request.model_spec.version.value = model_version
print model_name,model_version
request.inputs['sparse_index'].CopyFrom(sparse_index)
request.inputs['sparse_ids'].CopyFrom(sparse_ids)
request.inputs['sparse_values'].CopyFrom(sparse_values)
request.inputs['sparse_shape'].CopyFrom(sparse_shape)
# Send request
result = stub.Predict(request, request_timeout)
```
Logs or other output that would be helpful
(If logs are large, please upload as attachment or provide link).
```
Traceback (most recent call last):
File "run.py", line 63, in <module>
main()
File "run.py", line 59, in main
result = stub.Predict(request, request_timeout)
File "/home/serving/.local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 305, in __call__
self._request_serializer, self._response_deserializer)
File "/home/serving/.local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 203, in _blocking_unary_unary
raise _abortion_error(rpc_error_call)
grpc.framework.interfaces.face.face.AbortionError: AbortionError(code=StatusCode.INVALID_ARGUMENT, details="input size does not match signature")
```
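One discrepancy visible in the two snippets above that may be worth ruling out (an observation, not a confirmed fix): the export declares sparse_values as a tf.int64 placeholder, while the request encodes it as tf.float32. A minimal sketch keeping the request consistent with the signature as exported:
```
# Match the dtype declared in the exported signature
# ('sparse_values' was exported above as tf.int64).
sparse_values = tf.contrib.util.make_tensor_proto(
    numpy.asarray(feature_values, dtype=numpy.int64),
    dtype=tf.int64,
    shape=[len(feature_values)])
```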