dimensions must equal error but they are equal - tensorflow

I added a print to the "discriminator_loss" function to see what was going on. at first it will tell me the shape of both are 16. later it tells me the shape of "real_loss" is only 15 while the other stays 16. So far I have only tried lowering the batchsize's and increasing them by 1 ect. I have provided the most relevant parts of the code. I can provide the rest of the code if needed. I have no clue why this is happening and it breaks the code.
with strategy.scope():
BATCH_SIZE = 16
GLOBAL_BATCH_SIZE = 32#batchsize*# of gpus
im_size = 256
latent_size = 512
with strategy.scope():
cross_entropy = tf.keras.losses.BinaryCrossentropy(
from_logits=True,\
reduction = tf.keras.losses.Reduction.NONE)
#this is used to evaluate discriminators ability to discriminate
def discriminator_loss(real_output, fake_output):
real_loss = cross_entropy(tf.ones_like(real_output), real_output)#compares prediction to actual value of 1
fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)#compares rediction to actual value of 0
print(real_loss)
print(fake_loss)
total_loss = real_loss + fake_loss
total_loss = total_loss/GLOBAL_BATCH_SIZE
return total_loss
#how well was generator able to trick discriminator
def generator_loss(fake_output):
gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)#compares predictions to the expected value 1 of a real image
gen_loss = gen_loss / GLOBAL_BATCH_SIZE
return gen_loss
with strategy.scope():
EPOCHS = 80
noise_dim = 512
num_examples_to_generate = 32
# We will reuse this seed overtime (so it's easier)
# to visualize progress in the animated GIF)
with strategy.scope():
def noise(n):
return tf.random.normal([n, latent_size])
def noiseImage(n):
return tf.random.uniform([n, im_size, im_size, 1])
#seed = tf.random.normal([num_examples_to_generate, noise_dim])
#seed used to generate image>the discriminator than classifies real images from training set and a set of generated images>loss is calculated and gradients are used to update the model
# Notice the use of `tf.function`
# This annotation causes the function to be "compiled".
with strategy.scope():
##tf.function
def train_step(images):
with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
real_output = discriminator(images, training=True)
fake_output = discriminator(generated_images, training=True)
g_loss = generator_loss(fake_output)#runs generator loss
d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
D_grads = disc_tape.gradient(d_loss, discriminator.trainable_variables)
generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
discriminator_optimizer.apply_gradients(zip(D_grads, discriminator.trainable_variables))
#run g_optim twice to make sure d_loss doesn't go to zero
with tf.GradientTape() as gen_tape:
generated_imgs = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
fake_output = discriminator(generated_imgs, training=True)
g_loss = generator_loss(fake_output)
G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
return g_loss, d_loss
#tf.function
def distributed_train_step(dist_dataset):
per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
total_g_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_g_losses,axis=0)
total_d_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_d_losses,axis=0)
return total_g_loss, total_d_loss
with strategy.scope():
def train(dist_dataset, epochs):
for epoch in range(epochs):
start = time.time()
for image_batch in dist_dataset:
total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
with strategy.scope():
train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
error and traceback
Traceback (most recent call last):
File "C:\image generator\pixiv\#image generator.py", line 507, in <module>
train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
File "C:\image generator\pixiv\#image generator.py", line 441, in train
total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2419, in __call__
graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2777, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2667, in _create_graph_function
capture_by_value=self._capture_by_value),
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 981, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 441, in wrapped_fn
return weak_wrapped_fn().__wrapped__(*args, **kwds)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 968, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
C:\image generator\pixiv\#image generator.py:419 distributed_train_step *
per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
C:\image generator\pixiv\#image generator.py:393 train_step *
d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
C:\image generator\pixiv\#image generator.py:328 discriminator_loss *
total_loss = real_loss + fake_loss
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:984 binary_op_wrapper
return func(x, y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1276 _add_dispatch
return gen_math_ops.add_v2(x, y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py:483 add_v2
"AddV2", x=x, y=y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
attrs=attr_protos, op_def=op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:595 _create_op_internal
compute_device)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:3327 _create_op_internal
op_def=op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1817 __init__
control_input_ops, op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1657 _create_c_op
raise ValueError(str(e))
ValueError: Dimensions must be equal, but are 0 and 2 for '{{node replica_1/add}} = AddV2[T=DT_FLOAT](replica_1/binary_crossentropy_1/weighted_loss/Mul, replica_1/binary_crossentropy_2/weighted_loss/Mul)' with input shapes: [0], [2].

So according to comments the problem lies in unequal batch sizes, due to the final batch being smaller than the specified batch size. I believe this is due to this line:
generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
where the constant size BATCH_SIZE is used, instead of the actual input shape of the batch, so that generated_images is of a different shape than images.
So one solution as mentioned is simply to use drop_remainder=True in batch(). However it might be better to get the generator to output images of the same shape as the input, so instead of passing BATCH_SIZE as argument to your noise generation functions, you should use the actual size of the input batch. So maybe using tf.shape(images)[0] would help. Alternatively, you could generate a fixed batch of images with BATCH_SIZE, and then simply discard any extra images, like
num_images = tf.shape(images)[0]
generated_images = generated_images[:num_images]

Related

TypeError: Can not convert a NoneType into a Tensor or Operation -- Error believe related to converting to graph

Below find my model:
class CustomModel(tf.keras.Model):
def __init__(self, model1, model2, model3, model4):
super(deep_and_wide, self).__init__()
self.model1 = model1
self.model2 = model2
self.model3 = model3
self.model4 = model4
def call(self, inputs):
x1 = self.mode1([inputs["a"], inputs["b"]])
x2 = self.model2([inputs["a"], inputs["b"]])
x3 = self.model3([inputs["a"], inputs["b"]])
x4 = self.model4([inputs["a"], inputs["b"]])
x = Concatenate()([x1, x2, x3])
x = TimeDistributed(Dense(2))(x)
x = Add()([x, x4])
x_fc = Dense(1)(x)
x_ec = Dense(1)(x)
return x_fc, x_ec
def train_step(self, data):
with tf.GradientTape() as tape:
data = data_adapter.expand_1d(data)
batch_inputs, batch_outputs, sample_weight= data_adapter.unpack_x_y_sample_weight(data)
y_true_fc, y_true_ec = batch_outputs["y_fc"], batch_outputs["y_ec"]
y_pred_fc, y_pred_ec = self(batch_inputs, training=True)
loss_fc = self.compiled_loss(y_true_fc, y_pred_fc)
loss_ec = self.compiled_loss(y_true_ec, y_pred_ec)
print("here")
trainable_variables = self.trainable_variables
print("here")
gradients = tape.gradient([loss_fc, loss_ec], trainable_variables)
print("here")
self.optimizer.apply_gradients(zip(gradients, trainable_variables))
print("here")
And below is my custom loss
class CustomLoss(tf.keras.losses.Loss):
def __init__(self, mask=True, alpha=1, beta=1, gamma=1, dtype=tf.float64):
super(CustomLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE)
self.mask = mask
self.alpha = alpha
self.beta = beta
self.gamma = gamma
self.dtype = dtype
def call(self, y_true, y_pred):
def loss_fn(y_true, y_pred, mask):
y_true = tf.boolean_mask(y_true, mask)
y_pred = tf.boolean_mask(y_pred, mask)
return tf.keras.losses.MSE(y_true, y_pred)
self.mask = tf.not_equal(y_true, 0.)
y_true = tf.cast(y_true, self.dtype)
y_pred = tf.cast(y_pred, self.dtype)
y_pred = tf.multiply(y_pred, tf.cast(self.mask, dtype=self.dtype))
y_pred_cum = tf.math.cumsum(y_pred, axis=1)
y_pred_cum = tf.multiply(y_pred_cum, tf.cast(self.mask, dtype=self.dtype))
y_true_cum = tf.math.cumsum(y_true, axis=1)
y_true_cum = tf.multiply(y_true_cum, tf.cast(self.mask, dtype=self.dtype))
loss_value = self.alpha * loss_fn(y_true, y_pred, self.mask) + \
self.gamma * loss_fn(y_true_cum, y_pred_cum, self.mask)
return loss_value
And then finally:
optimizer = tf.keras.optimizers.Adam()
loss = CustomLoss()
model.compile(optimizer, loss)
model.fit(train_data, epochs=5, validation_data=val_data)
My data inputs are of size (sequence length, feature length) where sequence length is variable hence I am using tf.data.experimental.bucket_by_sequence_length to pad to max sequence length of the batch (as opposed to batch to max sequence length). All in all, my train and val data are tf.data.Datasets each created using tf.data.experimental.bucket_by_sequence_length where each batch is of size (None, None, feature length).
When I run the above code, I get the following errors and cannot seem to understand where I am going wrong:
Traceback (most recent call last):
File "<input>", line 75, in <module>
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1100, in fit
tmp_logs = self.train_function(iterator)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 828, in __call__
result = self._call(*args, **kwds)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 871, in _call
self._initialize(args, kwds, add_initializers_to=initializers)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 725, in _initialize
self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\function.py", line 2969, in _get_concrete_function_internal_garbage_collected
graph_function, _ = self._maybe_define_function(args, kwargs)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\function.py", line 3361, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\function.py", line 3196, in _create_graph_function
func_graph_module.func_graph_from_py_func(
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\func_graph.py", line 990, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 634, in wrapped_fn
out = weak_wrapped_fn().__wrapped__(*args, **kwds)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\func_graph.py", line 977, in wrapper
raise e.ag_error_metadata.to_exception(e)
TypeError: in user code:
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function *
return step_function(self, iterator)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
return fn(*args, **kwargs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py:790 run_step **
with ops.control_dependencies(_minimum_control_deps(outputs)):
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:5359 control_dependencies
return get_default_graph().control_dependencies(control_inputs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\func_graph.py:362 control_dependencies
return super(FuncGraph, self).control_dependencies(filtered_control_inputs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:4815 control_dependencies
c = self.as_graph_element(c)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:3726 as_graph_element
return self._as_graph_element_locked(obj, allow_tensor, allow_operation)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:3814 _as_graph_element_locked
raise TypeError("Can not convert a %s into a %s." %
TypeError: Can not convert a NoneType into a Tensor or Operation.
The four print statements inserted in the train_step function above are printed.
This NoneType refers to the returned value of the custom train_step, when using a custom train_step you should return something that can be converted into a tensor so that the minimum control dependencies can process it, typically, the loss value as {"loss": loss_value} and potentially some other metrics, or at least an empty dict {}.

Tensorflow predictor: Specifying the serving_input_receiver_fn

I want to build a predictor from a an tf.estimator.Estimator model. Therefore I need to specify a input_receiver_fn that specifies the preprocessing graph from the receiver tensors to the features that will be passed to the model_fn by the predictor.
Here is an example for an eval_input_fn for the Estimator:
def eval_input_fn(params):
ds = tf.data.Dataset.from_generator(
generator=Eval_Generator(params),
output_types=(tf.uint16,tf.uint16),
output_shapes = ([3]+params['crop_size'],[2]+params['crop_size']))
augmentations = [Convert,Downsample,Clip]
ds = ds.repeat()
for augmentation in augmentations:
ds = ds.map(augmentation, num_parallel_calls=params['threads'])
ds = ds.batch(1).prefetch(None)
return ds
I changed the augmentation functions from taking in two arguments (features: tf.Tensor, labels: tf.Tensor) to taking only one argument (features: tf.Tensor) and wrote the according input_receiver_fn that looks like this:
def serving_input_receiver_fn():
rec_raw = tf.placeholder(tf.float32, [3, 256, 256, 256],name='raw')
raw = Convert(rec_raw)
raw = Downsample(raw)
raw = Clip(raw)
raw = tf.expand_dims(raw,0)
return tf.estimator.export.TensorServingInputReceiver(features=raw,receiver_tensors=rec_raw)
The function returns the following object:
TensorServingInputReceiver(features=<tf.Tensor 'ExpandDims_1:0' shape=(1, 3, 128, 128, 128) dtype=float32>, receiver_tensors={'input': <tf.Tensor 'raw:0' shape=(3, 256, 256, 256) dtype=float32>}, receiver_tensors_alternatives=None)
which seems pretty right. But when it try to instantiate the predictor by:
config = tf.estimator.RunConfig(model_dir = params['model_dir'])
estimator = tf.estimator.Estimator(model_fn=model_fn, params=params,config=config)
predict_fn = tf.contrib.predictor.from_estimator(estimator, serving_input_receiver_fn)
I'll get the following error message:
INFO:tensorflow:Calling model_fn.
Traceback (most recent call last):
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 510, in _apply_op_helper
preferred_dtype=default_dtype)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1146, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 229, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 208, in constant
value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 430, in make_tensor_proto
raise ValueError("None values not supported.")
ValueError: None values not supported.
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/contrib/predictor/predictor_factories.py", line 105, in from_estimator
config=config)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/contrib/predictor/core_estimator_predictor.py", line 72, in __init__
serving_input_receiver, estimator, output_key)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/contrib/predictor/core_estimator_predictor.py", line 37, in _get_signature_def
estimator.config)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 235, in public_model_fn
return self._call_model_fn(features, labels, mode, config)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1195, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/fast/AG_Kainmueller/jrumber/flylight_01/train_tf.py", line 227, in model_fn
gt,fg = tf.unstack(labels,num=2,axis=1)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1027, in unstack
return gen_array_ops.unpack(value, num=num, axis=axis, name=name)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 9429, in unpack
"Unpack", value=value, num=num, axis=axis, name=name)
File "/home/jrumber/anaconda3/envs/tf1.12_gpuenv/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 528, in _apply_op_helper
(input_name, err))
ValueError: Tried to convert 'value' to a tensor and failed. Error: None values not supported.
Since it could be a problem with my model_fn, I'll post it too:
def model_fn(features,labels,mode,params):
gt,fg = tf.unstack(labels,num=2,axis=1)
gt.set_shape([1]+params['input_size'])
fg.set_shape([1]+params['input_size'])
features.set_shape([1,3]+params['input_size'])
# first layer to set input_shape
features = tf.keras.layers.Conv3D(
input_shape = tuple([3]+params['input_size']),
data_format = 'channels_first',
filters = params['chan'],
kernel_size = [3,3,3],
strides=(1, 1, 1),
padding='same',
activation='relu',
kernel_regularizer=tf.keras.regularizers.l2(l=0.01))(features)
# U-Net
out = unet(features, params['unet_initial_filters'], params['width_factor'], params['architecture'])
# Embedding conv pass
output_batched = conv_pass(
out,
kernel_size=1,
num_fmaps=params['chan'],
num_repetitions=1,
activation=None,
name='conv_embedding')
output = tf.squeeze(output_batched)
# Fg/Bg segmentation conv pass
mask_batched = conv_pass(
out,
kernel_size=1,
num_fmaps=1,
num_repetitions=1,
activation='sigmoid',
name='conv_mask')
prob_mask = tf.squeeze(mask_batched)
logits_mask = logit(prob_mask)
# store predictions in dict
predictions = {
'prob_mask': tf.expand_dims(prob_mask,0),
'embedding': output,
'gt': tf.squeeze(gt,0)}
# RAIN mode
if mode == tf.contrib.learn.ModeKeys.TRAIN:
loss , l_var, l_dist, l_reg = discriminative_loss_single(prediction=output,
correct_label=tf.squeeze(gt),
feature_dim=params['chan'],
delta_v= params['delta_v'],
delta_d= params['delta_d'],
param_var= params['param_var'],
param_dist= params['param_dist'],
param_reg= params['param_reg']
)
mask_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.squeeze(fg),
logits=logits_mask))
reg_loss = tf.losses.get_regularization_loss() * 1e-6
loss += mask_loss + reg_loss
opt = tf.train.AdamOptimizer(
learning_rate=0.5e-4,
beta1=0.95,
beta2=0.999,
epsilon=1e-8)
optimizer = opt.minimize(loss, global_step=tf.train.get_global_step())
global_step = tf.Variable(1, name='global_step', trainable=False, dtype=tf.int32)
increment_global_step_op = tf.assign(global_step, global_step+1)
logging_hook = tf.train.LoggingTensorHook({"loss" : loss,'global_step':increment_global_step_op}, every_n_iter=1)
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss, train_op=optimizer, training_hooks=[logging_hook])
# PREDICT mode
if mode == tf.estimator.ModeKeys.PREDICT:
export_outputs = {
'predict_output': tf.estimator.export.PredictOutput(predictions)
}
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=export_outputs)
# EVAL mode
if mode == tf.estimator.ModeKeys.EVAL:
export_outputs = {
'eval_output': tf.estimator.export.EvalOutput(predictions)
}
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=export_outputs)
Does anybody spot my mistake here?
Best :)
The error was in the model_fn. The following lines have to be moved down to the # TRAIN mode part of the function
gt,fg = tf.unstack(labels,num=2,axis=1)
gt.set_shape([1]+params['input_size'])
fg.set_shape([1]+params['input_size'])
Estimator.predict will feed only the features and None instead of labels, therefore tf.unstack will throw an exception, so all operations that work on the labels have to be moved to the # train mode part of the model_fn.

InvalidArgumentError: Matrix size-incompatible: In[0]: [256,2048], In[1]: [256,1024]

I have been getting this error and i cant figure out the reason. if anyone could help would be great.
this is my code:
import numpy as np
import pickle
import os
import download
#from dataset import one_hot_encoded
#from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from random import shuffle
data_path = "D:/Personal details/Internship/"
# Width and height of each image.
img_size = 32
# Number of channels in each image, 3 channels: Red, Green, Blue.
num_channels = 3
# Length of an image when flattened to a 1-dim array.
img_size_flat = img_size * img_size * num_channels
# Number of classes.
num_classes = 10
# Number of files for the training-set.
_num_files_train = 5
# Number of images for each batch-file in the training-set.
_images_per_file = 10000
def _get_file_path(filename=""):
return os.path.join(data_path, "cifar-10-batches-py/", filename)
def _unpickle(filename):
file_path = _get_file_path(filename)
print("Loading data: " + file_path)
with open(file_path, mode='rb') as file:
# In Python 3.X it is important to set the encoding,
# otherwise an exception is raised here.
data = pickle.load(file, encoding='bytes')
return data
def _convert_images(raw):
# Convert the raw images from the data-files to floating-points.
raw_float = np.array(raw, dtype=float) / 255.0
# Reshape the array to 4-dimensions.
images = raw_float.reshape([-1, num_channels, img_size, img_size])
# Reorder the indices of the array.
images = images.transpose([0, 2, 3, 1])
return images
def _load_data(filename):
# Load the pickled data-file.
data = _unpickle(filename)
# Get the raw images.
raw_images = data[b'data']
# Get the class-numbers for each image. Convert to numpy-array.
cls = np.array(data[b'labels'])
# Convert the images.
images = _convert_images(raw_images)
return images, cls
def load_class_names():
# Load the class-names from the pickled file.
raw = _unpickle(filename="batches.meta")[b'label_names']
# Convert from binary strings.
names = [x.decode('utf-8') for x in raw]
return names
def load_training_data():
images = np.zeros(shape=[_num_images_train, img_size, img_size, num_channels], dtype=float)
cls = np.zeros(shape=[_num_images_train], dtype=int)
# Begin-index for the current batch.
begin = 0
# For each data-file.
for i in range(_num_files_train):
# Load the images and class-numbers from the data-file.
images_batch, cls_batch = _load_data(filename="data_batch_" + str(i + 1))
# Number of images in this batch.
num_images = len(images_batch)
# End-index for the current batch.
end = begin + num_images
# Store the images into the array.
images[begin:end, :] = images_batch
# Store the class-numbers into the array.
cls[begin:end] = cls_batch
# The begin-index for the next batch is the current end-index.
begin = end
return images, cls, one_hot_encoded(class_numbers=cls, num_classes=num_classes)
def load_test_data():
images, cls = _load_data(filename="test_batch")
return images, cls, one_hot_encoded(class_numbers=cls, num_classes=num_classes)
########################################################################
def one_hot_encoded(class_numbers, num_classes=None):
if num_classes is None:
num_classes = np.max(class_numbers) + 1
return np.eye(num_classes, dtype=float)[class_numbers]
class_names = load_class_names()
images_train, cls_train, labels_train = load_training_data()
images_test, cls_test, labels_test = load_test_data()
images_train_train = images_train[0:45000]
validation_train = images_train[45000:50000]
labels_train_train = labels_train[0:45000]
validation_labels = labels_train[45000:]
print(len(images_train_train))
print(len(validation_train))
##print(class_names)
##print(len(images_train))
##print(cls_train)
##print(labels_train)
##print(cls_test)
##print(labels_test)
n_classes = len(class_names)
batch_size = 128
x = tf.placeholder(tf.float32, shape=[None, 32, 32, 3], name='x')
y = tf.placeholder(tf.float32, shape=[None, n_classes], name='y_true')
def conv2d(x,W):
return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')
def maxpool2d(x):
return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
def convolutional_neural_network(x):
weights = {'W_conv1': tf.Variable(tf.random_normal([3,3,3,64])),
'W_conv2': tf.Variable(tf.random_normal([3,3,64,128])),
'W_conv3': tf.Variable(tf.random_normal([3,3,128,256])),
'W_conv4': tf.Variable(tf.random_normal([3,3,256,256])),
'W_fc1': tf.Variable(tf.random_normal([256,1024])),
'W_fc2': tf.Variable(tf.random_normal([1024,1024])),
'soft_max': tf.Variable(tf.random_normal([1024, n_classes]))}
biases = {'b_conv1': tf.Variable(tf.random_normal([64])),
'b_conv2': tf.Variable(tf.random_normal([128])),
'b_conv3': tf.Variable(tf.random_normal([256])),
'b_conv4': tf.Variable(tf.random_normal([256])),
'b_fc1': tf.Variable(tf.random_normal([1024])),
'b_fc2': tf.Variable(tf.random_normal([1024])),
'soft_max': tf.Variable(tf.random_normal([n_classes]))}
conv1 = tf.nn.relu(conv2d(x, weights['W_conv1']) + biases['b_conv1'])
conv1 = maxpool2d(conv1)
conv2 = tf.nn.relu(conv2d(conv1, weights['W_conv2']) + biases['b_conv2'])
conv2 = maxpool2d(conv2)
conv3 = tf.nn.relu(conv2d(conv2, weights['W_conv3']) + biases['b_conv3'])
conv4 = tf.nn.relu(conv2d(conv3, weights['W_conv4']) + biases['b_conv4'])
conv4 = maxpool2d(conv4)
fc1 = tf.reshape(conv4,[256,-1])
fc1 = tf.nn.relu(tf.matmul(fc1, weights['W_fc1']) + biases['b_fc1'])
fc2 = tf.nn.relu(tf.matmul(fc1, weights['W_fc2'] + biases['b_fc2']))
soft_max = tf.matmul(fc2, weights['soft_max']) + biases['soft_max']
return soft_max
def train_neural_network(x):
prediction = convolutional_neural_network(x)
cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits = prediction,labels = y) )
optimizer = tf.train.AdamOptimizer().minimize(cost)
hm_epochs = 3
with tf.Session() as sess:
sess.run(tf.initialize_all_variables())
for epoch in range(hm_epochs):
epoch_loss = 0
i = 0
while i < len(images_train_train):
start = i
end = i+batch_size
batch_x = np.array(images_train_train[start:end])
batch_y = np.array(labels_train_train[start:end])
_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
epoch_loss += c
print('Epoch', epoch, 'completed out of',hm_epochs,'loss:',epoch_loss)
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
print('Accuracy:',accuracy.eval({x:validation_train, y:validation_labels}))
train_neural_network(x)
Ans this is the error i have been getting.
WARNING:tensorflow:From D:/Personal details/Internship/cifar-10v1.0.py:310: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.
See #{tf.nn.softmax_cross_entropy_with_logits_v2}.
WARNING:tensorflow:From C:\Python35\lib\site-packages\tensorflow\python\util\tf_should_use.py:118: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Traceback (most recent call last):
File "C:\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1322, in _do_call
return fn(*args)
File "C:\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1307, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "C:\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1409, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Matrix size-incompatible: In[0]: [256,2048], In[1]: [256,1024]
[[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Reshape, Variable_4/read)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/Personal details/Internship/cifar-10v1.0.py", line 344, in <module>
train_neural_network(x)
File "D:/Personal details/Internship/cifar-10v1.0.py", line 327, in train_neural_network
_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
File "C:\Python35\lib\site-packages\tensorflow\python\client\session.py", line 900, in run
run_metadata_ptr)
File "C:\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "C:\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1316, in _do_run
run_metadata)
File "C:\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Matrix size-incompatible: In[0]: [256,2048], In[1]: [256,1024]
[[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Reshape, Variable_4/read)]]
Caused by op 'MatMul', defined at:
File "<string>", line 1, in <module>
File "C:\Python35\lib\idlelib\run.py", line 130, in main
ret = method(*args, **kwargs)
File "C:\Python35\lib\idlelib\run.py", line 357, in runcode
exec(code, self.locals)
File "D:/Personal details/Internship/cifar-10v1.0.py", line 344, in <module>
train_neural_network(x)
File "D:/Personal details/Internship/cifar-10v1.0.py", line 309, in train_neural_network
prediction = convolutional_neural_network(x)
File "D:/Personal details/Internship/cifar-10v1.0.py", line 300, in convolutional_neural_network
fc1 = tf.nn.relu(tf.matmul(fc1, weights['W_fc1']) + biases['b_fc1'])
File "C:\Python35\lib\site-packages\tensorflow\python\ops\math_ops.py", line 2122, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "C:\Python35\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 4567, in mat_mul
name=name)
File "C:\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "C:\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 3392, in create_op
op_def=op_def)
File "C:\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1718, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Matrix size-incompatible: In[0]: [256,2048], In[1]: [256,1024]
[[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Reshape, Variable_4/read)]]
It looks like the problem is in convolutional_neural_network layer() function wherein somehow it is mad at not being able to multiply the same dimension of the matrix. But it is not clear how to solve the issue
Thank you for the help in advance...
After reshaping conv4 at line fc1 = tf.reshape(conv4,[256,-1]), the shape of fc1 is (256, 2048) and the weight matrix W_fc1 has shape (256, 1024). Thus, you get a size incompatible error at the next line fc1 = tf.nn.relu(tf.matmul(fc1, weights['W_fc1']) + biases['b_fc1'])
in the matrix multiplication part. I suggest you to go through the dimensions at every step manually to find errors in future.

LSTM tensorflow shape error?

I have a Class for RNN that can classify the sentences into positive or negative for sentiment analysis. Because the length of sentences are different, so I used placeholder named text_len to be a vector of sentence lengths in one batch when feeding the training batch data:
class TextRNN:
def __init__(self, hidden_size, num_classes, text_length, vocab_size,
embed_size, l2_lambda=0.001):
self.input_data = tf.placeholder(tf.int32,[None,text_length],name="input_data")
self.output_label = tf.placeholder(tf.float32,[None,num_classes],name="output_label")
self.dropout_rate = tf.placeholder(tf.float32,name="dropout_rate")
self.text_len = tf.placeholder(tf.int32, [None])
with tf.name_scope("embedding"):
self.W = tf.Variable(tf.random_uniform([vocab_size,embed_size],-1.0,1.0),trainable=True,name="W")
self.word_embeddings = tf.nn.embedding_lookup(self.W, self.input_data)
with tf.name_scope("hidden"):
lstm_fw_cell=rnn.BasicLSTMCell(hidden_size)
lstm_bw_cell=rnn.BasicLSTMCell(hidden_size)
lstm_fw_cell=rnn.DropoutWrapper(lstm_fw_cell,output_keep_prob=self.dropout_rate)
lstm_bw_cell=rnn.DropoutWrapper(lstm_bw_cell,output_keep_prob=self.dropout_rate)
outputs,_=tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell,inputs=self.word_embeddings,sequence_length=self.text_len,dtype=tf.float32)
In my main method:
for ex in range(1,3):
with tf.Graph().as_default():
sess = tf.Session()
with sess.as_default():
rnn = TextRNN(
hidden_size=100,
num_classes=2,
text_length=160,
vocab_size=len(vocab_processor.vocabulary_),
embed_size=100,
l2_lambda=0.001)
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(1e-3)
grads_and_vars = optimizer.compute_gradients(rnn.loss)
train_op = optimizer.apply_gradients(grads_and_vars,global_step=global_step)
sess.run(tf.global_variables_initializer())
vocabulary = vocab_processor.vocabulary_
initW = None
initW = loadw2v.convert_glove(vocabulary, pretrained_embeddings, 100)
sess.run(rnn.W.assign(initW))
#training step
def train_step(x_batch, y_batch, start_index, end_index):
feed_dict = {rnn.input_data:x_batch,rnn.output_label:y_batch,rnn.dropout_rate:0.5,rnn.text_len:x_train_len[start_index:end_index]}
_, step, loss, test = sess.run([train_op, global_step, rnn.loss, rnn.text_len], feed_dict)
because I want to feed a vector of sequence length into RNN model, so there is a rnn.text_len:x_train_len[start_index:end_index] when using tf.nn.bidirectional_dynamic_rnn, x_train_len[start_index:end_index] is a vector that saves the length of sequence, start_index is the index of batch start, end_index is the index of batch end. The error is:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn
status, run_metadata)
File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py", line 88, in __exit__
next(self.gen)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: seq_lens(45) > input.dims(1)
[[Node: hidden/bidirectional_rnn/bw/ReverseSequence = ReverseSequence[T=DT_FLOAT, Tlen=DT_INT32, batch_dim=0, seq_dim=1, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding/embedding_lookup, _arg_Placeholder_0_0)]]
I don't understand this error, any idea?

Cannot read predictions returned by estimator.predict in tensorflow

I am new to tensorflow and I have been following some of the documentation on how to create our own estimator and training the model. I was able to define a custom model and train the model. But when I try to predict and read the values I am unable to read the value returned by estimator.predict. Below is my code sample
import numpy as np
import tensorflow as tf
# Declare list of features, we only have one real-valued feature
def model_fn(features, labels, mode):
# Build a linear model and predict values
W = tf.get_variable("W", [1], dtype=tf.float64)
b = tf.get_variable("b", [1], dtype=tf.float64)
y = W*features['x'] + b
# Loss sub-graph
loss = tf.reduce_sum(tf.square(y - labels))
# Training sub-graph
global_step = tf.train.get_global_step()
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = tf.group(optimizer.minimize(loss),
tf.assign_add(global_step, 1))
# EstimatorSpec connects subgraphs we built to the
# appropriate functionality.
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=y,
loss=loss,
train_op=train)
estimator = tf.estimator.Estimator(model_fn=model_fn)
# define our data sets
x_train = np.array([1., 2., 3., 4.])
y_train = np.array([0., -1., -2., -3.])
x_eval = np.array([2., 5., 8., 1.])
y_eval = np.array([-1.01, -4.1, -7, 0.])
input_fn = tf.estimator.inputs.numpy_input_fn(
{"x":x_train}, y_train, batch_size=4, num_epochs=None, shuffle=True)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
{"x":x_train}, y_train, batch_size=4, num_epochs=1000, shuffle=False)
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
{"x":x_eval}, y_eval, batch_size=4, num_epochs=1000, shuffle=False)
estimate_input_fn = tf.estimator.inputs.numpy_input_fn({"x": x_eval},shuffle=False)
# train
estimator.train(input_fn=input_fn, steps=1000)
# Here we evaluate how well our model did.
train_metrics = estimator.evaluate(input_fn=train_input_fn)
eval_metrics = estimator.evaluate(input_fn=eval_input_fn)
estimate_metrics = estimator.predict(input_fn=estimate_input_fn, predict_keys=['y'])
print(list(estimate_metrics))
this throws an error
Traceback (most recent call last):
File "/Users/hdattada/PycharmProjects/TensorFlowIntro/custom_model_linear.py", line 49, in <module>
predictions = list(itertools.islice(estimate_metrics, 15))
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 339, in predict
model_fn_lib.ModeKeys.PREDICT)
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn
features=features, labels=labels, **kwargs)
File "/Users/hdattada/PycharmProjects/TensorFlowIntro/custom_model_linear.py", line 12, in model_fn
loss = tf.reduce_sum(tf.square(y - labels))
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py", line 829, in binary_op_wrapper
y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 676, in convert_to_tensor
as_ref=False)
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 741, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 113, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 102, in constant
tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/Users/hdattada/.virtualenvs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/tensor_util.py", line 364, in make_tensor_proto
raise ValueError("None values not supported.")
ValueError: None values not supported.
I am using TF1.2, could someone please help me out?
The labels will be none when calling Estimator.predict, so the graph will build error when calling model_fn, you can change you model_fn as follows:
def model_fn(features, labels, mode):
# Build a linear model and predict values
W = tf.get_variable("W", [1], dtype=tf.float64)
b = tf.get_variable("b", [1], dtype=tf.float64)
y = W*features['x'] + b
loss = None
train = None
if labels is not None:
# Loss sub-graph
loss = tf.reduce_sum(tf.square(y - labels))
# Training sub-graph
global_step = tf.train.get_global_step()
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = tf.group(optimizer.minimize(loss),
tf.assign_add(global_step, 1))
# EstimatorSpec connects subgraphs we built to the
# appropriate functionality.
return tf.estimator.EstimatorSpec(
mode=mode,
predictions={"y":y},
loss=loss,
train_op=train)
Better solution is to check if you are in prediction mode and if so to exit the model_fn before you calculate the loss or any labels like this:
...
y = W*features['x'] + b
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=y)
loss = tf.reduce_sum(tf.square(y - labels))
...