TypeError: Can not convert a NoneType into a Tensor or Operation -- Error believe related to converting to graph - tensorflow

Below find my model:
class CustomModel(tf.keras.Model):
def __init__(self, model1, model2, model3, model4):
super(deep_and_wide, self).__init__()
self.model1 = model1
self.model2 = model2
self.model3 = model3
self.model4 = model4
def call(self, inputs):
x1 = self.mode1([inputs["a"], inputs["b"]])
x2 = self.model2([inputs["a"], inputs["b"]])
x3 = self.model3([inputs["a"], inputs["b"]])
x4 = self.model4([inputs["a"], inputs["b"]])
x = Concatenate()([x1, x2, x3])
x = TimeDistributed(Dense(2))(x)
x = Add()([x, x4])
x_fc = Dense(1)(x)
x_ec = Dense(1)(x)
return x_fc, x_ec
def train_step(self, data):
with tf.GradientTape() as tape:
data = data_adapter.expand_1d(data)
batch_inputs, batch_outputs, sample_weight= data_adapter.unpack_x_y_sample_weight(data)
y_true_fc, y_true_ec = batch_outputs["y_fc"], batch_outputs["y_ec"]
y_pred_fc, y_pred_ec = self(batch_inputs, training=True)
loss_fc = self.compiled_loss(y_true_fc, y_pred_fc)
loss_ec = self.compiled_loss(y_true_ec, y_pred_ec)
print("here")
trainable_variables = self.trainable_variables
print("here")
gradients = tape.gradient([loss_fc, loss_ec], trainable_variables)
print("here")
self.optimizer.apply_gradients(zip(gradients, trainable_variables))
print("here")
And below is my custom loss
class CustomLoss(tf.keras.losses.Loss):
def __init__(self, mask=True, alpha=1, beta=1, gamma=1, dtype=tf.float64):
super(CustomLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE)
self.mask = mask
self.alpha = alpha
self.beta = beta
self.gamma = gamma
self.dtype = dtype
def call(self, y_true, y_pred):
def loss_fn(y_true, y_pred, mask):
y_true = tf.boolean_mask(y_true, mask)
y_pred = tf.boolean_mask(y_pred, mask)
return tf.keras.losses.MSE(y_true, y_pred)
self.mask = tf.not_equal(y_true, 0.)
y_true = tf.cast(y_true, self.dtype)
y_pred = tf.cast(y_pred, self.dtype)
y_pred = tf.multiply(y_pred, tf.cast(self.mask, dtype=self.dtype))
y_pred_cum = tf.math.cumsum(y_pred, axis=1)
y_pred_cum = tf.multiply(y_pred_cum, tf.cast(self.mask, dtype=self.dtype))
y_true_cum = tf.math.cumsum(y_true, axis=1)
y_true_cum = tf.multiply(y_true_cum, tf.cast(self.mask, dtype=self.dtype))
loss_value = self.alpha * loss_fn(y_true, y_pred, self.mask) + \
self.gamma * loss_fn(y_true_cum, y_pred_cum, self.mask)
return loss_value
And then finally:
optimizer = tf.keras.optimizers.Adam()
loss = CustomLoss()
model.compile(optimizer, loss)
model.fit(train_data, epochs=5, validation_data=val_data)
My data inputs are of size (sequence length, feature length) where sequence length is variable hence I am using tf.data.experimental.bucket_by_sequence_length to pad to max sequence length of the batch (as opposed to batch to max sequence length). All in all, my train and val data are tf.data.Datasets each created using tf.data.experimental.bucket_by_sequence_length where each batch is of size (None, None, feature length).
When I run the above code, I get the following errors and cannot seem to understand where I am going wrong:
Traceback (most recent call last):
File "<input>", line 75, in <module>
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1100, in fit
tmp_logs = self.train_function(iterator)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 828, in __call__
result = self._call(*args, **kwds)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 871, in _call
self._initialize(args, kwds, add_initializers_to=initializers)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 725, in _initialize
self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\function.py", line 2969, in _get_concrete_function_internal_garbage_collected
graph_function, _ = self._maybe_define_function(args, kwargs)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\function.py", line 3361, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\function.py", line 3196, in _create_graph_function
func_graph_module.func_graph_from_py_func(
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\func_graph.py", line 990, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\eager\def_function.py", line 634, in wrapped_fn
out = weak_wrapped_fn().__wrapped__(*args, **kwds)
File "C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\func_graph.py", line 977, in wrapper
raise e.ag_error_metadata.to_exception(e)
TypeError: in user code:
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function *
return step_function(self, iterator)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
return fn(*args, **kwargs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\keras\engine\training.py:790 run_step **
with ops.control_dependencies(_minimum_control_deps(outputs)):
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:5359 control_dependencies
return get_default_graph().control_dependencies(control_inputs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\func_graph.py:362 control_dependencies
return super(FuncGraph, self).control_dependencies(filtered_control_inputs)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:4815 control_dependencies
c = self.as_graph_element(c)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:3726 as_graph_element
return self._as_graph_element_locked(obj, allow_tensor, allow_operation)
C:\Users\\Anaconda3\envs\tf_recsys\lib\site-packages\tensorflow\python\framework\ops.py:3814 _as_graph_element_locked
raise TypeError("Can not convert a %s into a %s." %
TypeError: Can not convert a NoneType into a Tensor or Operation.
The four print statements inserted in the train_step function above are printed.

This NoneType refers to the returned value of the custom train_step, when using a custom train_step you should return something that can be converted into a tensor so that the minimum control dependencies can process it, typically, the loss value as {"loss": loss_value} and potentially some other metrics, or at least an empty dict {}.

Related

Tape gradient gives wrong output

I am trying to compute gradient using tape.gradient() but it gives me wrong answer. The error is in the lines u_z=tape.gradient(u,z,unconnected_gradients=tf.UnconnectedGradients.ZERO) and two lines that follow it from below code. The function u is not constant in the variables z,f,t but the output from computing tape.gradient(u,z) or tape.gradient(u,t) gives me a None object. If I pass unconnected_gradients=tf.UnconnectedGradients.ZERO as the argument, then I get 0.0 as the derivative, which does not make sense. So one thing that might have gone wrong is that the network gets disconnected but I cannot understand why this happens and how to fix it. I am using tensorflow 2.6.0 and keras 2.6.0. I provide the code and error message below.
import tensorflow as tf
import numpy as np
from tensorflow import keras
import os
from tqdm import trange
import matplotlib.pyplot as plt
# Switch of unnecessary TF warning messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
class Model():
def __init__(self):
self.optimizer = keras.optimizers.Adam()
self.initializer = tf.keras.initializers.HeNormal()
self.batchSize = 500
self.number_epochs=5000
def NN(self,num_layers = 3, num_neurons = 30):
model_ = keras.models.Sequential()
model_.add(keras.layers.Dense(num_neurons,activation='tanh',input_dim=3,kernel_initializer = self.initializer))
for layer in range(num_layers-1):
model_.add(keras.layers.Dense(num_neurons,activation='tanh',kernel_initializer=self.initializer))
model_.add(keras.layers.Dense(1,kernel_initializer=self.initializer))
return model_
def solve_pde(self,value_function,X,idx):
z,f,t = X[:,0:1],X[:,1:2],X[:,2:3]
with tf.GradientTape(persistent=True) as tape:
u = value_function(tf.concat([z,f,t],axis=1))
u_z = tape.gradient(u,z,unconnected_gradients=tf.UnconnectedGradients.ZERO)
u_zz = tape.gradient(u_z,z,unconnected_gradients=tf.UnconnectedGradients.ZERO)
u_t = tape.gradient(u,t)
u_pde = u_t + u_z + u_zz - tf.cast(0.5,dtype=tf.float32) * u
return u_pde
def loss_function(self,batchSize):
z = tf.linspace(0.001,0.999, 200)
f = tf.linspace(0.1,0.2, 20)
z_tile = tf.tile(tf.expand_dims(z,axis=-1),multiples=[20,1])
f_tile = tf.reshape(tf.repeat(f,200),[-1,1])
dt = 0.9
X=tf.concat((z_tile,f_tile,tf.reshape(tf.repeat(dt,z_tile.shape[0]),[-1,1])),axis=1)
X_pde = tf.concat((z_tile,f_tile,tf.random.uniform(shape=(z_tile.shape[0],1),minval=0,maxval=dt)),axis=1)
x_star = tf.concat((z_tile,f_tile,tf.reshape(tf.repeat(0.0,z_tile.shape[0]),[-1,1])),axis=1)
idx = np.random.choice(X.shape[0],batchSize,replace=True)
loss_e = self.solve_pde(self.value_function_e,X_pde,idx)
self.value_updated = self.value_function_e(tf.concat[x_star[:,0:1],x_star[:,1:2],x_star[:,2:3]]).numpy().reshape(self.innerStep.Nz,self.innerStep.Nf).transpose()
return loss_e
#tf.function
def training_step(self):
with tf.GradientTape(persistent=True) as tape:
loss_e = self.loss_function(self.batchSize)
grads_valueE = tape.gradient(loss_e,self.theta_valueFunction_e)
self.optimizer.apply_gradients(zip(grads_valueE,self.theta_valueFunction_e))
return loss_e
def train_model(self):
self.value_function_e = self.NN()
self.theta_valueFunction_e = self.value_function_e.trainable_variables
self.LVF= []
for epoch in trange(self.number_epochs):
print(epoch)
loss_e = self.training_step()
self.LVF_list.append(loss_e.numpy())
if __name__=="__main__":
ext = Model()
ext.train_model()
The error message along with full traceback is
Traceback (most recent call last):
File "<ipython-input-26-f5a127c3c9ae>", line 1, in <module>
runfile('C:/Users/user/Google Drive/S/Research Project4/trial.py', wdir='C:/Users/user/Google Drive/SFI/Research Project4')
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/user/Google Drive/SFI/Research Project4/trial.py", line 85, in <module>
ext.train_model()
File "C:/Users/user/Google Drive/SFI/Research Project4/trial.py", line 79, in train_model
loss_e = self.training_step()
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 862, in __call__
return self._python_function(*args, **kwds)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\eager\function.py", line 3985, in bound_method_wrapper
return wrapped_fn(weak_instance(), *args, **kwargs)
File "C:/Users/user/Google Drive/SFI/Research Project4/trial.py", line 65, in training_step
loss_e = self.loss_function(self.batchSize)
File "C:/Users/user/Google Drive/SFI/Research Project4/trial.py", line 58, in loss_function
loss_e = self.solve_pde(self.value_function_e,X_pde,idx)
File "C:/Users/user/Google Drive/SFI/Research Project4/trial.py", line 34, in solve_pde
u_pde = u_t + u_z + u_zz - tf.cast(0.5,dtype=tf.float32) * u
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1399, in r_binary_op_wrapper
y, x = maybe_promote_tensors(y, x)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1335, in maybe_promote_tensors
ops.convert_to_tensor(tensor, dtype, name="x"))
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\profiler\trace.py", line 163, in wrapped
return func(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1566, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 346, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 272, in constant
allow_broadcast=True)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 283, in _constant_impl
return _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 308, in _constant_eager_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 106, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
ValueError: Attempt to convert a value (None) with an unsupported type (<class 'NoneType'>) to a Tensor.
Any help is much appreciated. Thank you.
You have 2 problems in your code which prevents you from getting the result you want.
If you want to compute higher-order derivatives you have to create nested GradientTape objects
GradientTape automatically track variables in its context, if you want to track tensors (as in your case, you want to track z and t) you have to call tape.watch(<my_tensor>) otherwise you will not have gradients for it.
Fixed code:
def solve_pde(self, value_function, X, idx):
z, f, t = X[:, 0:1], X[:, 1:2], X[:, 2:3]
with tf.GradientTape(persistent=True) as tape:
tape.watch(z)
with tf.GradientTape(persistent=True) as tape2:
tape2.watch(z)
tape2.watch(t)
u = value_function(tf.concat([z, f, t], axis=1))
u_z = tape2.gradient(u, z)
u_zz = tape.gradient(u_z, z)
u_t = tape2.gradient(u, t)
u_pde = u_t + u_z + u_zz - tf.cast(0.5, dtype=tf.float32) * u
return u_pde
More on gradient tape can be found in the official documentation: https://www.tensorflow.org/api_docs/python/tf/GradientTape

(tf2.keras) InternalError: Recorded operation 'GradientReversalOperator' returned too few gradients. Expected 3 but received 2

My code is available on github.
I wrote a custom gradient layer as follow:
#tf.custom_gradient
def GradientReversalOperator(x, lambdal):
def grad(dy):
return lambdal * tf.negative(dy)
return x, grad
class GradientReversalLayer(tf.keras.layers.Layer):
def __init__(self, lambdal):
super(GradientReversalLayer, self).__init__()
self.lambdal = lambdal
def call(self, inputs):
return GradientReversalOperator(inputs, self.lambdal)
If I remove lambdal, everything works fine. But when I add it back, I get the error:
InternalError: Recorded operation 'GradientReversalOperator' returned too few gradients. Expected 3 but received 2
Some answers report I should make one more fake return value, but the error becomes "too many gradients". The Traceback is as follow:
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\contextlib.py", line 130, in exit
self.gen.throw(type, value, traceback)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2804, in variable_creator_scope
yield
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1695, in train_on_batch
logs = train_function(iterator)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\def_function.py", line 780, in call
result = self._call(*args, **kwds)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\def_function.py", line 823, in _call
self._initialize(args, kwds, add_initializers_to=initializers)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\def_function.py", line 697, in _initialize
*args, **kwds))
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\function.py", line 2855, in _get_concrete_function_internal_garbage_collected
graph_function, _, _ = self._maybe_define_function(args, kwargs)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\function.py", line 3213, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\function.py", line 3075, in _create_graph_function
capture_by_value=self._capture_by_value),
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\framework\func_graph.py", line 986, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\def_function.py", line 600, in wrapped_fn
return weak_wrapped_fn().wrapped(*args, **kwds)
File "D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\framework\func_graph.py", line 973, in wrapper
raise e.ag_error_metadata.to_exception(e)
tensorflow.python.framework.errors_impl.InternalError: in user code:
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function *
return step_function(self, iterator)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
return fn(*args, **kwargs)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step **
outputs = model.train_step(data)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\training.py:757 train_step
self.trainable_variables)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\training.py:2722 _minimize
gradients = tape.gradient(loss, trainable_variables)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\backprop.py:1073 gradient
unconnected_gradients=unconnected_gradients)
D:\Users\xiqxi\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\eager\imperative_grad.py:77 imperative_grad
compat.as_str(unconnected_gradients.value))
InternalError: Recorded operation 'GradientReversalOperator' returned too few gradients. Expected 3 but received 2
I had the same problem,
try this:
class GradientReversal(Layer):
'''Flip the sign of gradient during training.'''
#tf.custom_gradient
def grad_reverse(self, x):
y = tf.identity(x)
def custom_grad(dy):
return -self.hp_lambda * dy
return y, custom_grad
# --------------------------------------
def __init__(self, hp_lambda, **kwargs):
super(GradientReversal, self).__init__(**kwargs)
self.hp_lambda = K.variable(hp_lambda, dtype='float32', name='hp_lambda')
# --------------------------------------
def call(self, x, mask=None):
return self.grad_reverse(x)
# --------------------------------------
def set_hp_lambda(self,hp_lambda):
#self.hp_lambda = hp_lambda
K.set_value(self.hp_lambda, hp_lambda)
# --------------------------------------
def increment_hp_lambda_by(self,increment):
new_value = float(K.get_value(self.hp_lambda)) + increment
K.set_value(self.hp_lambda, new_value)
# --------------------------------------
def get_hp_lambda(self):
return float(K.get_value(self.hp_lambda))

dimensions must equal error but they are equal

I added a print to the "discriminator_loss" function to see what was going on. at first it will tell me the shape of both are 16. later it tells me the shape of "real_loss" is only 15 while the other stays 16. So far I have only tried lowering the batchsize's and increasing them by 1 ect. I have provided the most relevant parts of the code. I can provide the rest of the code if needed. I have no clue why this is happening and it breaks the code.
with strategy.scope():
BATCH_SIZE = 16
GLOBAL_BATCH_SIZE = 32#batchsize*# of gpus
im_size = 256
latent_size = 512
with strategy.scope():
cross_entropy = tf.keras.losses.BinaryCrossentropy(
from_logits=True,\
reduction = tf.keras.losses.Reduction.NONE)
#this is used to evaluate discriminators ability to discriminate
def discriminator_loss(real_output, fake_output):
real_loss = cross_entropy(tf.ones_like(real_output), real_output)#compares prediction to actual value of 1
fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)#compares rediction to actual value of 0
print(real_loss)
print(fake_loss)
total_loss = real_loss + fake_loss
total_loss = total_loss/GLOBAL_BATCH_SIZE
return total_loss
#how well was generator able to trick discriminator
def generator_loss(fake_output):
gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)#compares predictions to the expected value 1 of a real image
gen_loss = gen_loss / GLOBAL_BATCH_SIZE
return gen_loss
with strategy.scope():
EPOCHS = 80
noise_dim = 512
num_examples_to_generate = 32
# We will reuse this seed overtime (so it's easier)
# to visualize progress in the animated GIF)
with strategy.scope():
def noise(n):
return tf.random.normal([n, latent_size])
def noiseImage(n):
return tf.random.uniform([n, im_size, im_size, 1])
#seed = tf.random.normal([num_examples_to_generate, noise_dim])
#seed used to generate image>the discriminator than classifies real images from training set and a set of generated images>loss is calculated and gradients are used to update the model
# Notice the use of `tf.function`
# This annotation causes the function to be "compiled".
with strategy.scope():
##tf.function
def train_step(images):
with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
real_output = discriminator(images, training=True)
fake_output = discriminator(generated_images, training=True)
g_loss = generator_loss(fake_output)#runs generator loss
d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
D_grads = disc_tape.gradient(d_loss, discriminator.trainable_variables)
generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
discriminator_optimizer.apply_gradients(zip(D_grads, discriminator.trainable_variables))
#run g_optim twice to make sure d_loss doesn't go to zero
with tf.GradientTape() as gen_tape:
generated_imgs = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
fake_output = discriminator(generated_imgs, training=True)
g_loss = generator_loss(fake_output)
G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
return g_loss, d_loss
#tf.function
def distributed_train_step(dist_dataset):
per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
total_g_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_g_losses,axis=0)
total_d_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_d_losses,axis=0)
return total_g_loss, total_d_loss
with strategy.scope():
def train(dist_dataset, epochs):
for epoch in range(epochs):
start = time.time()
for image_batch in dist_dataset:
total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
with strategy.scope():
train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
error and traceback
Traceback (most recent call last):
File "C:\image generator\pixiv\#image generator.py", line 507, in <module>
train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
File "C:\image generator\pixiv\#image generator.py", line 441, in train
total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2419, in __call__
graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2777, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2667, in _create_graph_function
capture_by_value=self._capture_by_value),
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 981, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 441, in wrapped_fn
return weak_wrapped_fn().__wrapped__(*args, **kwds)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 968, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
C:\image generator\pixiv\#image generator.py:419 distributed_train_step *
per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
C:\image generator\pixiv\#image generator.py:393 train_step *
d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
C:\image generator\pixiv\#image generator.py:328 discriminator_loss *
total_loss = real_loss + fake_loss
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:984 binary_op_wrapper
return func(x, y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1276 _add_dispatch
return gen_math_ops.add_v2(x, y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py:483 add_v2
"AddV2", x=x, y=y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
attrs=attr_protos, op_def=op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:595 _create_op_internal
compute_device)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:3327 _create_op_internal
op_def=op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1817 __init__
control_input_ops, op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1657 _create_c_op
raise ValueError(str(e))
ValueError: Dimensions must be equal, but are 0 and 2 for '{{node replica_1/add}} = AddV2[T=DT_FLOAT](replica_1/binary_crossentropy_1/weighted_loss/Mul, replica_1/binary_crossentropy_2/weighted_loss/Mul)' with input shapes: [0], [2].
So according to comments the problem lies in unequal batch sizes, due to the final batch being smaller than the specified batch size. I believe this is due to this line:
generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
where the constant size BATCH_SIZE is used, instead of the actual input shape of the batch, so that generated_images is of a different shape than images.
So one solution as mentioned is simply to use drop_remainder=True in batch(). However it might be better to get the generator to output images of the same shape as the input, so instead of passing BATCH_SIZE as argument to your noise generation functions, you should use the actual size of the input batch. So maybe using tf.shape(images)[0] would help. Alternatively, you could generate a fixed batch of images with BATCH_SIZE, and then simply discard any extra images, like
num_images = tf.shape(images)[0]
generated_images = generated_images[:num_images]

Error when using tf.compat.v1.nn.rnn_cell.LSTMCell in tensorflow2.0

I am trying to use lstm with projection in tensorflow2.0 .
I am writing a custom model, however I am getting error
ValueError: Dimensions must be equal, but are 4096 and 2288 for 'transducer/rnn_model/encoder_lstm1/MatMul' (op: 'MatMul') with input shapes: [4,4096], [2288,8192].
Below is the code:
class RNNModel(tf.keras.Model):
def __init__(self, input_size, vocab_size, hidden_size=2048, num_layers=8, dropout=.2, blank=0, bidirectional=False):
super(RNNModel, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.vocab_size = vocab_size
self.blank = blank
self.lstm_layers = []
self.proj_layers = []
self.lstm_proj_layers = []
self.batch_norm_layers = []
self.proj_dim = 640
rnn_size = hidden_size
output_dim = rnn_size
self.cell = tf.compat.v1.nn.rnn_cell.LSTMCell(2048, num_proj=2048)
for i in range(self.num_layers):
name1='encoder_lstm'+str(i)
self.lstm_layers.append(tf.keras.layers.RNN(self.cell, return_sequences=True, name=name1))
def reshape_pyramidal(self, outputs):
shape = tf.shape(outputs)
batch_size, max_time = shape[0], shape[1]
num_units = outputs.get_shape().as_list()[-1]
pads = [[0, 0], [0, tf.math.floormod(max_time, 2)], [0, 0]]
outputs = tf.pad(outputs, pads)
concat_outputs = tf.reshape(outputs, (batch_size, -1, num_units * 2))
return concat_outputs
def call(self, x, xlen):
for i in range(self.num_layers):
output = self.lstm_layers[i](inputs=x)
x = output
return x, state_h
Following is the error
train_proj.py:92 train_step *
xs, _ = self.encoder(xs_1, xlen)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
/home/ubuntu/E2E-ASR/model_proj.py:84 call *
output = self.lstm_layers[i](inputs=x)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/layers/recurrent.py:623 __call__
return super(RNN, self).__call__(inputs, **kwargs)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/layers/recurrent.py:756 call
zero_output_for_mask=self.zero_output_for_mask)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/backend.py:4035 rnn
input_time_zero, tuple(initial_states) + tuple(constants))
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/keras/layers/recurrent.py:732 step
output, new_states = self.cell.call(inputs, states, **kwargs)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/ops/rnn_cell_impl.py:1028 call
array_ops.concat([inputs, m_prev], 1), self._kernel)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/util/dispatch.py:180 wrapper
return target(*args, **kwargs)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/ops/math_ops.py:2765 matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_math_ops.py:6136 mat_mul
name=name)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py:793 _apply_op_helper
op_def=op_def)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/func_graph.py:548 create_op
compute_device)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:3429 _create_op_internal
op_def=op_def)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1773 __init__
control_input_ops)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1613 _create_c_op
raise ValueError(str(e))
ValueError: Dimensions must be equal, but are 4096 and 2288 for 'transducer/rnn_model/encoder_lstm1/MatMul' (op: 'MatMul') with input shapes: [4,4096], [2288,8192].

InvalidArgumentError : ConcatOp : Dimensions of inputs should match

Tensorflow 1.7 when using dynamic_rnn.It runs fine at first , but at the 32th(it changes when i run the code) step , the error appears. When i used smaller batch , it seems the code can run longer , however the error still poped up .Just cannt figure out what's wrong.
from mapping import *
def my_input_fn(features, targets, batch_size=20, shuffle=True, num_epochs=None, sequece_lenth=None):
ds = tf.data.Dataset.from_tensor_slices(
(features, targets, sequece_lenth)) # warning: 2GB limit
ds = ds.batch(batch_size).repeat(num_epochs)
if shuffle:
ds = ds.shuffle(10000)
features, labels, sequence = ds.make_one_shot_iterator().get_next()
return features, labels, sequence
def lstm_cell(lstm_size=50):
return tf.contrib.rnn.BasicLSTMCell(lstm_size)
class RnnModel:
def __init__(self,
batch_size,
hidden_units,
time_steps,
num_features
):
self.batch_size = batch_size
self.hidden_units = hidden_units
stacked_lstm = tf.contrib.rnn.MultiRNNCell(
[lstm_cell(i) for i in self.hidden_units])
self.initial_state = stacked_lstm.zero_state(batch_size, tf.float32)
self.model = stacked_lstm
self.state = self.initial_state
self.time_steps = time_steps
self.num_features = num_features
def loss_mean_squre(self, outputs, targets):
pos = tf.add(outputs, tf.ones(self.batch_size))
eve = tf.div(pos, 2)
error = tf.subtract(eve,
targets)
return tf.reduce_mean(tf.square(error))
def train(self,
num_steps,
learningRate,
input_fn,
inputs,
targets,
sequenceLenth):
periods = 10
step_per_periods = int(num_steps / periods)
input, target, sequence = input_fn(inputs, targets, self.batch_size, shuffle=True, sequece_lenth=sequenceLenth)
initial_state = self.model.zero_state(self.batch_size, tf.float32)
outputs, state = tf.nn.dynamic_rnn(self.model, input, initial_state=initial_state)
loss = self.loss_mean_squre(tf.reshape(outputs, [self.time_steps, self.batch_size])[-1], target)
optimizer = tf.train.AdamOptimizer(learning_rate=learningRate)
grads_and_vars = optimizer.compute_gradients(loss, self.model.variables)
optimizer.apply_gradients(grads_and_vars)
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
for i in range(num_steps):
sess.run(init_op)
state2, current_loss= sess.run([state, loss])
if i % step_per_periods == 0:
print("period " + str(int(i / step_per_periods)) + ":" + str(current_loss))
return self.model, self.state
def processFeature(df):
df = df.drop('class', 1)
features = []
for i in range(len(df["vecs"])):
features.append(df["vecs"][i])
aa = pd.Series(features).tolist() # tramsform into list
featuresList = []
for i in features:
p1 = []
for k in i:
p1.append(list(k))
featuresList.append(p1)
return featuresList
def processTargets(df):
selected_features = df[
"class"]
processed_features = selected_features.copy()
return tf.convert_to_tensor(processed_features.astype(float).tolist())
if __name__ == '__main__':
dividNumber = 30
"""
some code here to modify my data to input
it looks like this:
inputs before use input function : [fullLenth, charactorLenth, embeddinglenth]
"""
model = RnnModel(15, [100, 80, 80, 1], time_steps=dividNumber, num_features=25)
model.train(5000, 0.0001, my_input_fn, training_examples, training_targets, sequenceLenth=trainSequenceL)
And error is under here
Traceback (most recent call last):
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1330, in _do_call
return fn(*args)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1315, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1423, in _call_tf_sessionrun
status, run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [20,25] vs. shape[1] = [30,100]
[[Node: rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](rnn/while/TensorArrayReadV3, rnn/while/Switch_4:1, rnn/while/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/Const)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/programming/mlwords/dnn_gragh.py", line 198, in <module>
model.train(5000, 0.0001, my_input_fn, training_examples, training_targets, sequenceLenth=trainSequenceL)
File "D:/programming/mlwords/dnn_gragh.py", line 124, in train
state2, current_loss, nowAccuracy = sess.run([state, loss, accuracy])
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 908, in run
run_metadata_ptr)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1143, in _run
feed_dict_tensor, options, run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1324, in _do_run
run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1343, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [20,25] vs. shape[1] = [30,100]
[[Node: rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](rnn/while/TensorArrayReadV3, rnn/while/Switch_4:1, rnn/while/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/Const)]]
Caused by op 'rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat', defined at:
File "D:/programming/mlwords/dnn_gragh.py", line 198, in <module>
model.train(5000, 0.0001, my_input_fn, training_examples, training_targets, sequenceLenth=trainSequenceL)
File "D:/programming/mlwords/dnn_gragh.py", line 95, in train
outputs, state = tf.nn.dynamic_rnn(self.model, input, initial_state=initial_state)#,sequence_length=sequence
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 627, in dynamic_rnn
dtype=dtype)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 824, in _dynamic_rnn_loop
swap_memory=swap_memory)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 3205, in while_loop
result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2943, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2880, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 3181, in <lambda>
body = lambda i, lv: (i + 1, orig_body(*lv))
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 795, in _time_step
(output, new_state) = call_cell()
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 781, in <lambda>
call_cell = lambda: cell(input_t, state)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 232, in __call__
return super(RNNCell, self).__call__(inputs, state)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\layers\base.py", line 714, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 1283, in call
cur_inp, new_state = cell(cur_inp, cur_state)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 339, in __call__
*args, **kwargs)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\layers\base.py", line 714, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 620, in call
array_ops.concat([inputs, h], 1), self._kernel)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\array_ops.py", line 1181, in concat
return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 1101, in concat_v2
"ConcatV2", values=values, axis=axis, name=name)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\ops.py", line 3309, in create_op
op_def=op_def)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\ops.py", line 1669, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): ConcatOp : Dimensions of inputs should match: shape[0] = [20,25] vs. shape[1] = [30,100]
[[Node: rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](rnn/while/TensorArrayReadV3, rnn/while/Switch_4:1, rnn/while/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/Const)]]
this is my code used to check my input
def checkData(inputs, targets, sequencelence):
batch_size = 20
features, target, sequece = my_input_fn(inputs, targets, batch_size=batch_size, shuffle=True, num_epochs=None,
sequece_lenth=sequencelence)
with tf.Session() as sess:
for i in range(1000):
features1, target1, sequece1 = sess.run([features, target, sequece])
assert len(features1) == batch_size
for sentence in features1 :
assert len(sentence) == 30
for word in sentence:
assert len(word) == 25
assert len(target1) == batch_size
assert len(sequece1) == batch_size
print(target1)
print("OK")
The error is coming from LSTMCell.call call method. There we are trying to tf.concat([inputs, h], 1) meaning that we want to concatenate the next input with the current hidden state before matmul'ing with the kernel variables matrix. The error is saying that you can't do it because the batch (0th) dimensions don't match up - your input is shaped [20,25] and your hidden state is shaped [30,100].
For some reason on your 32nd iteration, or whenever you see the error, the input is not batched to 30, but only to 20. This usually happens at the end of your training data when the total number of training examples does not evenly divide your batch size. This hypothesis is also consistent with "When i used smaller batch , it seems the code can run longer" statement.
I had the same issue. When I corrected the image input size to match the input shape, it ran without errors.