I am trying to save and restore an AlexNet slim model, but I always get an error when I run saver.restore(sess, tf.train.latest_checkpoint('I:/model/mnist/')).
The restore throws:
NotFoundError (see above for traceback): Key alexnet_v2/conv1/biases not found in checkpoint.
When I run tf.global_variables(), I only get the conv2d weights; there are no biases in the result.
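(As a diagnostic aside, not specific to this model: a quick way to see exactly which keys the checkpoint contains, and to compare them with the variables the current graph expects to restore, is something like the sketch below.)

import tensorflow as tf

# Every variable name and shape actually stored in the checkpoint.
ckpt = tf.train.latest_checkpoint('I:/model/mnist/')
for name, shape in tf.train.list_variables(ckpt):
    print(name, shape)

# The variables the current graph would try to restore.
for v in tf.global_variables():
    print(v.op.name, v.shape)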
I don't understand what the problem is. Here is my code.
Here is my AlexNet model:
def alexnet_v2_arg_scope(weight_decay=0.0005,
                         stddev=0.1,
                         batch_norm_var_collection='moving_vars',
                         use_fused_batchnorm=True):
    batch_norm_params = {
        # Decay for the moving averages.
        'decay': 0.9997,
        # epsilon to prevent 0s in variance.
        'epsilon': 0.001,
        # collection containing update_ops.
        'updates_collections': ops.GraphKeys.UPDATE_OPS,
        # Use fused batch norm if possible.
        'fused': use_fused_batchnorm,
        # collection containing the moving mean and moving variance.
        'variables_collections': {
            'beta': None,
            'gamma': None,
            'moving_mean': [batch_norm_var_collection],
            'moving_variance': [batch_norm_var_collection],
        }
    }
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        biases_initializer=tf.constant_initializer,
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=batch_norm_params,
                        weights_regularizer=slim.l2_regularizer(weight_decay)):
        with slim.arg_scope([slim.conv2d], padding='SAME'):
            with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
                return arg_sc
def alex_net(inputs,
             num_classes=10,
             is_training=True,
             dropout_keep_prob=0.5,
             spatial_squeeze=True,
             scope='alexnet_v2'):
    with tf.variable_scope(scope, 'alexnet_v2', [inputs]) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope([slim.conv2d],
                            weights_initializer=trunc_norm(0.1),
                            biases_initializer=tf.constant_initializer(0.1),
                            outputs_collections=end_points_collection):
            inputs = tf.reshape(inputs, [-1, 28, 28, 1])
            net = slim.conv2d(inputs, 64, [3, 3], 1, padding='VALID', scope='conv1')
            net = slim.max_pool2d(net, [2, 2], 2, scope='pool1')
            net = slim.conv2d(net, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [3, 3], 2, scope='pool2')
            # net = slim.conv2d(net, 384, [3, 3], scope='conv3')
            # net = slim.conv2d(net, 384, [3, 3], scope='conv4')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3')
            # net = slim.max_pool2d(net, [3, 3], 2, scope='pool5')
            net = slim.conv2d(net, 512, [3, 3], scope='conv4')
            with slim.arg_scope([slim.conv2d],
                                weights_initializer=trunc_norm(0.1),
                                biases_initializer=tf.constant_initializer(0.1)):
                # net = slim.conv2d(net, 1028, [6, 6], padding='VALID', scope='fc6')
                net = slim.avg_pool2d(net, [6, 6], stride=1, padding='VALID', scope='avg_pool5')
                net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6')
                # net = slim.conv2d(net, 512, [1, 1], scope='fc7')
                # net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout7')
                net = slim.conv2d(net, num_classes, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  biases_initializer=tf.zeros_initializer(),
                                  scope='fc7x')
                end_points = slim.utils.convert_collection_to_dict(end_points_collection)
                if spatial_squeeze:
                    net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
                    end_points[sc.name + '/fc8'] = net
                return net, end_points
Save the model:
variables_to_restore = slim.get_variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
saver.save(sess, 'I:/model/mnist/')
Restore the model:
with tf.Session() as sess:
    logits, _ = _alex_slim.alex_net(teX[:200])
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('I:/model/mnist/'))
    logits_var = sess.run(logits)
    print(logits_var)
Perhaps TensorFlow expects the variable to be saved in the checkpoint because you put it in a scope.
You will get the same error if you give a name to a tensor that isn't being saved.
If you don't want to save it, don't give it a name or scope.
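Another thing worth checking (an observation, not a confirmed diagnosis): when alex_net is built inside alexnet_v2_arg_scope, normalizer_fn=slim.batch_norm makes slim.conv2d skip its bias variables, whereas calling alex_net directly (as in the restore snippet above) does create biases, so the two graphs do not hold the same variable set. Building the restore-time graph the same way as the save-time graph keeps save and restore symmetric. A minimal sketch, assuming both functions live in the _alex_slim module used above and that training was done under that arg_scope:

with slim.arg_scope(_alex_slim.alexnet_v2_arg_scope()):
    logits, _ = _alex_slim.alex_net(teX[:200], is_training=False)

saver = tf.train.Saver(slim.get_variables_to_restore())
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('I:/model/mnist/'))
    print(sess.run(logits))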
Related
I have implemented roi_pooling from this library.
After running roi_pooling as
conv5_3 = net.end_points['conv5_3']
# implement ROI Pooling
input_boxes = tf.dtypes.cast(input_boxes, tf.int32)
pooled_features = roi_pooling(conv5_3, input_boxes, pool_height=5, pool_width=30)
then pooled_features has unknown shape.
(Pdb) p pooled_features
<tf.Tensor 'RoiPooling:0' shape=<unknown> dtype=float32>
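(As an aside: if the only issue is the missing static shape, it can be pinned manually, since the pool size is fixed and the channel depth is known. A small sketch, assuming conv5_3 has 512 channels; adjust to your network:)

# roi_pooling returns a tensor with a fully unknown static shape, so re-attach
# the dimensions we do know: pool_height x pool_width x channels.
num_channels = conv5_3.get_shape().as_list()[-1]  # e.g. 512 for VGG conv5_3
pooled_features.set_shape([None, 5, 30, num_channels])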
My network still needs to pass through another recognition net after roi_pooling.
The recognition net is:
def recognitionnet(inputs, fatness=64, dilation=True):
    """
    backbone net of vgg16
    """
    # End_points collect relevant activations for external use.
    end_points = {}
    # Original VGG-16 blocks.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='SAME'):
        # Block 1.
        net = slim.repeat(inputs, 2, slim.conv2d, fatness, [3, 3], scope='conv1')
        end_points['conv1_2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        end_points['pool1'] = net
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, fatness * 2, [3, 3], scope='conv2')
        end_points['conv2_2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        end_points['pool2'] = net
        # fc7 as conv
        net = slim.conv2d(net, fatness * 16, [1, 1], scope='fc7')
        end_points['fc7'] = net
        # model_summary()
        # from keras.utils.visualize_util import plot
        # plot(model, to_file='model.png')
    return net, end_points
But when pooled_features passes through the first layer of recognitionnet, I get an error like
TypeError: TypeErro...neType',)
What could be wrong when pooled_features is passed through net = slim.repeat(inputs, 2, slim.conv2d, fatness, [3, 3], scope='conv1')?
How can I solve the issue?
What I did was split the model into two graphs.
The first graph has the roi_pooling implementation and the second graph has recognitionnet.
Since recognitionnet is in a separate graph, we can use a placeholder as the input to the recognitionnet graph.
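A minimal sketch of that split, assuming the recognitionnet function above, pool_height=5, pool_width=30 and 512 channels (the graph and placeholder names are illustrative):

import numpy as np
import tensorflow as tf

# Second graph: the recognition net takes a placeholder instead of the
# roi_pooling output, so its input shape can be stated explicitly.
recog_graph = tf.Graph()
with recog_graph.as_default():
    roi_input = tf.placeholder(tf.float32, shape=[None, 5, 30, 512], name='roi_input')
    recog_net, recog_end_points = recognitionnet(roi_input)
    init_op = tf.global_variables_initializer()

# At run time, evaluate pooled_features in the first graph, then feed the
# resulting numpy array into the second graph (dummy data shown here).
features = np.zeros((4, 5, 30, 512), dtype=np.float32)
with tf.Session(graph=recog_graph) as sess:
    sess.run(init_op)
    recog_out = sess.run(recog_net, feed_dict={roi_input: features})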
I defined a model function named "drrn_model". While training my model, I build it with:
shared_model = tf.make_template('shared_model', drrn_model)
train_output = shared_model(train_input, is_training=True)
Training proceeds step by step, and I can restore a .ckpt file into the model when I want to continue training from an old point.
But there is a problem when I test my trained model.
I use the code below directly without using tf.make_template:
train_output = drrn_model(train_input, is_training=False)
Then the terminal gave me lots of NotFoundError messages like "Key LastLayer/Variable_2 not found in checkpoint".
But when I use
shared_model = tf.make_template('shared_model', drrn_model)
output_tensor = shared_model(input_tensor, is_training=False)
it tests normally.
So why must we use tf.make_template() again in the testing stage? What is the difference between calling drrn_model directly and building it with make_template when we construct the model?
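As far as I can tell, the key point is that make_template creates all of its variables under the template's variable scope, so the checkpoint keys carry the 'shared_model/' prefix; calling drrn_model directly creates variables under different names, which is exactly what the NotFoundError complains about. A small sketch to compare the two (tiny_model and its names are purely illustrative):

import tensorflow as tf

def tiny_model(x):
    with tf.variable_scope("LastLayer"):
        w = tf.get_variable("conv_w", [1])
    return x * w

# Built through a template: variables live under the template's scope.
tmpl = tf.make_template('shared_model', tiny_model)
_ = tmpl(tf.ones([1]))
print([v.op.name for v in tf.global_variables()])
# -> ['shared_model/LastLayer/conv_w']

# Built by calling the function directly in a fresh graph: no prefix.
with tf.Graph().as_default():
    _ = tiny_model(tf.ones([1]))
    print([v.op.name for v in tf.global_variables()])
    # -> ['LastLayer/conv_w']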
And there is another question about the BN layer in TensorFlow.
I have tried many ways, but the output is always wrong (always worse than the version without the BN layer).
Here is my newest version of the model with a BN layer:
tensor = None

def drrn_model(input_tensor, is_training):
    with tf.device("/gpu:0"):
        with tf.variable_scope("FirstLayer"):
            conv_0_w = tf.get_variable("conv_w", [3, 3, 1, 128], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
            tensor = tf.nn.conv2d(tf.nn.relu(batchnorm(input_tensor, is_training=is_training)), conv_0_w, strides=[1, 1, 1, 1], padding="SAME")
        first_layer = tensor

        ### recursion ###
        with tf.variable_scope("recycle", reuse=False):
            tensor = drrnblock(first_layer, tensor, is_training)
        for i in range(1, 10):
            with tf.variable_scope("recycle", reuse=True):
                tensor = drrnblock(first_layer, tensor, is_training)

        ### end layer ###
        with tf.variable_scope("LastLayer"):
            conv_end_w = tf.get_variable("conv_w", [3, 3, 128, 1], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
            conv_end_layer = tf.nn.conv2d(tf.nn.relu(batchnorm(tensor, is_training=is_training)), conv_end_w, strides=[1, 1, 1, 1], padding='SAME')
        tensor = tf.add(input_tensor, conv_end_layer)
        return tensor

def drrnblock(first_layer, input_layer, is_training):
    conv1_w = tf.get_variable("conv1__w", [3, 3, 128, 128], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
    conv1_layer = tf.nn.conv2d(tf.nn.relu(batchnorm(input_layer, is_training=is_training)), conv1_w, strides=[1, 1, 1, 1], padding="SAME")
    conv2_w = tf.get_variable("conv2__w", [3, 3, 128, 128], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
    conv2_layer = tf.nn.conv2d(tf.nn.relu(batchnorm(conv1_layer, is_training=is_training)), conv2_w, strides=[1, 1, 1, 1], padding="SAME")
    tensor = tf.add(first_layer, conv2_layer)
    return tensor

def batchnorm(inputs, is_training, decay=0.999):  # there is my BN layer
    scale = tf.Variable(tf.ones([inputs.get_shape()[-1]]))
    beta = tf.Variable(tf.zeros([inputs.get_shape()[-1]]))
    pop_mean = tf.Variable(tf.zeros([inputs.get_shape()[-1]]), trainable=False)
    pop_var = tf.Variable(tf.ones([inputs.get_shape()[-1]]), trainable=False)
    if is_training:
        batch_mean, batch_var = tf.nn.moments(inputs, [0, 1, 2])
        print("batch_mean.shape: ", batch_mean.shape)
        train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
        train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
        with tf.control_dependencies([train_mean, train_var]):
            return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, scale, variance_epsilon=1e-3)
    else:
        return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, scale, variance_epsilon=1e-3)
Please tell me what is wrong in my code.
Thanks a lot!!
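One thing that stands out (an observation and a sketch, not a definitive fix): batchnorm builds its parameters with tf.Variable, so a fresh set of scale/beta/moving statistics is created on every call and none of them are shared or reused by the reuse=True variable scopes, and the Python `if is_training:` is fixed at graph-construction time. A variant built on tf.get_variable respects scope reuse and gives the variables stable names for saving and restoring; each call site would need its own name (e.g. name="bn1", name="bn2"):

def batchnorm(inputs, is_training, decay=0.999, name="bn"):
    depth = inputs.get_shape().as_list()[-1]
    with tf.variable_scope(name):
        scale = tf.get_variable("scale", [depth], initializer=tf.ones_initializer())
        beta = tf.get_variable("beta", [depth], initializer=tf.zeros_initializer())
        pop_mean = tf.get_variable("pop_mean", [depth],
                                   initializer=tf.zeros_initializer(), trainable=False)
        pop_var = tf.get_variable("pop_var", [depth],
                                  initializer=tf.ones_initializer(), trainable=False)
    if is_training:
        batch_mean, batch_var = tf.nn.moments(inputs, [0, 1, 2])
        train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
        train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
        # Update the moving statistics whenever the normalized output is used.
        with tf.control_dependencies([train_mean, train_var]):
            return tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                             beta, scale, variance_epsilon=1e-3)
    else:
        return tf.nn.batch_normalization(inputs, pop_mean, pop_var,
                                         beta, scale, variance_epsilon=1e-3)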
I'm trying to set up CIFAR10's tf-slim model to accept input with dynamic batch, height and width and a single channel, i.e. monochromatic images of different sizes. Given that all dimensions except the channel size are dynamic, the output shape of slim.flatten is (?, ?). Is there any way to circumvent this? I'm trying to adapt CIFAR10 to TF's DeepDream tutorial, which uses InceptionV3 with an unspecified input shape.
I'm assuming this happens because CIFAR10 is not fully convolutional.
import tensorflow as tf
slim = tf.contrib.slim

images = tf.placeholder(tf.float32, shape=(None, None, None, 1))
NUM_CLASSES = 18
scope = 'CifarNet'
with tf.variable_scope(scope, 'CifarNet', [images, NUM_CLASSES]):
    net = slim.conv2d(images, 64, [5, 5], scope='conv1')
    net = slim.max_pool2d(net, [2, 2], 2, scope='pool1')
    net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')
    net = slim.conv2d(net, 64, [5, 5], scope='conv2')
    net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')
    net = slim.max_pool2d(net, [2, 2], 2, scope='pool2')
    net = slim.flatten(net)
    net = slim.fully_connected(net, 384, scope='fc3')
ValueError: The last dimension of the inputs to Dense should be defined. Found None.
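That assumption is on the right track: slim.flatten followed by slim.fully_connected needs a static feature size, which only exists when height and width are fixed. If the goal is DeepDream-style feeding of arbitrary image sizes, one workaround (a sketch, not the official CifarNet definition) is to keep the head fully convolutional, replacing the last two lines above (the flatten and fc3) with 1x1 convolutions and a global average pool:

    net = slim.conv2d(net, 384, [1, 1], scope='fc3')   # stands in for the fc3 dense layer
    net = slim.conv2d(net, 192, [1, 1], scope='fc4')   # stands in for fc4
    net = slim.conv2d(net, NUM_CLASSES, [1, 1], activation_fn=None, scope='logits')
    # Collapse the remaining (unknown) spatial dimensions.
    logits = tf.reduce_mean(net, axis=[1, 2], name='global_pool')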
In the API of tf.contrib.rnn.DropoutWrapper, I am trying to set variational_recurrent=True, in which case input_size is mandatory. As explained in the docs, input_size is a TensorShape object containing the depth(s) of the input tensors.
"depth(s)" is confusing. What is it, exactly? Is it just the shape of the tensor as we can get from tf.shape()? Or the number of channels in the special case of images? But my input tensor is not an image.
And I don't understand why dtype is required when variational_recurrent=True.
Thanks!
input_size for tf.TensorShape([200, None, 300]) is just 300, i.e. the depth (last dimension) of each input tensor. As for dtype: as far as I can tell, with variational_recurrent=True the same dropout masks are reused at every time step, so the wrapper has to build those noise tensors when it is constructed, which is why it needs the input depth and dtype up front.
Play with this example.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see TF issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import tensorflow as tf
import numpy as np

n_steps = 2
n_inputs = 3
n_neurons = 5
keep_prob = 0.5
learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
X_seqs = tf.unstack(tf.transpose(X, perm=[1, 0, 2]))

basic_cell = tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)
basic_cell_drop = tf.contrib.rnn.DropoutWrapper(
    basic_cell,
    input_keep_prob=keep_prob,
    variational_recurrent=True,
    dtype=tf.float32,
    input_size=n_inputs)
output_seqs, states = tf.contrib.rnn.static_rnn(
    basic_cell_drop,
    X_seqs,
    dtype=tf.float32)
outputs = tf.transpose(tf.stack(output_seqs), perm=[1, 0, 2])

init = tf.global_variables_initializer()

X_batch = np.array([
    # t = 0      t = 1
    [[0, 1, 2], [9, 8, 7]],  # instance 1
    [[3, 4, 5], [0, 0, 0]],  # instance 2
    [[6, 7, 8], [6, 5, 4]],  # instance 3
    [[9, 0, 1], [3, 2, 1]],  # instance 4
])

with tf.Session() as sess:
    init.run()
    outputs_val = outputs.eval(feed_dict={X: X_batch})
    print(outputs_val)
See this for more details: https://github.com/tensorflow/tensorflow/issues/7927
The Problem
When I run my training, my preprocessing examples are created successfully, but the training itself does not start. Much weirder is the fact that, on analyzing my TensorBoard graph, I see some extra conditional nodes which do not exist in the code. I want to know where these extra nodes come from, why they appear, and exactly why the training does not begin. Below is a systematic description of the situation:
TensorFlow Graph
The following TensorBoard diagram shows my graph:
The code which constructs this graph is below:
def getconv2drelu(inputtensor, kernelsize, strides, padding, convname,
                  imagesummaries=False):
    weights = tf.get_variable("weights", shape=kernelsize, dtype=tf.float32,
                              initializer=tf.truncated_normal_initializer(0, 0.01),
                              regularizer=tf.nn.l2_loss)
    biases = tf.get_variable("biases", shape=kernelsize[3], dtype=tf.float32,
                             initializer=tf.constant_initializer(0.0))
    conv = tf.nn.conv2d(input=inputtensor, filter=weights, strides=strides,
                        padding=padding, name=convname)
    response = tf.nn.bias_add(conv, biases)
    if imagesummaries:
        filters = (weights - tf.reduce_min(weights)) / (tf.reduce_max(weights) - tf.reduce_min(weights))
        filters = tf.transpose(filters, [3, 0, 1, 2])
        tf.summary.image(convname + " filters", filters,
                         max_outputs=kernelsize[3])
    response = tf.nn.relu(response)
    activation_summary(response)
    return response

def getfullyconnected(inputtensor, numinput, numoutput):
    weights = tf.get_variable("weights", shape=[numinput, numoutput],
                              dtype=tf.float32,
                              initializer=tf.truncated_normal_initializer(0, 0.01))
    biases = tf.get_variable("biases", shape=[numoutput], dtype=tf.float32,
                             initializer=tf.truncated_normal_initializer(0, 0.01))
    response = tf.add(tf.matmul(inputtensor, weights), biases)
    response = tf.nn.relu(response)
    activation_summary(response)
    return response
def inference(inputs):
    with tf.variable_scope("layer1"):
        conv = getconv2drelu(inputtensor=inputs, kernelsize=[7, 7, 3, 96],
                             strides=[1, 2, 2, 1], padding="VALID",
                             convname="conv1", imagesummaries=True)
        pool = tf.nn.max_pool(conv, [1, 3, 3, 1], strides=[1, 3, 3, 1],
                              padding="SAME", name="pool1")
    with tf.variable_scope("layer2"):
        conv = getconv2drelu(inputtensor=pool, kernelsize=[7, 7, 96, 256],
                             strides=[1, 1, 1, 1], padding="VALID",
                             convname="conv2", imagesummaries=False)
        pool = tf.nn.max_pool(conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                              padding="SAME", name="pool2")
    with tf.variable_scope("layer3"):
        conv = getconv2drelu(inputtensor=pool, kernelsize=[7, 7, 256, 512],
                             strides=[1, 1, 1, 1], padding="SAME",
                             convname="conv3", imagesummaries=False)
    with tf.variable_scope("layer4"):
        conv = getconv2drelu(inputtensor=conv, kernelsize=[3, 3, 512, 512],
                             strides=[1, 1, 1, 1], padding="SAME",
                             convname="conv4", imagesummaries=False)
    with tf.variable_scope("layer5"):
        conv = getconv2drelu(inputtensor=conv, kernelsize=[3, 3, 512, 1024],
                             strides=[1, 1, 1, 1], padding="SAME",
                             convname="conv5", imagesummaries=False)
    with tf.variable_scope("layer6"):
        conv = getconv2drelu(inputtensor=conv, kernelsize=[3, 3, 1024, 1024],
                             strides=[1, 1, 1, 1], padding="SAME",
                             convname="conv6", imagesummaries=False)
        pool = tf.nn.max_pool(conv, [1, 3, 3, 1], strides=[1, 3, 3, 1],
                              padding="SAME", name="pool1")
        pool = tf.contrib.layers.flatten(pool)
    with tf.variable_scope("fc1"):
        fc = getfullyconnected(pool, 5 * 5 * 1024, 4096)
        drop = tf.nn.dropout(fc, keep_prob=0.5)
    with tf.variable_scope("fc2"):
        fc = getfullyconnected(drop, 4096, 4096)
        drop = tf.nn.dropout(fc, keep_prob=0.5)
    with tf.variable_scope("fc3"):
        logits = getfullyconnected(drop, 4096, 1000)
    return logits
The complete TensorBoard graph is shown below:
The figure is too small, but you can see a series of pink nodes to the left. An expanded version of such a segment is shown below:
The expansion of one of the condition blocks (all blocks are similar!) is shown below:
I am unable to understand the presence of these extra condition blocks. All the images fed to the graph are of size [221, 221, 3].
You can also see that inside a condition block there is an isVariableInitialized test. I initialize my variables right after launching the session, so I do not understand why these checks are performed. I have figured out that these condition blocks are there due to the use of tf.get_variable(), which checks whether the variable has been initialized. Do they cause any performance difference?
Another observation
When I decrease the batch size, the size of my TensorBoard file also decreases, but the nodes shown on the graph remain the same. Why is this so?
My training code is as follows:
with tf.control_dependencies(putops):
    train_op = tf.group(apply_gradient_op, variables_averages_op)
sess.run(train_op)  # tf.Session() has been defined before sess.run()
And putops is initialized to [] and, during graph construction, it is populated for each GPU as follows:
# cpu_compute_stage is appended only once since it corresponds to centralized preprocessing
cpu_compute_stage = data_flow_ops.StagingArea(
    [tf.float32, tf.int32],
    shapes=[images_shape, labels_shape]
)
cpu_compute_stage_op = gpu_copy_stage.put(
    [host_images, host_labels])
putops.append(gpu_copy_stage_op)

# For each device, putops is further appended with a gpu_compute_stage op, one per GPU, since a CPU-GPU copy has to take place
with tf.device('/gpu:%d' % i):
    with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
        gpu_compute_stage = data_flow_ops.StagingArea(
            [tf.float32, tf.int32],
            shapes=[images_shape, labels_shape]
        )
        gpu_compute_stage_op = gpu_compute_stage.put(
            [host_images, host_labels]
        )
        putops.append(gpu_compute_stage_op)
However, my code does not run despite the fact that I do initialize both global and local variables.
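One thing worth checking (a guess based on how StagingArea is usually driven, not a confirmed diagnosis): a staging area's get/unstage op blocks until something has been put into it, so if the put ops only ever run as control dependencies of train_op, the very first sess.run(train_op) can sit waiting for data that has not been staged yet. The usual pattern is to run the put ops on their own once or twice to prime the pipeline before entering the training loop; a sketch using the names from the code above (num_steps is illustrative):

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

# Prime the staging areas so the first get() has a batch to consume.
sess.run(putops)

# Main loop: each step consumes one staged batch and stages the next one.
for step in range(num_steps):
    sess.run(train_op)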