Why can't this model overfit one example? - tensorflow

I am practicing Conv1D on TensorFlow 2.7, and I am sanity-checking a decoder I developed by testing whether it can overfit one example. The model doesn't learn when trained on only one example and can't overfit it. I want to understand this strange behavior, please. This is the link to the notebook on Colab: Notebook.
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, Dense, BatchNormalization
from tensorflow.keras.layers import ReLU, MaxPool1D, GlobalMaxPool1D
from tensorflow.keras import Model
import numpy as np
def Decoder():
    inputs = Input(shape=(68, 3), name='Input_Tensor')
    # First hidden layer
    conv1 = Conv1D(filters=64, kernel_size=1, name='Conv1D_1')(inputs)
    bn1 = BatchNormalization(name='BN_1')(conv1)
    relu1 = ReLU(name='ReLU_1')(bn1)
    # Second hidden layer
    conv2 = Conv1D(filters=64, kernel_size=1, name='Conv1D_2')(relu1)
    bn2 = BatchNormalization(name='BN_2')(conv2)
    relu2 = ReLU(name='ReLU_2')(bn2)
    # Third hidden layer
    conv3 = Conv1D(filters=64, kernel_size=1, name='Conv1D_3')(relu2)
    bn3 = BatchNormalization(name='BN_3')(conv3)
    relu3 = ReLU(name='ReLU_3')(bn3)
    # Fourth hidden layer
    conv4 = Conv1D(filters=128, kernel_size=1, name='Conv1D_4')(relu3)
    bn4 = BatchNormalization(name='BN_4')(conv4)
    relu4 = ReLU(name='ReLU_4')(bn4)
    # Fifth hidden layer
    conv5 = Conv1D(filters=1024, kernel_size=1, name='Conv1D_5')(relu4)
    bn5 = BatchNormalization(name='BN_5')(conv5)
    relu5 = ReLU(name='ReLU_5')(bn5)
    global_features = GlobalMaxPool1D(name='GlobalMaxPool1D')(relu5)
    global_features = tf.keras.layers.Reshape((1, -1))(global_features)
    conv6 = Conv1D(filters=12, kernel_size=1, name='Conv1D_6')(global_features)
    bn6 = BatchNormalization(name='BN_6')(conv6)
    outputs = ReLU(name='ReLU_6')(bn6)
    model = Model(inputs=[inputs], outputs=[outputs], name='Decoder')
    return model
model = Decoder()
model.summary()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
losses = tf.keras.losses.MeanSquaredError()
model.compile(optimizer=optimizer, loss=losses)
n = 1
X = np.random.rand(n, 68, 3)
y = np.random.rand(n, 1, 12)
model.fit(x=X, y=y, verbose=1, epochs=30)

I think the problem here is that you have no basis to learn anything, so you can't overfit. In every epoch you have just one example that is used to adapt the weights of the network, so there is not enough time to adapt the weights for overfitting.
To get the overfitting you want, include the same data multiple times in your training dataset, so the weights can change enough to overfit, since you only change them one small step per epoch.
A deeper look into back propagation might help you get a better understanding of the concept. Click
I took the liberty of adapting your notebook and enhanced the dataset as follows:
n = 1
X = np.random.rand(n, 68, 3)
y = np.random.rand(n, 1, 12)
for i in range(10):
    X = np.append(X, X, axis=0)
    y = np.append(y, y, axis=0)
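The doubling loop above leaves 1024 identical copies of the one example. As an aside (a hedged alternative, not from the original answer), the same enlargement can be written in a single call with np.repeat:
import numpy as np

n = 1
X = np.random.rand(n, 68, 3)
y = np.random.rand(n, 1, 12)

# Tile the single example 1024 times, matching the ten doublings above.
X = np.repeat(X, 1024, axis=0)
y = np.repeat(y, 1024, axis=0)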
And the output would be: (training-log screenshot not reproduced here)

Related

mlflow ui doesn't show logged runs of my keras model

I created a sample Keras model and use mlflow.tensorflow.autolog() to track it. However, the logged runs do not appear in the mlflow ui.
mlflow.tensorflow.autolog()
#setting hyper parameters
batch_size = 10
epochs = 100
optimizer = 'adam'
loss = 'binary_crossentropy'
def create_classifier():
    classifier = tf.keras.Sequential()
    classifier.add(tf.keras.layers.Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=12))
    classifier.add(tf.keras.layers.Dense(units=6, kernel_initializer='uniform', activation='relu'))
    classifier.add(tf.keras.layers.Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    classifier.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return classifier
classifier = create_classifier()
classifier.summary()
classifier.fit(X_train.to_numpy(), y_train.to_numpy(), batch_size=batch_size, epochs=epochs, verbose=1)
score, acc = classifier.evaluate(X_train.to_numpy(), y_train.to_numpy(), batch_size=batch_size)
y_pred = classifier.predict(X_test.to_numpy())
y_pred = (y_pred > 0.5)
print('*' * 20)
score, acc = classifier.evaluate(X_test.to_numpy(), y_test.to_numpy(), batch_size=batch_size)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
I got a warning when I call autolog, and the usual training output when I fit the model with the training data (both were screenshots, not reproduced here).
What do I need to do in order to make the run available in mlflow ui?
Note: the MLflow docs state that it is only compatible with tensorflow >= 2.3, and my tensorflow is 2.10.1.
Update:
I found there are two mlruns directories in my project: one located where my Python code is, "ml-flow-project-example2/model/mlruns" (as you can see in the screen clip), and another at "ml-flow-project-example2/venv/Scripts/mlruns". The mlruns directory in venv is empty, but it is where mlflow.exe is located. If I move the directory "7c10db034cdd47dfbba12885da25ff0f" from /model/mlruns to venv/Scripts/mlruns, the run appears in the mlflow ui.
Is there any way to point the mlflow ui at the correct mlruns directory?
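A minimal sketch of the usual approach (an assumption based on standard MLflow usage, not from the original post) is to pin the tracking URI to one absolute file store and launch the UI against that same store. The path below is hypothetical:
import mlflow

# Hypothetical absolute path; replace with the real location of model/mlruns.
mlflow.set_tracking_uri("file:///path/to/ml-flow-project-example2/model/mlruns")
mlflow.tensorflow.autolog()
Then, from any working directory: mlflow ui --backend-store-uri file:///path/to/ml-flow-project-example2/model/mlruns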

How to solve "KeyError: '/conv2d_1/kernel:0'"

I am trying to use Colab to run the gym package with Pacman, since the specs in Colab are more powerful than my notebook. This program simulates successfully in Jupyter on my notebook, which uses tensorflow 1.14. However, errors keep appearing when I run it in Google Colab, even after I debugged and changed parts of the code so that it can be used with TensorFlow 2.0. Below is my code:
#First we import all the necessary libraries
import numpy as np
import gym
import tensorflow as tf
from tensorflow import keras
from keras.layers import Flatten, Conv2D, Dense
#from tensorflow.contrib.layers import Flatten, conv2d, Dense
from collections import deque, Counter
import random
from datetime import datetime
#Now we define a function called preprocess_observation for preprocessing our input game screen.
#We reduce the image size and convert the image into greyscale.
color = np.array([210, 164, 74]).mean()
def preprocess_observation(obs):
    # Crop and resize the image
    img = obs[1:176:2, ::2]
    # Convert the image to greyscale
    img = img.mean(axis=2)
    # Improve image contrast
    img[img == color] = 0
    # Next we normalize the image from -1 to +1
    img = (img - 128) / 128 - 1
    return img.reshape(88, 80, 1)
#Let us initialize our gym environment
env = gym.make('MsPacman-v0')
n_outputs = env.action_space.n
print(n_outputs)
print(env.env.get_action_meanings())
observation = env.reset()
import tensorflow as tf
import matplotlib.pyplot as plt
for i in range(22):
    if i > 20:
        plt.imshow(observation)
        plt.show()
    observation, _, _, _ = env.step(1)
#Okay, Now we define a function called q_network for building our Q network. We input the game state to the Q network
#and get the Q values for all the actions in that state.
#We build Q network with three convolutional layers with same padding followed by a fully connected layer.
tf.compat.v1.reset_default_graph()
def q_network(X, name_scope):
    # Initialize layers
    initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=2.0)
    with tf.compat.v1.variable_scope(name_scope) as scope:
        # initialize the convolutional layers
        #layer_1 = tf.keras.layers.Conv2D(X, 32, kernel_size=(8,8), stride=4, padding='SAME', weights_initializer=initializer)
        layer_1_set = Conv2D(32, (8, 8), strides=4, padding="SAME", kernel_initializer=initializer)
        layer_1 = layer_1_set(X)
        tf.compat.v1.summary.histogram('layer_1', layer_1)
        #layer_2 = tf.keras.layers.Conv2D(layer_1, 64, kernel_size=(4,4), stride=2, padding='SAME', weights_initializer=initializer)
        layer_2_set = Conv2D(64, (4, 4), strides=2, padding="SAME", kernel_initializer=initializer)
        layer_2 = layer_2_set(layer_1)
        tf.compat.v1.summary.histogram('layer_2', layer_2)
        #layer_3 = tf.keras.layers.Conv2D(layer_2, 64, kernel_size=(3,3), stride=1, padding='SAME', weights_initializer=initializer)
        layer_3_set = Conv2D(64, (3, 3), strides=1, padding="SAME", kernel_initializer=initializer)
        layer_3 = layer_3_set(layer_2)
        tf.compat.v1.summary.histogram('layer_3', layer_3)
        flatten_layer = Flatten()  # instantiate the layer
        flat = flatten_layer(layer_3)
        fc_set = Dense(128, kernel_initializer=initializer)
        fc = fc_set(flat)
        tf.compat.v1.summary.histogram('fc', fc)
        # Add final output layer
        output_set = Dense(n_outputs, activation=None, kernel_initializer=initializer)
        output = output_set(fc)
        tf.compat.v1.summary.histogram('output', output)
        vars = {v.name[len(scope.name):]: v for v in tf.compat.v1.get_collection(key=tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}
        # Return both variables and outputs together
        return vars, output
#Next we define a function called epsilon_greedy for performing epsilon greedy policy. In epsilon greedy policy we either select the best action
#with probability 1 - epsilon or a random action with probability epsilon.
#We use decaying epsilon greedy policy where value of epsilon will be decaying over time
#as we don't want to explore forever. So over time our policy will be exploiting only good actions.
epsilon = 0.5
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 500000
def epsilon_greedy(action, step):
    p = np.random.random(1).squeeze()
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        return action
#Now, we initialize our experience replay buffer of length 20000 which holds the experience.
#We store all the agent's experience i.e. (state, action, rewards) in the
#experience replay buffer, and we sample minibatches of experience from it for training the network.
buffer_len = 20000
exp_buffer = deque(maxlen=buffer_len)
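For reference, sampling a minibatch from such a deque is usually done along these lines (a sketch with an assumed field order per buffer entry, not code from the post):
import random

def sample_memories(batch_size):
    # Each buffer entry is assumed to be (state, action, next_state, reward, done).
    batch = random.sample(exp_buffer, batch_size)
    return [np.array(field) for field in zip(*batch)]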
# Now we define our network hyperparameters,
num_episodes = 800
batch_size = 48
input_shape = (None, 88, 80, 1)
learning_rate = 0.001
X_shape = (None, 88, 80, 1)
discount_factor = 0.97
global_step = 0
copy_steps = 100
steps_train = 4
start_steps = 2000
logdir = 'logs'
tf.compat.v1.reset_default_graph()
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
# Now we define the placeholder for our input i.e game state
X = tf.placeholder(tf.float32, shape=X_shape)
#X = tf.Variable(tf.float32, tf.ones(shape=X_shape))
# we define a boolean called in_training_model to toggle the training
in_training_mode = tf.placeholder(tf.bool)
# we build our Q network, which takes the input X and generates Q values for all the actions in the state
mainQ, mainQ_outputs = q_network(X, 'mainQ')
# similarly we build our target Q network, for policy evaluation
targetQ, targetQ_outputs = q_network(X, 'targetQ')
# define the placeholder for our action values
X_action = tf.placeholder(tf.int32, shape=(None,))
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keepdims=True)
#Copy the primary Q network parameters to the target Q network
copy_op = [tf.compat.v1.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)
#Compute and optimize loss using gradient descent optimizer
# define a placeholder for our output i.e action
y = tf.placeholder(tf.float32, shape=(None,1))
# now we calculate the loss which is the difference between actual value and predicted value
loss = tf.reduce_mean(tf.square(y - Q_action))
# we use adam optimizer for minimizing the loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
OK, up to here things work; the error comes out when I run this cell in Colab:
#Copy the primary Q network parameters to the target Q network
copy_op = [tf.compat.v1.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)
The error gives:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-13-58715282cea8> in <module>()
----> 1 copy_op = [tf.compat.v1.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
2 copy_target_to_main = tf.group(*copy_op)
<ipython-input-13-58715282cea8> in <listcomp>(.0)
----> 1 copy_op = [tf.compat.v1.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
2 copy_target_to_main = tf.group(*copy_op)
KeyError: '/conv2d_1/kernel:0'
I have two questions.
First, how do I solve the error stated above?
Second, in TensorFlow 2.0 and above, the placeholder command is replaced by tf.Variable, so I rewrote
X = tf.placeholder(tf.float32, shape=X_shape)
to become
X = tf.Variable(tf.float32, tf.ones(shape=X_shape))
but I still get an error, so I have to use the commands below:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
X = tf.placeholder(tf.float32, shape=X_shape)
but I get a warning like this:
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating: non-resource variables are not supported in the long term
I have searched Stack Overflow intensively by keyword, yet I can't find a solution. Really looking forward to any advice. Thank you very much.
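On the second question, a minimal TF2-style sketch (an assumption about the idiomatic replacement, not code from the post): in eager mode there is no placeholder; the batch is passed directly as a tensor argument.
import numpy as np
import tensorflow as tf

def q_values(model, batch):
    # `model` is assumed to be a Keras model built from the layers above;
    # the concrete batch replaces the placeholder X.
    return model(tf.convert_to_tensor(batch, dtype=tf.float32))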

Translating Pytorch program into Keras: different results

I have translated a pytorch program into keras.
A working Pytorch program:
import numpy as np
import cv2
import torch
import torch.nn as nn
from skimage import segmentation
np.random.seed(1)
torch.manual_seed(1)
fi = "in.jpg"
class MyNet(nn.Module):
    def __init__(self, n_inChannel, n_outChannel):
        super(MyNet, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv2d(n_inChannel, n_outChannel, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(n_outChannel),
            nn.Conv2d(n_outChannel, n_outChannel, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(n_outChannel),
            nn.Conv2d(n_outChannel, n_outChannel, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(n_outChannel)
        )

    def forward(self, x):
        return self.seq(x)
im = cv2.imread(fi)
data = torch.from_numpy(np.array([im.transpose((2, 0, 1)).astype('float32')/255.]))
data = data.cuda()
labels = segmentation.slic(im, compactness=100, n_segments=10000)
labels = labels.flatten()
u_labels = np.unique(labels)
label_indexes = np.array([np.where(labels == u_label)[0] for u_label in u_labels])
n_inChannel = 3
n_outChannel = 100
model = MyNet(n_inChannel, n_outChannel)
model.cuda()
model.train()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
label_colours = np.random.randint(255,size=(100,3))
for batch_idx in range(100):
    optimizer.zero_grad()
    output = model(data)[0]
    output = output.permute(1, 2, 0).view(-1, n_outChannel)
    ignore, target = torch.max(output, 1)
    im_target = target.data.cpu().numpy()
    nLabels = len(np.unique(im_target))
    im_target_rgb = np.array([label_colours[c % 100] for c in im_target])  # correct position of "im_target"
    im_target_rgb = im_target_rgb.reshape(im.shape).astype(np.uint8)
    for inds in label_indexes:
        u_labels_, hist = np.unique(im_target[inds], return_counts=True)
        im_target[inds] = u_labels_[np.argmax(hist, 0)]
    target = torch.from_numpy(im_target)
    target = target.cuda()
    loss = loss_fn(output, target)
    loss.backward()
    optimizer.step()
    print(batch_idx, '/', 100, ':', nLabels, loss.item())
    if nLabels <= 3:
        break
fo = "out.jpg"
cv2.imwrite(fo, im_target_rgb)
(source: https://github.com/kanezaki/pytorch-unsupervised-segmentation/blob/master/demo.py)
My translation into Keras:
import cv2
import numpy as np
from skimage import segmentation
from keras.layers import Conv2D, BatchNormalization, Input, Reshape
from keras.models import Model
import keras.backend as k
from keras.optimizers import SGD, Adam
from skimage.util import img_as_float
from skimage import io
from keras.models import Sequential
np.random.seed(0)
fi = "in.jpg"
im = cv2.imread(fi).astype(float)/255.
labels = segmentation.slic(im, compactness=100, n_segments=10000)
labels = labels.flatten()
print (labels.shape)
u_labels = np.unique(labels)
label_indexes = [np.where(labels == u_label)[0] for u_label in np.unique(labels)]
n_channels = 100
model = Sequential()
model.add(Conv2D(n_channels, kernel_size=3, activation='relu', input_shape=im.shape, padding='same'))
model.add(BatchNormalization())
model.add(Conv2D(n_channels, kernel_size=3, activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(Conv2D(n_channels, kernel_size=1, padding='same'))
model.add(BatchNormalization())
model.add(Reshape((im.shape[0] * im.shape[1], n_channels)))
img = np.expand_dims(im,0)
print (img.shape)
output = model.predict(img)
print (output.shape)
im_target = np.argmax(output[0], 1)
print (im_target.shape)
for inds in label_indexes:
    u_labels_, hist = np.unique(im_target[inds], return_counts=True)
    im_target[inds] = u_labels_[np.argmax(hist, 0)]

def custom_loss(loss_target, loss_output):
    return k.categorical_crossentropy(target=k.stack(loss_target), output=k.stack(loss_output), from_logits=True)
model.compile(optimizer=SGD(lr=0.1, momentum=0.9), loss=custom_loss)
model.fit(img, output, epochs=100, batch_size=1, verbose=1)
pred_result = model.predict(x=[img])[0]
print (pred_result.shape)
target = np.argmax(pred_result, 1)
print (target.shape)
nLabels = len(np.unique(target))
label_colours = np.random.randint(255, size=(100, 3))
im_target_rgb = np.array([label_colours[c % 100] for c in im_target])
im_target_rgb = im_target_rgb.reshape(im.shape).astype(np.uint8)
cv2.imwrite("out.jpg", im_target_rgb)
However, the Keras output is really different from the PyTorch one.
Input image:
Pytorch result:
Keras result:
Could someone help me with this translation?
Edit 1:
I corrected two errors as advised by @sebrockm:
1. removed `relu` from last conv layer
2. added `from_logits = True` in the loss function
Also, I changed the number of conv layers from 4 to 3 to match the original code.
However, the output image did not improve, and the loss goes negative:
Epoch 99/100
1/1 [==============================] - 0s 92ms/step - loss: -22.8380
Epoch 100/100
1/1 [==============================] - 0s 99ms/step - loss: -23.039
I think the Keras code lacks a connection between the model and the output, but I could not figure out how to make this connection.
Two major mistakes that I see (likely related):
The last convolutional layer in the original model does not have an activation function, while your translation uses relu.
The original model uses CrossEntropyLoss as its loss function, while your model uses categorical_crossentropy with from_logits=False (the default). Without mathematical background the difference is tricky to explain, but in short: CrossEntropyLoss has a softmax built in, which is why the model doesn't have one on the last layer. To do the same in Keras, use k.categorical_crossentropy(..., from_logits=True). "from_logits" means the input values are expected not to be "softmaxed", i.e. all values can be arbitrary. Currently, your loss function expects the output values to be "softmaxed", i.e. all values must be between 0 and 1 (and sum up to 1).
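A minimal corrected loss along those lines (a sketch, not verbatim from the answer):
import keras.backend as K

def custom_loss(y_true, y_pred):
    # y_pred are raw logits; from_logits=True applies the softmax internally,
    # mirroring PyTorch's CrossEntropyLoss.
    return K.categorical_crossentropy(y_true, y_pred, from_logits=True)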
Update:
One other mistake, likely a huge one: in Keras, you calculate the output once at the beginning and never change it from there on. Then you train your model to fit this initially generated output.
In the original pytorch code, target (which is the variable being trained on) gets updated in every training loop.
So, you cannot use Keras' fit method, which is designed to do the entire training for you given fixed training data. You will have to replicate the training loop manually, just as is done in the pytorch code; see the sketch below. I'm not sure this is easily doable with the API Keras provides. train_on_batch is one method you surely will need in your manual loop. You will have to do some more work, I'm afraid...
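A rough sketch of such a manual loop (assuming the model above with its Reshape output of shape (H*W, n_channels), and the label_indexes from the question):
import numpy as np

for step in range(100):
    logits = model.predict(img)               # forward pass, shape (1, H*W, 100)
    im_target = np.argmax(logits[0], axis=1)  # current hard cluster assignments
    for inds in label_indexes:                # majority vote within each superpixel
        u, hist = np.unique(im_target[inds], return_counts=True)
        im_target[inds] = u[np.argmax(hist)]
    target = np.eye(n_channels)[im_target][np.newaxis]  # one-hot targets, (1, H*W, 100)
    loss = model.train_on_batch(img, target)  # one gradient step on the fresh target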

Shared weight matrices between dense layers in keras

I am trying to reimplement this tensorflow code in Keras. I have looked at other questions submitted here, but none of them address what I am trying to recreate. The goal is to share a weight matrix across multiple dense layers.
import tensorflow as tf
# define input and weight matrices
x = tf.placeholder(shape=[None, 4], dtype=tf.float32)
w1 = tf.Variable(tf.truncated_normal(stddev=.1, shape=[4, 12]), dtype=tf.float32)
w2 = tf.Variable(tf.truncated_normal(stddev=.1, shape=[12, 2]), dtype=tf.float32)
# neural network
hidden_1 = tf.nn.tanh(tf.matmul(x, w1))
projection = tf.matmul(hidden_1, w2)
hidden_2 = tf.nn.tanh(projection)
hidden_3 = tf.nn.tanh(tf.matmul(hidden_2, tf.transpose(w2)))
y = tf.matmul(hidden_3, tf.transpose(w1))
# loss function and optimizer
loss = tf.reduce_mean(tf.reduce_sum((x - y) * (x - y), 1))
optimize = tf.train.AdamOptimizer().minimize(loss)
init = tf.initialize_all_variables()
The issue is reimplementing these weight layers in Keras as the transpose of the original layers. I am currently implementing my own network using the Keras functional API.
Start by defining your two dense layers:
from keras.layers import Dense, Lambda
import keras.backend as K
dense1 = Dense(12, use_bias=False, activation='tanh')
dense2 = Dense(2, use_bias=False, activation='tanh')
You can then access the weights of your layers with, for example, dense1.weights[0]. You can wrap this in a Lambda layer that also transposes your weights:
h3 = Lambda(lambda x: K.dot(x, K.transpose(dense2.weights[0])))(h2)
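Wiring the full autoencoder in the same spirit might look like this (a sketch completing the answer, with the tanh on h3 and the plain final projection taken from the TensorFlow code above):
from keras.layers import Input
from keras.models import Model

inp = Input(shape=(4,))
h1 = dense1(inp)
h2 = dense2(h1)
# Reuse the transposed kernels of dense2 and dense1 for the decoding path.
h3 = Lambda(lambda t: K.tanh(K.dot(t, K.transpose(dense2.weights[0]))))(h2)
out = Lambda(lambda t: K.dot(t, K.transpose(dense1.weights[0])))(h3)

model = Model(inp, out)
model.compile(optimizer='adam', loss='mse')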

Writing the following CNN in tensorflow

I am new to Deep Learning. I have learnt the basics through reading and am trying to implement a real network to see how/if it will really work. I chose Tensorflow in DIGITS and the following network because they give out the exact architecture with training material: Steganalysis with DL.
I wrote the following code for the architecture in Steganalysis with DL by looking at existing networks in DIGITS and the Tensorflow documentation.
from model import Tower
from utils import model_property
import tensorflow as tf
import tensorflow.contrib.slim as slim
import utils as digits
class UserModel(Tower):
    @model_property
    def inference(self):
        x = tf.reshape(self.x, shape=[-1, self.input_shape[0], self.input_shape[1], self.input_shape[2]])
        with slim.arg_scope([slim.conv2d, slim.fully_connected],
                            weights_initializer=tf.contrib.layers.xavier_initializer(),
                            weights_regularizer=slim.l2_regularizer(0.0001)):
            conv1 = tf.layers.conv2d(inputs=x, filters=64, kernel_size=7, padding='same', strides=2, activation=tf.nn.relu)
            rnorm1 = tf.nn.local_response_normalization(input=conv1)
            conv2 = tf.layers.conv2d(inputs=rnorm1, filters=16, kernel_size=5, padding='same', strides=1, activation=tf.nn.relu)
            rnorm2 = tf.nn.local_response_normalization(input=conv2)
            flatten = tf.contrib.layers.flatten(rnorm2)
            fc1 = tf.contrib.layers.fully_connected(inputs=flatten, num_outputs=1000, activation_fn=tf.nn.relu)
            fc2 = tf.contrib.layers.fully_connected(inputs=fc1, num_outputs=1000, activation_fn=tf.nn.relu)
            fc3 = tf.contrib.layers.fully_connected(inputs=fc2, num_outputs=2)
            sm = tf.nn.softmax(fc3)
            return fc3

    @model_property
    def loss(self):
        model = self.inference
        loss = digits.classification_loss(model, self.y)
        accuracy = digits.classification_accuracy(model, self.y)
        self.summaries.append(tf.summary.scalar(accuracy.op.name, accuracy))
        return loss
I tried running it, but the accuracy is pretty low. Could someone tell me if I've done it completely wrong, or what's wrong with it, and how to code it properly?
UPDATE: Thank you Nessuno! With the fix you mentioned I came up with this code:
from model import Tower
from utils import model_property
import tensorflow as tf
import tensorflow.contrib.slim as slim
import utils as digits
class UserModel(Tower):
    @model_property
    def inference(self):
        x = tf.reshape(self.x, shape=[-1, self.input_shape[0], self.input_shape[1], self.input_shape[2]])
        with slim.arg_scope([slim.conv2d, slim.fully_connected],
                            weights_initializer=tf.contrib.layers.xavier_initializer(),
                            weights_regularizer=slim.l2_regularizer(0.00001)):
            conv1 = tf.layers.conv2d(inputs=x, filters=64, kernel_size=7, padding='Valid', strides=2, activation=tf.nn.relu)
            rnorm1 = tf.nn.local_response_normalization(input=conv1)
            conv2 = tf.layers.conv2d(inputs=rnorm1, filters=16, kernel_size=5, padding='Valid', strides=1, activation=tf.nn.relu)
            rnorm2 = tf.nn.local_response_normalization(input=conv2)
            flatten = tf.contrib.layers.flatten(rnorm2)
            fc1 = tf.contrib.layers.fully_connected(inputs=flatten, num_outputs=1000, activation_fn=tf.nn.relu)
            fc2 = tf.contrib.layers.fully_connected(inputs=fc1, num_outputs=1000, activation_fn=tf.nn.relu)
            fc3 = tf.contrib.layers.fully_connected(inputs=fc2, num_outputs=2, activation_fn=None)
            return fc3

    @model_property
    def loss(self):
        model = self.inference
        loss = digits.classification_loss(model, self.y)
        accuracy = digits.classification_accuracy(model, self.y)
        self.summaries.append(tf.summary.scalar(accuracy.op.name, accuracy))
        return loss
Solver type is SGD, learning rate is 0.001, and I am shuffling the training data. I have increased the training data to 6000 images (3000 per category, with 20% of that reserved for validation). I downloaded the training data from this link. But I am only getting the following graph (screenshot not reproduced here). I think this is overfitting. Do you have any suggestions to improve the validation accuracy?
In NVIDIA DIGITS, classification_loss, exactly like tf.nn.softmax_cross_entropy_with_logits in tensorflow, expects as input a linear layer of neurons (logits).
Instead, you're passing in sm = tf.nn.softmax(fc3), hence you're applying the softmax operation twice, and this is the reason for your low accuracy.
In order to solve this issue, just change the model output layer to
fc3 = slim.fully_connected(fc2, 2, activation_fn=None, scope='fc3')
return fc3