MNIST Classification: mean_squared_error loss function and tanh activation function - tensorflow

I changed the getting-started example of TensorFlow as follows:
import tensorflow as tf
from sklearn.metrics import roc_auc_score
import numpy as np
import commons as cm
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation=tf.nn.tanh),
    # tf.keras.layers.Dense(512, activation=tf.nn.tanh),
    # tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.nn.tanh)
])
model.compile(optimizer='adam',
              loss='mean_squared_error',
              # loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
history = cm.Histories()
h = model.fit(x_train, y_train, epochs=50, callbacks=[history])
print("history:", history.losses)
cm.plot_history(h)
# cm.plot(history.losses, history.aucs)
test_predictions = model.predict(x_test)
# Compute confusion matrix
pred = np.argmax(test_predictions, axis=1)
pred2 = model.predict_classes(x_test)
confusion = confusion_matrix(y_test, pred)
cm.draw_confusion(confusion, range(10))
With its default parameters (relu activation in the hidden layer, softmax at the output layer, and sparse_categorical_crossentropy as the loss function) it works fine, and the predictions for all digits are above 99%.
However, with my parameters (tanh activation function and mean_squared_error loss function) it just predicts 0 for all test samples.
I wonder what the problem is? The accuracy increases with each epoch and reaches 99%, and the loss is about 20.

You need to use a loss function that matches your targets. Here you have integer class labels, so you need sparse_categorical_crossentropy, and you should also set from_logits=True and use no activation on the last layer.
If you need tanh as your output, then you can use MSE with a one-hot encoded version of your labels plus rescaling, as sketched below.
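A minimal sketch of both options (assuming the same data pipeline as above; mapping the one-hot targets into tanh's (-1, 1) range is one reasonable rescaling, not the only one):
# Option 1: raw logits + sparse_categorical_crossentropy
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation=tf.nn.tanh),
    tf.keras.layers.Dense(10)  # no activation: outputs are logits
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5)

# Option 2: tanh output + MSE on rescaled one-hot targets
y_train_oh = tf.one_hot(y_train, 10).numpy() * 2.0 - 1.0  # {0, 1} -> {-1, 1}
model2 = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation=tf.nn.tanh),
    tf.keras.layers.Dense(10, activation=tf.nn.tanh)
])
model2.compile(optimizer='adam', loss='mean_squared_error')
model2.fit(x_train, y_train_oh, epochs=5)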

Related

Why are the weights of my QAT tf_model floats and not 8-bit integers?

I performed simple quantization-aware training with TensorFlow on MNIST as follows:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import mnist

# Load MNIST dataset
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the input images so that each pixel value is between 0 and 1.
train_images = train_images / 255.0
test_images = test_images / 255.0

# Define the model architecture.
model = keras.Sequential([
    keras.layers.InputLayer(input_shape=(28, 28)),
    keras.layers.Reshape(target_shape=(28, 28, 1)),
    keras.layers.Conv2D(filters=12, kernel_size=(3, 3)),
    keras.layers.Activation('relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(10)
])

# Train the digit classification model
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(
    train_images,
    train_labels,
    epochs=5,
    validation_split=0.1,
)

import tensorflow_model_optimization as tfmot

quantize_model = tfmot.quantization.keras.quantize_model

# q_aware stands for quantization aware.
q_aware_model = quantize_model(model)

# `quantize_model` requires a recompile.
q_aware_model.compile(optimizer='adam',
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=['accuracy'])

train_images_subset = train_images[0:1000]  # out of 60000
train_labels_subset = train_labels[0:1000]

q_aware_model.fit(train_images_subset, train_labels_subset,
                  batch_size=500, epochs=5, validation_split=0.1)
However, when I try to investigate the weights of the quantized model using, for instance, q_aware_model.get_weights()[5], I get an array of dtype float32. I expected 8-bit integers; what am I doing wrong?
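(A note on what QAT actually does: quantization-aware training only inserts fake-quantization ops to simulate int8 arithmetic during training, so the stored weights remain float32. Real int8 weights only materialize when the model is converted, for example with the TFLite converter. A minimal conversion sketch, assuming the q_aware_model above:)
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
quantized_tflite_model = converter.convert()  # the converted model stores int8 weights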

When should we use the tf.function decorator?

I'm trying to boost the performance of a simple two-hidden-layer network (2NN). Here is the code:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.datasets import mnist
from tensorflow import keras
import tensorflow as tf
# load Mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data(path='mnist.npz')
X_train = X_train.reshape(60000, 784).astype('float32') / 255
X_test = X_test.reshape(10000, 784).astype('float32') / 255
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)
# configure the model
model = Sequential()
model.add(Dense(200, activation='relu', input_shape=(784,)))
model.add(Dense(200, activation='relu'))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.1), metrics=['accuracy'])
# train and evaluate the model
model.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_data=(X_test, y_test))
model.evaluate(X_test, y_test)
Now, I wonder whether there is a case for using the @tf.function decorator here, and if it's needed, how?
Your code only uses built-in functions and classes, so there is no need for a @tf.function decorator. @tf.function is basically used to convert a plain Python function into a TensorFlow graph, as mentioned here. Since you are only using the built-in modules and functions, calls like model.fit already run as graphs under the hood.
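Where @tf.function does pay off is in a custom training loop, where you write the training step yourself instead of calling model.fit. A minimal sketch using the model, data, and imports defined above (the names loss_fn and train_step are mine):
loss_fn = keras.losses.CategoricalCrossentropy()
optimizer = SGD(learning_rate=0.1)

@tf.function  # traces this Python function into a single TensorFlow graph
def train_step(x, y):
    with tf.GradientTape() as tape:
        preds = model(x, training=True)
        loss = loss_fn(y, preds)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss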

Any ideas how to solve this problem with the activation function?

I have a problem loading a self-pretrained Keras model.
When I make predictions directly after training the model, everything works fine.
My code:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
import matplotlib.pyplot as plt
import numpy as np
import os
from keras.models import load_model
"""
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
from tensorflow.keras.models import load_model
"""
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Reshaping the arrays to 4 dims so that they work with the Keras API
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
input_shape = (28, 28, 1)

# Making sure that the values are float so that we get decimal points after division
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# Normalizing the pixel values by dividing by the max value.
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print('Number of images in x_train', x_train.shape[0])
print('Number of images in x_test', x_test.shape[0])

def train():
    # Creating a Sequential model and adding the layers
    model = Sequential()
    model.add(Conv2D(28, kernel_size=(3, 3), input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())  # Flattening the 2D arrays for fully connected layers
    model.add(Dense(128, activation=tf.nn.relu))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation=tf.nn.softmax))
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(x=x_train, y=y_train, epochs=10)
    print(model.summary())
    model.save('x.h5')

def eval():
    model = load_model('x.h5')
    scores = model.evaluate(x_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))

train()
eval()
And I got an error:
ValueError: Unknown activation function:softmax_v2
I've tried using different TensorFlow versions (1.15, 2.0, 1.5), but this changes nothing.
Any ideas what is wrong with it?
Edit:
This problem occurs only when I try to load the model.
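(A hedged note on the likely cause: the code builds the model with the standalone keras package but uses tf.nn activations, and tf.nn.softmax serializes under the internal name softmax_v2, which standalone Keras cannot resolve at load time. Two sketched workarounds, neither confirmed by the asker:)
# Option 1: use string activation names so serialization stays portable
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

# Option 2: tell load_model how to resolve the internal name
model = load_model('x.h5', custom_objects={'softmax_v2': tf.nn.softmax})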

Finding the optimal weights in a network with one layer and two hidden units

Scikit-learn's neural network implementation cannot consistently find the optimal weights; the best runs land at acc=0.975, but not always (seemingly depending on the initialization).
How can I guarantee optimal accuracy in such a simple setting?
Why is the Keras version not even getting close?
I am new to neural networks and interested in sparse models, i.e. the highest accuracy with the fewest parameters. Scikit-learn with the lbfgs solver sometimes seems to land on the optimal weights.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
X, y = make_moons(n_samples=200, noise=0.1, random_state=0)
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

# sklearn
nn = MLPClassifier(hidden_layer_sizes=(2,),
                   alpha=1e-4,  # 'alpha' was undefined in the original; sklearn's default shown
                   max_iter=100000000,
                   solver='lbfgs',
                   activation='tanh',
                   verbose=False).fit(X_train, y_train)
print(nn.score(X_test, y_test))

# Keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(2, input_dim=2, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train,
          epochs=900,
          batch_size=200,
          validation_data=(X_test, y_test),
          verbose=0)
print(model.evaluate(X_test, y_test)[1])
The weights for sklearn's model where I landed the 0.975 accuracy are:
print(nn.coefs_)
[array([[-0.66491063, -5.99816399],
        [ 0.13907145, -3.82369965]]),
 array([[-16.53339218],
        [  9.14208312]])]
print(nn.intercepts_)
[array([0.04063317, 0.01169239]), array([0.40667715])]
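(Not a full answer, but one hedged observation: the Keras run uses plain full-batch SGD, a much weaker optimizer than lbfgs on a problem this small, and a single random initialization, whereas the sklearn result already depends on a lucky initialization. Switching to adam and restarting from several seeds usually closes the gap; a sketch under those assumptions:)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

best_acc = 0.0
for seed in range(10):  # several random restarts, keeping the best run
    tf.random.set_seed(seed)
    model = Sequential([Dense(2, input_dim=2, activation='tanh'),
                        Dense(1, activation='sigmoid')])
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=900, batch_size=200, verbose=0)
    best_acc = max(best_acc, model.evaluate(X_test, y_test, verbose=0)[1])
print(best_acc)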

Student-teacher model in Keras

I'm converting the student-teacher model at the URL below to a Keras one.
https://github.com/chengshengchan/model_compression/blob/master/teacher-student.py
How can I feed input to two models (student and teacher) and get one output from only the student in Keras?
I'll set all the teacher's tensors to trainable=False, and define the loss function as the difference between the student's and the teacher's outputs, like below:
tf_loss = tf.nn.l2_loss(teacher - student)/batch_size
As far as I know, it is possible to give input to only one model when calling model.fit, but in this case I need to feed both the teacher and the student model.
Thanks in advance!
Below is a very simple student-teacher model in Keras.
I hope it might be helpful to someone like me.
import keras
from keras.datasets import mnist
from keras.layers import Input, Embedding, LSTM, Dense, Lambda
from keras.models import Model
import numpy as np
from keras.utils import np_utils
from keras.layers.core import Dense, Dropout, Activation

nb_classes = 10

(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

from keras.models import Sequential
from keras.optimizers import SGD, Adam, RMSprop

batch_size = 128
nb_classes = 10
nb_epoch = 3

teacher = Sequential()
teacher.add(Dense(10, input_shape=(784,)))
teacher.add(Dense(10))
teacher.add(Activation('softmax'))
teacher.summary()
teacher.compile(loss='categorical_crossentropy',
                optimizer=RMSprop(),
                metrics=['accuracy'])
history = teacher.fit(X_train, Y_train,
                      batch_size=batch_size, epochs=nb_epoch,
                      verbose=1, validation_data=(X_test, Y_test))
score = teacher.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# freeze the teacher so that only the student is trained
for i in range(len(teacher.layers)):
    setattr(teacher.layers[i], 'trainable', False)

# target for the combined model: zero difference between teacher and student
Y_train = np.zeros((60000, 10))

student = Sequential()
student.add(Dense(10, input_dim=784))
student.add(Activation('softmax'))
student.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

from keras.layers import Add

# compute teacher.output - student.output by negating the student's output
def negativeActivation(x):
    return -x

negativeRight = Activation(negativeActivation)(student.output)
diff = Add()([teacher.output, negativeRight])

model = Model(inputs=[teacher.input, student.input], outputs=[diff])
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['acc'])
model.summary(line_length=150)
model.fit([X_train, X_train], [Y_train], batch_size=128, epochs=5)

print(student.evaluate(X_test, Y_test))
The only implementation I have seen in Keras involves building two separate functions that either widen or deepen the teacher model's weight layers for use as initial weights for the student model.
To be honest, I am not sure it is precisely the Hinton et al. (2015) distillation, but it is teacher-student:
https://github.com/fchollet/keras/issues/3491
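(For comparison with the difference-of-outputs trick above: Hinton-style distillation instead trains the student against temperature-softened teacher probabilities. A minimal sketch of that loss, assuming both networks output raw logits and a hypothetical temperature T:)
import tensorflow as tf

T = 5.0  # temperature; an assumed value, typically somewhere in 1-20

def distillation_loss(teacher_logits, student_logits):
    # soften the teacher's distribution, then take cross-entropy against
    # the student's temperature-scaled logits
    soft_targets = tf.nn.softmax(teacher_logits / T)
    return tf.keras.losses.categorical_crossentropy(
        soft_targets, student_logits / T, from_logits=True)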