Convolutional neural network outputting equal probabilities for all labels - tensorflow

I am currently training a CNN on MNIST, and the output probabilities (softmax) converge to [0.1, 0.1, ..., 0.1] as training goes on. The initial values aren't uniform, so I can't figure out whether I'm doing something stupid here.
I'm only training for 15 steps, just to see how training progresses; even though that's a low number, I don't think it should result in uniform predictions.
import numpy as np
import tensorflow as tf
import imageio
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
# Getting data
from sklearn.model_selection import train_test_split
def one_hot_encode(data):
    new_ = []
    for i in range(len(data)):
        _ = np.zeros([10],dtype=np.float32)
        _[int(data[i])] = 1.0
        new_.append(np.asarray(_))
    return new_
data = np.asarray(mnist["data"],dtype=np.float32)
labels = np.asarray(mnist["target"],dtype=np.float32)
labels = one_hot_encode(labels)
tr_data,test_data,tr_labels,test_labels = train_test_split(data,labels,test_size = 0.1)
tr_data = np.asarray(tr_data)
tr_data = np.reshape(tr_data,[len(tr_data),28,28,1])
test_data = np.asarray(test_data)
test_data = np.reshape(test_data,[len(test_data),28,28,1])
tr_labels = np.asarray(tr_labels)
test_labels = np.asarray(test_labels)
def get_conv(x,shape):
    weights = tf.Variable(tf.random_normal(shape,stddev=0.05))
    biases = tf.Variable(tf.random_normal([shape[-1]],stddev=0.05))
    conv = tf.nn.conv2d(x,weights,[1,1,1,1],padding="SAME")
    return tf.nn.relu(tf.nn.bias_add(conv,biases))
def get_pool(x,shape):
    return tf.nn.max_pool(x,ksize=shape,strides=shape,padding="SAME")
def get_fc(x,shape):
    sh = x.get_shape().as_list()
    dim = 1
    for i in sh[1:]:
        dim *= i
    x = tf.reshape(x,[-1,dim])
    weights = tf.Variable(tf.random_normal(shape,stddev=0.05))
    return tf.nn.relu(tf.matmul(x,weights) + tf.Variable(tf.random_normal([shape[1]],stddev=0.05)))
#Creating model
x = tf.placeholder(tf.float32,shape=[None,28,28,1])
y = tf.placeholder(tf.float32,shape=[None,10])
conv1_1 = get_conv(x,[3,3,1,128])
conv1_2 = get_conv(conv1_1,[3,3,128,128])
pool1 = get_pool(conv1_2,[1,2,2,1])
conv2_1 = get_conv(pool1,[3,3,128,512])
conv2_2 = get_conv(conv2_1,[3,3,512,512])
pool2 = get_pool(conv2_2,[1,2,2,1])
conv3_1 = get_conv(pool2,[3,3,512,1024])
conv3_2 = get_conv(conv3_1,[3,3,1024,1024])
conv3_3 = get_conv(conv3_2,[3,3,1024,1024])
conv3_4 = get_conv(conv3_3,[3,3,1024,1024])
pool3 = get_pool(conv3_4,[1,3,3,1])
fc1 = get_fc(pool3,[9216,1024])
fc2 = get_fc(fc1,[1024,10])
softmax = tf.nn.softmax(fc2)
loss = tf.losses.softmax_cross_entropy(logits=fc2,onehot_labels=y)
train_step = tf.train.AdamOptimizer().minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(15):
    print(i)
    indices = np.random.randint(len(tr_data),size=[200])
    batch_data = tr_data[indices]
    batch_labels = tr_labels[indices]
    sess.run(train_step,feed_dict={x:batch_data,y:batch_labels})
Thank you so much.

There are several issues with your code, including elementary ones. I strongly suggest you first go through the Tensorflow step-by-step tutorials for MNIST, MNIST For ML Beginners and Deep MNIST for Experts.
In short, regarding your code:
First, your final layer fc2 should not have a ReLU activation.
Second, the way you build your batches, i.e.
indices = np.random.randint(len(tr_data),size=[200])
just grabs random samples in each iteration, which is far from the correct way of doing so (see the sketch at the end of this answer).
Third, the data you feed into the network are not normalized in [0, 1], as they should be:
np.max(tr_data[0]) # get the max value of your first training sample
# 255.0
The third point was initially puzzling for me, too, since in the aforementioned Tensorflow tutorials they don't seem to normalize the data either. But close inspection revealed the reason: if you import the MNIST data through the Tensorflow-provided utility functions (instead of the scikit-learn ones, as you do here), they come already normalized in [0, 1], something that is nowhere hinted at:
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import numpy as np
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
np.max(mnist.train.images[0])
# 0.99607849
This is an admittedly strange design decision - as far as I am aware, in all other similar cases/tutorials normalizing the input data is an explicit part of the pipeline (see e.g. the Keras example), and with good reason (it is something you will certainly be expected to do yourself later, when using your own data).
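To make the first and third points (and the batching) concrete, here is a rough sketch using your variable names; get_fc_linear, num_epochs and batch_size are illustrative additions, not taken from your code:
# Scale the pixel values to [0, 1] before feeding them to the network.
tr_data = tr_data / 255.0
test_data = test_data / 255.0
def get_fc_linear(x, shape):
    # Final layer: affine only, no ReLU; the raw logits go straight into
    # tf.losses.softmax_cross_entropy.
    weights = tf.Variable(tf.random_normal(shape, stddev=0.05))
    biases = tf.Variable(tf.random_normal([shape[1]], stddev=0.05))
    return tf.matmul(x, weights) + biases
fc2 = get_fc_linear(fc1, [1024, 10])
# Iterate over the training set in shuffled epochs instead of drawing
# random indices on every step.
num_epochs = 5
batch_size = 200
for epoch in range(num_epochs):
    perm = np.random.permutation(len(tr_data))
    for start in range(0, len(tr_data), batch_size):
        idx = perm[start:start + batch_size]
        sess.run(train_step, feed_dict={x: tr_data[idx], y: tr_labels[idx]})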

Related

Unexpected input data type. Actual: (tensor(double)) , expected: (tensor(float))

I am learning the ONNX framework, which allows us to deploy deep learning (and other) models into production.
However, there is one thing I am missing. I thought that the main reason for having such a framework is inference: when we have a trained model and want to use it in a different venv (where, for example, we cannot have PyTorch), the model can still be used.
I have prepared a "from scratch" example here:
# Modules
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torchvision
import onnx
import onnxruntime
import matplotlib.pyplot as plt
import numpy as np
# %config Completer.use_jedi = False
# MNIST Example dataset
train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST(
        'data', train=True, download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
        ])),
    batch_size=800)
# Take data and labels "by hand"
inputs_batch, labels_batch = next(iter(train_loader))
# Simple Model
class CNN(nn.Module):
    def __init__(self, in_channels, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels,
            out_channels = 10, kernel_size = (3, 3), stride = (1, 1), padding=(1, 1))
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride = (2, 2))
        self.conv2 = nn.Conv2d(in_channels = 10, out_channels=16, kernel_size = (3, 3), stride = (1, 1), padding=(1, 1))
        self.fc1 = nn.Linear(16*7*7, num_classes)
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        return x
# Training setting
device = 'cpu'
batch_size = 64
learning_rate = 0.001
n_epochs = 10
# Dataset prep
dataset = TensorDataset(inputs_batch, labels_batch)
TRAIN_DF = DataLoader(dataset = dataset, batch_size = batch_size, shuffle = True)
# Model Init
model = CNN(in_channels=1, num_classes=10)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
# Training Loop
for epoch in range(n_epochs):
    for data, labels in TRAIN_DF:
        model.train()
        # Send data and labels to the device
        data = data.to(device)
        labels = labels.to(device)
        # data = data.reshape(data.shape[0], -1)
        # Forward
        pred = model(data)
        loss = F.cross_entropy(pred, labels)
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# Check Accuracy
def check_accuracy(loader, model):
    num_correct = 0
    num_total = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            # x = x.reshape(x.shape[0], -1)
            scores = model(x)
            _, pred = scores.max(1)
            num_correct += (pred == y).sum()
            num_total += pred.size(0)
    print(F"Got {num_correct} / {num_total} with accuracy {float(num_correct)/float(num_total)*100: .2f}")
check_accuracy(TRAIN_DF, model)
# Inference with ONNX
# Create Artifical data of the same size
img_size = 28
dummy_data = torch.randn(1, img_size, img_size)
dummy_input = torch.autograd.Variable(dummy_data).unsqueeze(0)
input_name = "input"
output_name = "output"
model_eval = model.eval()
torch.onnx.export(
    model_eval,
    dummy_input,
    "model_CNN.onnx",
    input_names=["input"],
    output_names=["output"],
)
# Take Random Image from Training Data
X_pred = data[4].unsqueeze(0)
# Convert the Tensor image to PURE numpy and pretend we are working in venv where we only have numpy - NO PYTORCH
X_pred_np = X_pred.numpy()
X_pred_np = np.array(X_pred_np)
IMG_Rando = np.random.rand(1, 1, 28, 28)
np.shape(X_pred_np) == np.shape(IMG_Rando)
ort_session = onnxruntime.InferenceSession("model_CNN.onnx")
def to_numpy(tensor):
    return (
        tensor.detach().cpu().numpy()
        if tensor.requires_grad
        else tensor.cpu().numpy()
    )
# compute ONNX Runtime output prediction
# WORKS
# ort_inputs = {ort_session.get_inputs()[0].name: X_pred_np}
# DOES NOT WORK
ort_inputs = {ort_session.get_inputs()[0].name: IMG_Rando}
# WORKS
# ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(X_pred)}
ort_outs = ort_session.run(None, ort_inputs)
ort_outs
Firstly, we create a simple model and train it on the MNIST dataset.
Then we export the trained model using the ONNX framework.
Now, when I want to classify an image using X_pred_np, it works even though it is "pure" NumPy, which is what I want.
However, I suspect that this particular case works only because it has been derived from a PyTorch tensor object, and thus "under the hood" it still has PyTorch attributes.
When I try to run inference on the random "pure" NumPy object IMG_Rando, however, there seems to be a problem:
Unexpected input data type. Actual: (tensor(double)) , expected: (tensor(float)).
suggesting that the PyTorch form is needed.
Is there a way to use only NumPy images for the ONNX predictions, so that inference can be performed in a separate venv where no PyTorch is installed?
Secondly, is there a way that ONNX would remember the actual classes?
In this particular case, the index corresponds to the label of the image. However, in animal classification, ONNX would not give us the "DOG", "CAT" and other labels, but would only give us the index of the predicted label, which we would need to run through our own "prediction dictionary" so that we know the fifth label is associated with "cat", the sixth label with "dog", and so on.
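(For clarity, the "prediction dictionary" I mean is just something like this; the class names are made up for illustration:)
# Hypothetical index-to-label map kept next to the exported .onnx file.
idx_to_label = {0: "cat", 1: "dog", 2: "horse"}
logits = ort_outs[0]                           # output of ort_session.run above
pred_idx = int(np.argmax(logits, axis=1)[0])
print(idx_to_label.get(pred_idx, "unknown"))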
Numpy defaults to float64 while pytorch defaults to float32. Cast the input to float32 before the inference:
IMG_Rando = np.random.rand(1, 1, 28, 28).astype(np.float32)
double is short for double-precision floating-point format, which is a floating point number representation on 64 bits, while float refers to a floating point number on 32 bits.
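If you want to double-check what the exported model expects before feeding it, you can inspect the session inputs at runtime; here is a small sketch reusing the question's ort_session and IMG_Rando:
# Compare the model's declared input type with the array we are about to feed.
inp = ort_session.get_inputs()[0]
print(inp.name, inp.shape, inp.type)   # e.g. 'input' [1, 1, 28, 28] 'tensor(float)'
print(IMG_Rando.dtype)                 # float64 by default -> cast before running
ort_outs = ort_session.run(None, {inp.name: IMG_Rando.astype(np.float32)})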
As an improvement to the accepted answer, the idiomatic way to generate random numbers in Numpy is now by using a Generator. This offers the benefit of being able to create the array in the right type directly, rather than using the expensive astype operation, which copies the array (as in the accepted answer). Thus, the improved solution would look like:
rng = np.random.default_rng() # set seed if desired
IMG_Rando = rng.random((1, 1, 28, 28), dtype=np.float32)

Prediction in non classified answer

I have created a neural network in Keras. The program runs, but there is a problem with the result: it is a forex forecast network, and the forecast should return 0 or 1, as provided in the training dataset, but the result comes out as a float between 0 and 1, like "[[0.47342286]]".
I have tried to use numpy argmax, but it only results in one answer.
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from sklearn.preprocessing import MinMaxScaler
from ta import *
dataset = pd.read_csv('C:/Users/SIGMA COM/PycharmProjects/deep/GBP_JPY Historical Data.csv',index_col="Date",parse_dates=True)
dataset = dataset[::-1]
print(dataset.head())
print(dataset.isna().any())
print(dataset.info())
dataset['Open'].plot(figsize=(16,6))
# initial value
step_size = 4
batch_sizes = 1
dataset['Diff'] = dataset['Open'] - dataset['Price']
dataset['Range'] = dataset['High'] - dataset['Low']
dataset['Rsi'] = rsi(close=dataset['Price'],n=4,fillna=True)
dataset['Macd'] = macd(close=dataset['Price'],n_fast=12,n_slow=26,fillna=True)
dataset['Cci'] = cci(high=dataset['High'],low=dataset['Low'],close=dataset['Price'],n=20,fillna=True)
# dataset['Rsi'] = dataset['Rsi'] /100.0
# # dataset['Macd'] = dataset['Macd'] /2.0
# dataset['Cci'] = dataset['Cci'] / 500.0
training_set = dataset[['Rsi','Macd','Cci','Price','Low','High','Open','Signal']]
sc = MinMaxScaler()
training_set_scaled = sc.fit_transform(training_set)
# Creating a data structure with 60 timesteps and 1 output
X_train = []
y_train = []
for i in range(60, 1258):
    X_train.append(training_set_scaled[i-60:i, 0])
    y_train.append(training_set_scaled[i, -1:])
X_train, y_train = np.array(X_train), np.array(y_train)
# Reshaping
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
print(X_train.shape)
print(X_train)
plt.show()
# Part 2 - Building the RNN
# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
print((X_train.shape[1], 1))
print(X_train.shape)
# Initialising the RNN
regressor = Sequential()
# Adding the first LSTM layer and some Dropout regularisation
regressor.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
regressor.add(Dropout(0.2))
# Adding a second LSTM layer and some Dropout regularisation
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
# Adding a third LSTM layer and some Dropout regularisation
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
# Adding a fourth LSTM layer and some Dropout regularisation
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.2))
# Adding the output layer
regressor.add(Dense(units = 1,activation='sigmoid'))
# Compiling the RNN
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Fitting the RNN to the Training set
regressor.fit(X_train, y_train, epochs = 10, batch_size = 32)
result = regressor.predict(np.reshape(X_train[100],(1,60,1)))
print(result)
I want the model to make predictions in class 0 or 1.
This behavior is expected, because the sigmoid function sigma(x) = 1 / (1 + exp(-x)) always returns a number strictly between zero and one.
So if your class labels are either 0 or 1, which seems to be the case here, for a binary classification problem you can just round the resulting output for your class prediction. Let's make a distinction between a classification and a regression problem here: regression is like finding the "line of best fit", that is, the model is being trained to approximate the data. This appears to be what you're doing here: you're minimizing the mean squared error and searching for the model that best approximates your data, but that on its own doesn't produce a class prediction.
If you want to actually make a classification, you can just round all elements of the result of regressor.predict to 0 or 1, and then compare your predictions with the true labels. This can be done easily in numpy like so: numpy.around(your_predictions, decimals=0). Note that the decimals argument is not strictly required, since it defaults to 0, but it's nice for clarity.
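A minimal sketch of that rounding step (the variable names reuse the ones from your script, but treat it as illustrative):
import numpy as np
probs = regressor.predict(X_train)      # sigmoid outputs in (0, 1), shape (n_samples, 1)
preds = np.around(probs, decimals=0)    # e.g. 0.47 -> 0.0, 0.61 -> 1.0
accuracy = np.mean(preds == y_train)    # compare with the true 0/1 labels
print(accuracy)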
As for numpy.argmax (I'm going to assume that's what you meant by "athmax", since I can't find a function with that spelling), it will give you the same label for everything because it returns the index of the largest element in an array. Since your output array has length one (it's simply a single neuron that calculates the logistic function), it will always return index zero! However, you're sort of on the right track: if your last layer were instead Dense(units=n_classes, activation='softmax'), then softmax would output a probability distribution over the labels for each row of data, and in that case numpy.argmax is the right tool.
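For completeness, here is a rough sketch of that multi-class variant; the layer sizes and random inputs are illustrative, not a drop-in replacement for your model:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
n_classes = 2        # illustrative
n_timesteps = 60     # matches the window length used in the question
clf = Sequential()
clf.add(LSTM(units=50, input_shape=(n_timesteps, 1)))
clf.add(Dense(units=n_classes, activation='softmax'))   # one probability per class
clf.compile(optimizer='adam', loss='categorical_crossentropy')
dummy = np.random.rand(3, n_timesteps, 1)   # random inputs, just to show the shapes
probs = clf.predict(dummy)                  # shape (3, n_classes), rows sum to 1
labels = np.argmax(probs, axis=1)           # argmax now picks the most probable class per row
print(probs, labels)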
Here's a Tensorflow tutorial on classification that I found super helpful when I was just learning it myself. It uses softmax instead of sigmoid like you, but I think it's fairly adaptable to your needs: https://www.tensorflow.org/tutorials/keras/basic_classification
Hope this helps!

Could I use tf.Session() in an environment where only Keras is used?

Thanks for reading my question.
I was using Keras to develop my reinforcement learning agent based on keras-rl, but I want to upgrade my agent with some ideas from the OpenAI baselines code for better action exploration. That code uses TensorFlow only, and this is my first time using TensorFlow, so I am quite confused. I build my Keras deep learning models with the Model API and have never had to worry about what goes on inside a model. The code I referenced, however, is full of code that reaches inside the deep learning model, modifies its weights, and gets intermediate layer outputs using tf.Session(). The framework is very flexible: as shown below, with tf.Session() a tensor (which is a symbolic tensor and is not callable) can produce a result when fed data through feed_dict. As far as I know, this is impossible in Keras.
If I allow tf.Session(), my architecture will become complex and nobody will want to understand or use it, even though I can adapt the reference code more easily.
On the other hand, if I don't allow it, I need to break down my existing model and use tons of K.function calls to get intermediate layer outputs or other things that I can't get from the Keras model (a small K.function sketch follows the code below).
import numpy as np
from keras.layers import Dense, Input, BatchNormalization
from keras.models import Model
import tensorflow as tf
import keras.backend as K
import rl2.tf_util as U
def normalize(x, stats):
    if stats is None:
        return x
    return (x - stats.mean) / (stats.std + 1e-8)
class RunningMeanStd(object):
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, my, epsilon=1e-2, shape=()):
        self._sum = K.variable(value=np.zeros(shape), dtype=tf.float32, name=my+"_runningsum")
        self._sumsq = K.variable(value=np.zeros(shape) + epsilon, dtype=tf.float32, name=my+"_runningsumsq")
        self._count = K.variable(value=np.zeros(()) + epsilon, dtype=tf.float32, name=my+"_count")
        self.mean = self._sum / self._count
        self.std = K.sqrt(K.maximum((self._sumsq / self._count) - K.square(self.mean), epsilon))
        newsum = K.variable(value=np.zeros(shape), dtype=tf.float32, name=my+'_sum')
        newsumsq = K.variable(value=np.zeros(shape), dtype=tf.float32, name=my+'_var')
        newcount = K.variable(value=np.zeros(()), dtype=tf.float32, name=my+'_count')
        self.incfiltparams = K.function([newsum, newsumsq, newcount], [],
                                        updates=[K.update_add(self._sum, newsum),
                                                 K.update(self._sumsq, newsumsq),
                                                 K.update(self._count, newcount)])
    def update(self, x):
        x = x.astype('float64')
        n = int(np.prod(self.shape))
        totalvec = np.zeros(n*2+1, 'float64')
        addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')])
        self.incfiltparams(totalvec[0:n].reshape(self.shape),
                           totalvec[n:2*n].reshape(self.shape),
                           totalvec[2*n])
i = Input(shape=(1,))
# h = BatchNormalization()(i)
h = Dense(4, activation='relu', kernel_initializer='he_uniform')(i)
h = Dense(10, activation='relu', kernel_initializer='he_uniform')(h)
o = Dense(1, activation='linear', kernel_initializer='he_uniform')(h)
model = Model(i, o)
obs_rms = RunningMeanStd(my='obs', shape=(1,))
normalized_obs0 = K.clip(normalize(i, obs_rms), 0, 100)
tf2 = model(normalized_obs0)
# print(model.predict(np.asarray([2,2,2,2,2]).reshape(5,)))
# print(tf(np.asarray([2,2,2,2,2]).reshape(5,)))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([tf2], feed_dict={i : U.adjust_shape(i, [np.asarray([2,]).reshape(1,)])}))
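For reference, this is the kind of K.function workaround I mean (a sketch built on the model above; layer index 1 is just an example):
# Pull an intermediate layer's output without tf.Session().
get_hidden = K.function([model.input], [model.layers[1].output])   # layers[1] = first Dense layer
hidden_out = get_hidden([np.asarray([2.0]).reshape(1, 1)])[0]
print(hidden_out)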

tf.gather_nd is really slow when used many times

I would like a loss function in tensorflow which is a complex combination of many elements. For example, this code:
import tensorflow as tf
import numpy as np
import time
input_layer = tf.placeholder(tf.float64, shape=[64,4])
output_layer = input_layer + 0.5*tf.tanh(tf.Variable(tf.random_uniform(shape=[64,4],\
minval=-1,maxval=1,dtype=tf.float64)))
# random_combination is 2-d numpy array of the form:
# [[32, 34, 23, 56],[23,54,33,21],...]
random_combination = np.random.randint(64, size=(210000000, 4))
# a collector to collect the values
collector=[]
print('start looping')
print(time.asctime(time.localtime(time.time())))
# loop through random_combination and pick the elements of output_layer
for i in range(len(random_combination)):
    [i,j,k,l] = [random_combination[i][0],random_combination[i][1],\
                 random_combination[i][2],random_combination[i][3]]
    # pick the needed element from output_layer
    f1 = tf.gather_nd(output_layer,[i,0])
    f2 = tf.gather_nd(output_layer,[i,2])
    f3 = tf.gather_nd(output_layer,[i,3])
    f4 = tf.gather_nd(output_layer,[i,4])
    tf1 = f1+1
    tf2 = f2+1
    tf3 = f3+1
    tf4 = f4+1
    collector.append(0.3*tf.abs(f1*f2*tf3*tf4-tf1*tf2*f3*f4))
print('end looping')
print(time.asctime(time.localtime(time.time())))
# loss function
loss = tf.add_n(collector)
This takes around 50 minutes on my computer.
My question is: is this the proper way to code this in tensorflow?
Or is there a more time-efficient way to index the elements?
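For context, this is roughly the kind of single-call indexing I was hoping for instead of the Python loop; the column choices are only illustrative and I am not sure it reproduces my loss exactly:
import numpy as np
import tensorflow as tf
output_layer = tf.placeholder(tf.float64, shape=[64, 4])   # stand-in for the real output_layer
rows = tf.placeholder(tf.int64, shape=[None])              # e.g. random_combination[:, 0]
def pick_column(row_idx, col):
    # Build an [N, 2] index tensor and gather all N elements with one gather_nd call.
    col_idx = tf.fill(tf.shape(row_idx), tf.constant(col, tf.int64))
    return tf.gather_nd(output_layer, tf.stack([row_idx, col_idx], axis=1))   # shape [N]
f1, f2, f3, f4 = (pick_column(rows, c) for c in range(4))
loss = tf.reduce_sum(0.3 * tf.abs(f1 * f2 * (f3 + 1) * (f4 + 1) - (f1 + 1) * (f2 + 1) * f3 * f4))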

Making simple rnn code with scan function in Tensorflow

I recently started to learn Tensorflow and am trying to write simple RNN code using the scan function.
What I'm trying to do is make the RNN predict a sine function.
It takes 1-dimensional input and also outputs 1 dimension, in batches, as follows.
import tensorflow as tf
from tensorflow.examples.tutorials import mnist
import numpy as np
import matplotlib.pyplot as plt
import os
import time
# FLAGS (options)
tf.flags.DEFINE_string("data_dir", "", "")
#tf.flags.DEFINE_boolean("read_attn", True, "enable attention for reader")
#tf.flags.DEFINE_boolean("write_attn",True, "enable attention for writer")
opt = tf.flags.FLAGS
#Parameters
time_step = 10
num_rnn_h = 16
batch_size = 2
max_epoch=10000
learning_rate=1e-3 # learning rate for optimizer
eps=1e-8 # epsilon for numerical stability
#temporary sinusoid data
x_tr = np.zeros([batch_size,time_step])
y_tr = np.zeros([batch_size,time_step])
ptrn = 0.7*np.sin(np.arange(time_step+1)/(2*np.pi))
x_tr[0] = ptrn[0:time_step]
y_tr[0] = ptrn[1:time_step+1]
x_tr[1] = ptrn[0:time_step]
y_tr[1] = ptrn[1:time_step+1]
#Build model
x = tf.placeholder(tf.float32,shape=[batch_size,time_step,1], name= 'input')
y = tf.placeholder(tf.float32,shape=[None,time_step,1], name= 'target')
cell = tf.nn.rnn_cell.BasicRNNCell(num_rnn_h)
#cell = tf.nn.rnn_cell.LSTMCell(num_h, state_is_tuple=True)
with tf.variable_scope('output'):
    W_o = tf.get_variable('W_o', shape=[num_rnn_h, 1])
    b_o = tf.get_variable('b_o', shape=[1], initializer=tf.constant_initializer(0.0))
init_state = cell.zero_state(batch_size, tf.float32)
#make graph
#rnn_outputs, final_states = tf.scan(cell, xx1, initializer= tf.zeros([num_rnn_h]))
scan_outputs = tf.scan(lambda a, xi: cell(xi, a), tf.transpose(x, perm=[1,0,2]), initializer= init_state)
rnn_outputs, rnn_states = tf.unpack(tf.transpose(scan_outputs,perm=[1,2,0,3]))
print rnn_outputs, rnn_states
with tf.variable_scope('predictions'):
    weighted_sum = tf.reshape(tf.matmul(tf.reshape(rnn_outputs, [-1, num_rnn_h]), W_o), [batch_size, time_step, 1])
    predictions = tf.add(weighted_sum, b_o, name='predictions')
with tf.variable_scope('loss'):
    loss = tf.reduce_mean((y - predictions) ** 2, name='loss')
train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
But it gives an error at the last line (the optimizer), like:
ValueError: Shapes (2, 16) and (2, 2, 16) are not compatible
If someone knows the reason, please tell me how to fix it.
I assume your error is not on the last line (the optimizer) but rather in some operation you are doing earlier, perhaps in the reduce_mean with that y - predictions. I will not go over your code in detail, but I will tell you that this error comes up when you do an operation between two tensors which require the same shape (usually math operations).