tf.nn.softmax behaving strangely - tensorflow

I am learning LSTM with tensorflow with enable_eager_execution. However when implementing LSTM, I have noticed the behaviour of tf.nn.softmax
that has made me stuck. Here is a section of my code
# Word-level LSTM written against eager TensorFlow: holds the gate/output
# parameters, a forward step, a text sampler and the start of a training loop.
# NOTE(review): the original indentation was lost in this paste, so the nesting
# described in the comments below is inferred from the statements themselves.
class RNN_LSTM(object):
# Reads the corpus, builds the vocabulary and allocates the parameters.
# hidden_size: number of rows of every gate weight matrix / hidden state.
def __init__(self,hidden_size):
# NOTE(review): the file handle returned by open() is never closed.
data=open('Shakespear.txt', 'r').read()
# Tokens are whitespace-separated words, not single characters.
self.data = data.split()
vocab_size=len(list(set(self.data)))
# NOTE(review): set(self.data) is built a second time here.
self.words =list(set(self.data))
self.hidden_size=hidden_size
# Gate inputs are the previous hidden state concatenated with a one-hot word.
self.input_size=vocab_size+hidden_size
self.vocab_size=vocab_size
# W1..W4 / b1..b4 feed the ft, it, gt and ot gates (see feedforward);
# W5 / b5 map the hidden state to vocabulary scores.
self.W1=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W1")*0.1)
# NOTE(review): b1 is the only parameter not scaled by 0.1 - confirm intended.
self.b1=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b1"))
self.W2=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W2")*0.1)
self.b2=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b2")*0.1)
self.W3=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W3")*0.1)
self.b3=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b3")*0.1)
self.W4=tf.Variable(tf.random.uniform((hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W4")*0.1)
self.b4=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b4")*0.1)
self.W5=tf.Variable(tf.random.uniform((self.vocab_size,self.hidden_size),dtype=tf.dtypes.float32,name="W5")*0.1)
self.b5=tf.Variable(tf.random.uniform((self.vocab_size,1),dtype=tf.dtypes.float32,name="b5")*0.1)
self.learning_rate=1e-1
self.sequence_length=50
#self.M_c=tf.Variable(tf.zeros((self.input_size,1)),name="M_c")
# Returns the column vector [hprev; one_hot(x)] used as input to every gate.
def one_hot_encoding(self,x,hprev):
# NOTE(review): this Variable is discarded; M_c is rebound by tf.concat below.
M_c=tf.Variable(tf.zeros((self.input_size,1)),name="M_c")
vocab=tf.Variable(tf.zeros((self.vocab_size,1)))
#hprev=tf.Variable(tf.zeros((self.hidden_size,1)))
# Converted to a NumPy array so that a single entry can be set by index.
vocab=vocab.numpy()
vocab[x]=1
M_c=tf.concat((hprev,vocab),axis=0)
return M_c
# One LSTM step. M_c is the concatenated input, p_s the previous cell state.
# Returns (hidden state, softmax output, new cell state).
def feedforward(self,M_c,p_s):
ft=tf.sigmoid( tf.matmul(self.W1,M_c)+self.b1)
it=tf.sigmoid(tf.matmul(self.W2,M_c)+self.b2)
gt=tf.math.tanh(tf.matmul(self.W3,M_c)+self.b3)
cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
ot=tf.nn.sigmoid(tf.matmul(self.W4,M_c)+self.b4)
ht=tf.multiply(ot,tf.math.tanh(cs))
# The scores have shape (vocab_size, 1), so a softmax over the last axis would
# normalise single elements; self.softmax normalises over all entries.
output=self.softmax(tf.matmul(self.W5,ht)+self.b5)
return ht,output,cs
# Samples n word indices, starting from word index `begin`, hidden state
# `hprev` and cell state `p_s`. Returns a list of scalar index tensors.
def sample_text(self,hprev,begin,p_s,n):
# NOTE(review): the second positional argument of tf.Variable is `trainable`,
# not the dtype - confirm against the TF version in use.
vocab=tf.Variable(tf.zeros((self.vocab_size,1)),tf.float32)
vocab=vocab.numpy()
vocab[begin]=1
letters=[]
for i in range(n):
M=tf.Variable(tf.zeros((self.input_size,1)),name="M")
# NOTE(review): tf.assign is a TF1-style API - confirm it exists in the
# eager setup being used.
M=tf.assign(M,tf.concat((hprev,vocab),axis=0))
ft=tf.nn.sigmoid(tf.matmul(self.W1,M)+self.b1)
it=tf.nn.sigmoid(tf.matmul(self.W2,M)+self.b2)
gt=tf.math.tanh(tf.matmul(self.W3,M)+self.b3)
cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
p_s=cs
ot=tf.sigmoid(tf.matmul(self.W4,M)+self.b4)
ht=tf.multiply(ot,tf.math.tanh(cs))
ht=tf.reshape(ht,(self.hidden_size,1))
output=tf.matmul(self.W5,ht)+self.b5
p=self.softmax(output)
#print(p.numpy())
p=tf.reshape(p,(1,self.vocab_size))
# NOTE(review): tf.random.categorical is documented as taking logits
# (unnormalised log-probabilities); p holds probabilities here - confirm.
samples = tf.random.categorical(p,1)
sample_selected=tf.cast(samples[0][0].numpy(),tf.int32)
# Looks the sampled index up in the identity list [0, ..., vocab_size-1].
selection_sample_np=[i for i in range(self.vocab_size)]
selection_sample_tf=tf.convert_to_tensor(selection_sample_np)
selected_next_letter=selection_sample_tf[sample_selected]
trial=tf.cast(selected_next_letter,tf.int32)
# NOTE(review): k receives the one-hot of the sampled word but is never read,
# and `vocab` is not updated inside the loop, so every step is fed the
# one-hot of `begin` - confirm whether vocab should be replaced by k.
k=tf.Variable(tf.zeros((self.vocab_size,1)),tf.int32)
k[selected_next_letter,0].assign(1)
letters.append(selected_next_letter)
hprev=ht
return letters
# Builds the word->index and index->word dictionaries from self.words.
def process_input(self):
char_to_ix={ch:ix for ix,ch in enumerate(self.words)}
ix_to_char={ix:ch for ix,ch in enumerate(self.words)}
return char_to_ix,ix_to_char
# Softmax over all entries of z, shifted by max(z) before exponentiating.
# NOTE(review): max is the Python builtin, which iterates over the first axis
# of the tensor - confirm it is what is wanted rather than a tensor reduction.
def softmax(self,z):
return tf.math.exp(z-max(z))/tf.math.reduce_sum(tf.math.exp(z-max(z)))
# Training driver: samples text, runs a forward pass over one sequence under a
# GradientTape and takes gradients of the loss w.r.t. all ten parameters.
def AggregatorNew(self):
losses,iterations=[],[]
char_to_ix,ix_to_char=self.process_input()
# mem1..mem10: zero buffers with the shapes of W1..W5 and b1..b5.
# NOTE(review): they are not read anywhere in the visible code.
mem1=tf.Variable(tf.zeros_like(self.W1))
mem2=tf.Variable(tf.zeros_like(self.W2))
mem3=tf.Variable(tf.zeros_like(self.W3))
mem4=tf.Variable(tf.zeros_like(self.W4))
mem5=tf.Variable(tf.zeros_like(self.W5))
mem6=tf.Variable(tf.zeros_like(self.b1))
mem7=tf.Variable(tf.zeros_like(self.b2))
mem8=tf.Variable(tf.zeros_like(self.b3))
mem9=tf.Variable(tf.zeros_like(self.b4))
mem10=tf.Variable(tf.zeros_like(self.b5))
# Gradient placeholders; all of them are rebound by g.gradient further down.
dW1=tf.Variable(tf.zeros_like(self.W1))
dW2=tf.Variable(tf.zeros_like(self.W2))
dW3=tf.Variable(tf.zeros_like(self.W3))
dW4=tf.Variable(tf.zeros_like(self.W4))
# NOTE(review): shaped like W4 rather than W5 - looks like a copy/paste slip.
dW5=tf.Variable(tf.zeros_like(self.W4))
db1=tf.Variable(tf.zeros_like(self.b1))
db2=tf.Variable(tf.zeros_like(self.b2))
db3=tf.Variable(tf.zeros_like(self.b3))
db4=tf.Variable(tf.zeros_like(self.b4))
db5=tf.Variable(tf.zeros_like(self.b5))
# n and p are tested in the reset condition below.
n=0
p=0
self.loss=tf.Variable(0,dtype=tf.dtypes.float32,name="loss")
smooth_loss =-tf.math.log(1.0/self.vocab_size)*self.sequence_length
# NOTE(review): no statement in the visible loop advances p or n, or applies
# the gradients to the parameters - this is only a section of the original.
while(1):
try:
# NOTE(review): DelayedKeyboardInterrupt is not defined in the visible code.
with DelayedKeyboardInterrupt():
# Reset the recurrent state on the first pass or when the data is exhausted.
if p+self.sequence_length+1>= len(self.data) or n == 0:
hprev=tf.Variable(np.zeros((self.hidden_size,1)),dtype=tf.float32,name="hprev")
p_s=tf.Variable(tf.zeros((self.hidden_size,1)),name="p_s")
p=0
# Targets are the inputs shifted one word to the right.
inputs=[char_to_ix[ch] for ch in self.data[p:p+self.sequence_length]]
targets=[char_to_ix[ch] for ch in self.data[p+1:p+self.sequence_length+1]]
sample_ix = self.sample_text(hprev,inputs[0],p_s,200)
list_of_strings=[ix_to_char[ix.numpy()] for ix in sample_ix]
list_of_strings_tf=tf.convert_to_tensor(list_of_strings)
txt = tf.strings.join(list_of_strings_tf,separator=" ")
print ('----\n %s \n----' % (txt.numpy(), ))
#loss=tf.reduce_mean(xentropy,name="loss")
with tf.GradientTape() as g:
for x, y in zip(inputs,targets):
M_c=self.one_hot_encoding(x,hprev)
hprev,output,p_s=self.feedforward(M_c,p_s)
activation=output[y]
# NOTE(review): loss is rebound on every step rather than accumulated, so
# only the last word's loss reaches g.gradient - confirm intended.
loss=-(tf.math.log(activation))
dW1,dW2,dW3,dW4,dW5,db1,db2,db3,db4,db5=g.gradient(loss,[self.W1,self.W2,self.W3,self.W4,self.W5,self.b1,self.b2,self.b3,self.b4,self.b5])
smooth_loss = smooth_loss * 0.999 + loss * 0.001
except KeyboardInterrupt:
sample_ix = self.sample_text(hprev,inputs[0],p_s,200)
# NOTE(review): ix_to_char is indexed with tensors here, whereas the sampling
# code above uses ix.numpy() - confirm the lookup works.
txt = ''.join(ix_to_char[ix] for ix in sample_ix)
print ('----\n %s \n----' % (txt, ))
break
when I use self.softmax() it gives me probability values in the output in the feedforward, however when I use tf.nn.softmax() all values of output are strangely 1.
Second question: Is TensorFlow generally slower on CPU than a pure Python implementation, or am I using TensorFlow wrongly?

If you call tf.nn.softmax() without specifying the axis, it defaults to axis=-1, i.e. the last axis. The logits here have shape (vocab_size, 1), so the softmax is taken over a single element per row, giving an output tensor where all values are 1. In my case I was getting wrong values just because I did not provide the axis; the call should be tf.nn.softmax(logits, axis=0).

Related

Keras custom loss-function

I would like to implement the following custom loss function, with argument x as the output of the last layer. Until now I implemented function this as Lambda layer, coupled with the keras mae loss, but I do not want that anymore
# Builds a Keras loss from the symbolic model output x.
# tmp is computed once, here, and handed to the inner loss only as the default
# value of y_pred.
def GMM_UNC2(self, x):
tmp = self.create_mr(x) # get mr series
mr = k.sum(tmp, axis=1) # sum over time
tmp = k.square((1/self.T_i) * mr)
tmp = k.dot(tmp, k.transpose(self.T_i))
tmp = (1/(self.T * self.N)) * tmp
f = self.create_factor(x) # get factor
std = k.std(f)
mu = k.mean(f)
tmp = tmp + std/mu
# NOTE(review): the default is only used when the caller omits the second
# argument; if the framework always supplies y_pred, tmp is never used.
def loss(y_true, y_pred=tmp):
return k.abs(y_true-y_pred)
return loss
# Dummy target of shape (1, 1) passed to fit() below.
self.y_true = np.zeros((1,1))
# NOTE(review): `w` (the model output tensor) is defined outside this snippet.
self.sdf_net = Model(inputs=[self.in_ma, self.in_mi, self.in_re, self.in_si], outputs=w)
# The loss closure is built once here, from the symbolic output w.
self.sdf_net.compile(optimizer=self.optimizer, loss=self.GMM_UNC2(w))
self.sdf_net.fit([self.macro, self.micro, self.R, self.R_sign], self.y_true, epochs=epochs, verbose=1)
The code actually runs but it doesn't actually use tmp as input to loss (I multiplied it with some number, but the loss stays the same)
What am I doing wrong?
It is not completely clear from your question if you want to apply GMM_UNC2 function to the predictions, or it is applied only once to build the loss. If it is the first option, then all that code should be inside the loss and apply it over y_pred, like
def GMM_UNC2(self):
    """Return a Keras loss that evaluates the GMM statistic on the predictions.

    The returned ``loss(y_true, y_pred)`` rebuilds the statistic from
    ``y_pred`` on every call, so it follows the model output during training,
    and returns ``|y_true - statistic|``.
    """
    def loss(y_true, y_pred):
        tmp = self.create_mr(y_pred)  # get mr series
        mr = k.sum(tmp, axis=1)  # sum over time
        tmp = k.square((1 / self.T_i) * mr)
        tmp = k.dot(tmp, k.transpose(self.T_i))
        tmp = (1 / (self.T * self.N)) * tmp
        # The factor must also come from y_pred: the outer function no longer
        # has an `x` parameter, so referring to it raised a NameError.
        f = self.create_factor(y_pred)  # get factor
        std = k.std(f)
        mu = k.mean(f)
        tmp = tmp + std / mu
        # Compare the target with the derived statistic; comparing it with the
        # raw y_pred would throw away everything computed above.
        return k.abs(y_true - tmp)
    return loss
If it is the second option: in general, passing objects as default values in a Python function definition is not a good idea, because a default value is evaluated only once, when the function is defined, and is then shared by every call. Also, you are assuming that the second argument of the loss is passed by the name y_pred, but when the loss is called it is passed without a name, as a positional argument, so the default is never used. In summary, you could try using an explicit comparison inside the loss, like
def loss(y_true, y_pred):
    """Absolute error against y_pred, or against tmp when y_pred is None."""
    prediction = tmp if y_pred is None else y_pred
    return k.abs(y_true - prediction)
If what you like is ignoring the predictions, and forcibly using tmp, then you can ignore the y_pred argument of the loss and only use tmp, like
def loss(y_true, _):
    """Absolute error between y_true and tmp; the second argument is ignored."""
    difference = y_true - tmp
    return k.abs(difference)

how to calculate entropy on float numbers over a tensor in python keras

I have been struggling on this and could not get it to work. hope someone can help me with this.
I want to calculate the entropy on each row of the tensor. Because my data are float numbers not integers I think I need to use bin_histogram.
For example a sample of my data is tensor =[[0.2, -0.1, 1],[2.09,-1.4,0.9]]
Just for information My model is seq2seq and written in keras with tensorflow backend.
This is my code so far: I need to correct rev_entropy
# Custom Keras layer that, in the training phase, replaces its input with
# rev_entropy(x, beta, batch) and otherwise passes x through unchanged.
# NOTE(review): the original indentation was lost in this paste.
class entropy_measure(Layer):
# beta and batch are stored as given and forwarded to rev_entropy.
def __init__(self, beta,batch, **kwargs):
self.beta = beta
self.batch = batch
self.uses_learning_phase = True
self.supports_masking = True
super(entropy_measure, self).__init__(**kwargs)
def call(self, x):
return K.in_train_phase(self.rev_entropy(x, self.beta,self.batch), x)
# NOTE(review): only beta is added to the config; batch is not.
def get_config(self):
config = {'beta': self.beta}
base_config = super(entropy_measure, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
# Intended to scale x by beta / (1 + entropy of each row); this is the part
# the question asks to have corrected.
def rev_entropy(self, x, beta,batch):
# NOTE(review): iterates over x with a Python for loop and wraps each item in
# a pandas Series - confirm this is possible for a symbolic tensor.
for i in x:
i = pd.Series(i)
p_data = i.value_counts() # counts occurrence of each value
# NOTE(review): assigning to `entropy` makes it a local name in this
# function, so the call on the right-hand side cannot reach an imported
# entropy() function.
entropy = entropy(p_data) # get entropy from counts
rev = 1/(1+entropy)
# NOTE(review): this return precedes the two statements below, so they look
# unreachable (or the return sits inside the loop) - nesting unclear.
return rev
new_f_w_t = x * (rev.reshape(rev.shape[0], 1))*beta
return new_f_w_t
Any input is much appreciated:)
It looks like you have a series of questions that come together on this issue. I'll settle it here.
You calculate entropy in the following form of scipy.stats.entropy according to your code:
scipy.stats.entropy(pk, qk=None, base=None)
Calculate the entropy of a distribution for given probability values.
If only probabilities pk are given, the entropy is calculated as S =
-sum(pk * log(pk), axis=0).
Tensorflow does not provide a direct API to calculate entropy on each row of the tensor. What we need to do is to implement the above formula.
import tensorflow as tf
import pandas as pd
from scipy.stats import entropy
# Compare SciPy's entropy of the value counts with a TensorFlow re-implementation.
values = [1.1, 2.2, 3.3, 4.4, 2.2, 3.3]
scipy_entropy = entropy(pd.Series(values).value_counts())

# Occurrence count of each distinct value: [1 2 2 1]
_, _, counts = tf.unique_with_counts(tf.constant(values))
# Relative frequencies: [0.16666667 0.33333333 0.33333333 0.16666667]
total = tf.reduce_sum(counts)
probabilities = counts / total
tf_entropy = -tf.reduce_sum(probabilities * tf.log(probabilities))

with tf.Session() as sess:
    tf_value = sess.run(tf_entropy)
    print('scipy version: \n', scipy_entropy)
    print('tensorflow version: \n', tf_value)
scipy version:
1.329661348854758
tensorflow version:
1.3296613488547582
Then we need to define a function and achieve for loop through tf.map_fn in your custom layer according to above code.
def rev_entropy(self, x, beta, batch):
    """Scale every row of ``x`` by ``beta / (1 + H)``.

    ``H`` is the entropy of the row after its values have been bucketed into
    ``nbins`` fixed-width histogram bins over ``value_ranges``.

    Args:
        x: float tensor; assumed to be rank 2 (rows x features), because
            ``tf.unique_with_counts`` only accepts 1-D rows - TODO confirm.
        beta: scalar multiplier applied to the result.
        batch: unused; kept so existing callers keep working.

    Returns:
        A tensor with the same shape as ``x``.
    """
    def row_entropy(row):
        _, _, count = tf.unique_with_counts(row)
        # Counts are integers; cast so the result matches the float32 dtype
        # declared for tf.map_fn below instead of coming out as float64.
        count = tf.cast(count, tf.float32)
        prob = count / tf.reduce_sum(count)
        return -tf.reduce_sum(prob * tf.log(prob))

    value_ranges = [-10.0, 100.0]
    nbins = 50
    bin_ids = tf.histogram_fixed_width_bins(x, value_ranges, nbins)
    rev = tf.map_fn(row_entropy, bin_ids, dtype=tf.float32)
    # rev holds one value per row; add a trailing axis so it broadcasts across
    # the columns of x instead of being aligned with the last axis.
    rev = tf.expand_dims(rev, -1)
    new_f_w_t = x * 1 / (1 + rev) * beta
    return new_f_w_t
Note that this hidden layer will not produce a gradient, so nothing can be propagated backwards through it, since the entropy is calculated from statistical (count-based) probability values. Maybe you need to rethink your hidden layer structure.

`scipy.optimize` functions hang even with `maxiter=0`

I am trying to train the MNIST data (which I downloaded from Kaggle) with simple multi-class logistic regression, but the scipy.optimize functions hang.
Here's the code:
import csv
from math import exp
from numpy import *
from scipy.optimize import fmin, fmin_cg, fmin_powell, fmin_bfgs
# Prepare the data
def getIiter(ifname):
    """
    Iterate over the data rows of the csv file named ifname.

    The first row (the header) is skipped; a completely empty file simply
    yields nothing. Implemented as a generator so the file is closed as soon
    as the rows are exhausted, instead of staying open for the lifetime of
    the process. The file is opened on the first iteration step.
    """
    # newline='' is what the csv module asks for when it is given a file object.
    with open(ifname, 'r', newline='') as ifile:
        iiter = csv.reader(ifile)
        next(iiter, None)  # drop the header without failing on an empty file
        for row in iiter:
            yield row
def parseRow(s):
    """Convert a row of numeric strings into (label, remaining values)."""
    numbers = list(map(int, s))
    return numbers[0], numbers[1:]
def getAllRows(ifname):
    """Read every data row of ifname; return (feature rows, labels)."""
    features, labels = [], []
    for record in getIiter(ifname):
        label, values = parseRow(record)
        labels.append(label)
        features.append(values)
    return features, labels
def cutData(x, y):
    """
    Split x and y at the same index into a training part (the first 70%,
    rounded down) and a testing part (the rest).

    Returns [(train_x, train_y), (test_x, test_y)].
    """
    split = int(len(x) * .7)
    training = (x[:split], y[:split])
    testing = (x[split:], y[split:])
    return [training, testing]
def num2IndMat(l):
    """
    Turn a sequence of digit labels into a 0/1 indicator matrix.

    Row r has a 1 in column l[r] and 0 elsewhere; there are always 10 columns
    (digits 0-9). A single broadcast comparison replaces the ten
    vectorize(int) passes, which called Python once per element and raised
    on an empty label list.
    """
    labels = array(l)
    return (labels.reshape(-1, 1) == arange(10)).astype(int)
def readData(ifname):
    """Load ifname; return (features with a leading 1 per row, label indicator matrix)."""
    rows, labels = getAllRows(ifname)
    with_bias = []
    for row in rows:
        # Prepend the constant 1 to each feature row.
        with_bias.append([1] + row)
    return array(with_bias), num2IndMat(labels)
#Calculate the cost function
def sigmoid(x):
    """Logistic function 1 / (1 + e**-x); works on scalars and on whole arrays."""
    # `exp` is NumPy's here: `from numpy import *` comes after
    # `from math import exp` and shadows it, so arrays are handled natively.
    return 1 / (1 + exp(-x))


# Kept under their old names for existing callers. numpy.vectorize would only
# wrap these already-vectorised functions in a per-element Python loop, which
# dominates the run time of the cost function on the full training set.
vSigmoid = sigmoid
vLog = log
def costFunction(theta, x, y):
    """Sum of the element-wise logistic cost, each term divided by m and by N.

    m and N are read from module scope.
    """
    hypothesis = vSigmoid(dot(x, theta))
    log_hit = vLog(hypothesis)
    log_miss = vLog(1 - hypothesis)
    scaled = (- y * log_hit - (1 - y) * log_miss) / m / N
    return sum(scaled)
def unflatten(flatTheta):
    """Cut flatTheta into n + 1 consecutive slices of length N (module-level n, N)."""
    pieces = []
    for index in range(n + 1):
        start = index * N
        pieces.append(flatTheta[start : start + N])
    return pieces
def costFunctionFlatTheta(flatTheta):
    """Cost on the training set for a flat parameter vector (split with unflatten)."""
    theta = unflatten(flatTheta)
    return costFunction(theta, trainX, trainY)
def costFunctionFlatTheta1(flatTheta):
    """Cost on the training set for a flat parameter vector (reshape variant).

    The vector is reshaped to (n + 1, N), the shape initTheta is created
    with, rather than to the hard-coded (785, 10), so the function keeps
    working when the number of features or classes changes.
    """
    theta = flatTheta.reshape(n + 1, N)
    return costFunction(theta, trainX, trainY)
# Load the data set and split it 70/30 into training and testing parts.
x, y = readData('train.csv')
[(trainX, trainY), (testX, testY)] = cutData(x, y)
# Module-level sizes read by costFunction and unflatten:
# m = number of training rows, n = row length minus the leading 1,
# N = number of columns of the label indicator matrix.
m = len(trainX)
n = len(trainX[0]) - 1
N = len(trainY[0])
initTheta = zeros(((n + 1), N))
# Two flattened copies of the initial parameters: a 1-D vector, and a
# (1, (n + 1) * N) row produced by reshape.
flatInitTheta = ndarray.flatten(initTheta)
flatInitTheta1 = initTheta.reshape(1, -1)
In the last two lines we flatten initTheta because the fmin{,_cg,_bfgs,_powell} functions seem to only take vectors as the initial value argument x0. I also flatten initTheta using reshape in hope this answer can be of help.
There is no problem computing the cost function which takes up less than 2 seconds on my computer:
print(costFunctionFlatTheta(flatInitTheta), costFunctionFlatTheta1(flatInitTheta1))
# 0.69314718056 0.69314718056
But all the fmin functions hang, even if I set maxiter=0.
e.g.
newFlatTheta = fmin(costFunctionFlatTheta, flatInitTheta, maxiter=0)
or
newFlatTheta1 = fmin(costFunctionFlatTheta1, flatInitTheta1, maxiter=0)
When I interrupt the program, it seems to me it all hangs at lines in optimize.py calling the cost functions, lines like this:
return function(*(wrapper_args + args))
For example, if I use fmin_cg, this would be line 292 in optimize.py (Version 0.5).
How do I solve this problem?
OK I found a way to stop fmin_cg from hanging.
Basically I just need to write a function that computes the gradient of the cost function, and pass it to the fprime parameter of fmin_cg.
def gradient(theta, x, y):
    """Gradient of the cost w.r.t. theta: x.T . (sigmoid(x . theta) - y), divided by m and N."""
    residual = vSigmoid(dot(x, theta)) - y
    return dot(x.T, residual) / m / N
def gradientFlatTheta(flatTheta):
    """Flattened training-set gradient for a flat parameter vector.

    Suitable as the ``fprime`` argument of the scipy.optimize fmin_* routines.
    The vector is reshaped to (n + 1, N), the shape initTheta is created
    with, rather than to the hard-coded (785, 10).
    """
    theta = flatTheta.reshape(n + 1, N)
    return ndarray.flatten(gradient(theta, trainX, trainY))
Then
newFlatTheta = fmin_cg(costFunctionFlatTheta, flatInitTheta, fprime=gradientFlatTheta, maxiter=0)
terminates within seconds, and setting maxiter to a higher number (say 100) one can train the model within reasonable amount of time.
The documentation of fmin_cg says the gradient would be numerically computed if no fprime is given, which is what I suspect caused the hanging.
Thanks to this notebook by zgo2016#Kaggle which helped me find the solution.

Tensorflow: Random selection of masks

I know that this stackoverflow thread already gives some nice examples about conditionals in tensorflow, but I'm still struggling how to solve my issue of randomly selecting among several different masks in tensorflow.
Right now I can only select between two mask tensors a and b:
# Draw a scalar uniformly from [0, 2) and use it to pick one of two masks.
rand_num = tf.random_uniform([], minval=0, maxval=2.0, dtype=tf.float32, seed=None)


def if_true():
    # Branch taken when rand_num < 1.0
    return b


def if_false():
    # Branch taken otherwise
    return a


below_one = tf.less(rand_num, tf.constant(1.0))
mask_sel = tf.cond(below_one, if_true, if_false)
(I still find it weird that one needs to define these two helper functions, but not using them weirdly throws an error.)
Now the question: Lets say I have 4 mask tensors (a,b,c,d) or more to randomly select, what would be the best way to do that in tensorflow?
In python that would be
# Plain-Python reference: pick one of four masks with equal probability.
rand_num = np.random.uniform(low=0, high=4.0)
if rand_num < 1.0:
    mask_sel = a
elif rand_num < 2.0:
    mask_sel = b
elif rand_num < 3.0:
    mask_sel = c
else:  # the colon was missing here, which is a SyntaxError
    mask_sel = d
About the helper functions, they are useful because they allow tensorflow to know which operations will run under each condition, this way it can optimize by running only the selected branch and ignoring the other. Operations outside the helper functions but used by any of them will always be run before tf.cond runs.
The other option is to use tf.select; you won't need the helper functions here, but both sides are always evaluated before tf.select runs, which can be inefficient if you only need one of them.
Now for the main problem, 'selecting from more than 2 tensors', you have multiple options:
1- Recursively nesting tf.cond operations:
def select_from_list(selector, tensor_list):
    """Pick tensor_list[floor(selector)] by recursively halving the list with tf.cond.

    Raises ValueError when tensor_list is empty.
    """
    count = len(tensor_list)
    if count == 0:
        raise ValueError('List is empty')
    if count == 1:
        return tensor_list[0]

    half = count // 2
    lower, upper = tensor_list[:half], tensor_list[half:]

    def from_lower():
        return select_from_list(selector, lower)

    def from_upper():
        # Shift the selector so it indexes into the upper half.
        return select_from_list(selector - half, upper)

    return tf.cond(tf.less(selector, float(half)), from_lower, from_upper)
2- Using tf.case:
def select_from_list(selector, tensor_list):
    """Pick an element of tensor_list with tf.case.

    One `selector < i + 1` predicate is built per element; the last element
    is the default. Raises ValueError when tensor_list is empty.
    """
    count = len(tensor_list)
    if count == 0:
        raise ValueError('List is empty')
    if count == 1:
        return tensor_list[0]

    def constant_fn(tensor):
        # Bind the tensor now so each branch returns its own element.
        return lambda: tensor

    pred_fn_pairs = []
    for index, tensor in enumerate(tensor_list):
        predicate = tf.less(selector, float(index + 1))
        pred_fn_pairs.append((predicate, constant_fn(tensor)))
    return tf.case(pred_fn_pairs, default=lambda: tensor_list[-1])
You can test any of them using:
def test(selector, value_list, sess):
    """Evaluate select_from_list on constants built from value_list in session sess."""
    constants = [tf.constant(value) for value in value_list]
    chosen = select_from_list(float(selector), constants)
    return chosen.eval(session = sess)
sess = tf.Session()
test(3.5, [4,2,6,7,5], sess)
This should return 7

How to use maxout activation function in tensorflow?

I want to use maxout activation function in tensorflow, but I don't know which function should use.
I sent a pull request for maxout, here is the link:
https://github.com/tensorflow/tensorflow/pull/5528
Code is as follows:
def maxout(inputs, num_units, axis=None):
    """Maxout activation: max over groups of channels.

    The ``axis`` dimension (the last one by default) is split into
    ``num_units`` groups and the maximum of each group is returned.

    Args:
        inputs: tensor exposing ``get_shape().as_list()``.
        num_units: number of output units; must divide the channel count.
        axis: channel axis; ``None`` means the last dimension.

    Returns:
        The reduced tensor, with ``num_units`` entries per group.

    Raises:
        ValueError: if the channel count is unknown or is not a multiple of
            ``num_units``.
    """
    shape = inputs.get_shape().as_list()
    if axis is None:
        # Assume that channel is the last dimension
        axis = -1
    num_channels = shape[axis]
    if num_channels is None:
        raise ValueError('number of features along axis {} must be statically known'
                         .format(axis))
    if num_channels % num_units:
        raise ValueError('number of features({}) is not a multiple of num_units({})'
                         .format(num_channels, num_units))
    shape[axis] = num_units
    shape += [num_channels // num_units]
    # NOTE(review): the group dimension is appended last, so the layout is only
    # obviously right when `axis` is the last dimension - confirm for other axes.
    if any(dim is None for dim in shape):
        # tf.reshape cannot take None entries (e.g. an unknown batch size);
        # read those sizes from the runtime shape instead.
        dynamic_shape = tf.shape(inputs)
        shape = [dynamic_shape[i] if dim is None else dim
                 for i, dim in enumerate(shape)]
    # keep_dims=False was the default; the keyword is dropped because it was
    # renamed to keepdims in later TensorFlow releases.
    outputs = tf.reduce_max(tf.reshape(inputs, shape), -1)
    return outputs
Here is how it works:
I don't think there is a maxout activation but there is nothing stopping yourself from making it yourself. You could do something like the following.
# Maxout built by hand: n_maxouts independent linear maps of the same input,
# reduced with an element-wise maximum.
with tf.variable_scope('maxout'):
    layer_input = ...
    layer_output = None
    for i in range(n_maxouts):
        # The weight name must use the loop index i (it was the undefined `d`).
        W = tf.get_variable('W_%d' % i, (n_input, n_output))
        b = tf.get_variable('b_%d' % i, (n_output,))
        y = tf.matmul(layer_input, W) + b
        if layer_output is None:
            layer_output = y
        else:
            layer_output = tf.maximum(layer_output, y)
Note that this is code I just wrote in my browser so there may be syntax errors but you should get the general idea. You simply perform a number of linear transforms and take the maximum across all the transforms.
How about this code?
This seems to work in my test.
def max_out(input_tensor,output_size):
    """Maxout over axis 1: split it into output_size pieces and take each piece's max.

    Raises ValueError when the size of axis 1 is not a multiple of output_size.
    """
    dims = input_tensor.get_shape().as_list()
    if dims[1] % output_size != 0:
        raise ValueError("Output size or input tensor size is not fine. Please check it. Reminder need be zero.")
    pieces = tf.split(input_tensor,output_size,1)
    piece_max = tf.reduce_max(pieces,axis=2)
    return tf.transpose(piece_max)
I refer the diagram in the following page.
From version 1.4 on you can use tf.contrib.layers.maxout.
Maxout is a layer such that it calculates N*M output for a N*1 input, and then it returns the maximum value across the column, i.e., the final output has shape N*1 as well. Basically it uses multiple linear fittings to mimic a complex function.