tensorflow distribution creates probability greater than 1 - tensorflow

I am using the TensorFlow distributions API for sampling. Following is the sample code I am using, but I found that the probability is greater than 1, and then the log probability is smaller than 0. I have tried both CPU and GPU, and both produce this weird result. The TensorFlow version is 1.3.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale
from matplotlib import pyplot as plt
import numpy as np
learning_rate = 0.01
total_features, total_prices = load_boston(True)
# Keep 300 samples for training
train_features = scale(total_features[:300])
train_prices = total_prices[:300]
x = tf.placeholder(tf.float32, [None, 13])
l1 = tf.layers.dense(inputs=x, units=20, activation=tf.nn.elu)
l2 = tf.layers.dense(inputs=l1, units=20, activation=tf.nn.elu)
# The network predicts the mean and standard deviation of a Normal distribution
mu = tf.squeeze(tf.layers.dense(inputs=l2, units=1))
sigma = tf.squeeze(tf.layers.dense(inputs=l2, units=1))
sigma = tf.nn.softplus(sigma) + 1e-5
normal_dist = tf.contrib.distributions.Normal(mu, sigma)
samples = tf.squeeze(normal_dist._sample_n(1))
log_prob = -normal_dist.log_prob(samples)  # negative log-density of the sampled values
prob = normal_dist.prob(samples)           # density (PDF) evaluated at the sampled values
sess = tf.Session()
sess.run(tf.global_variables_initializer())
avg_cost = 0.0
feed_dict = {x: train_features}
p = sess.run(prob, feed_dict)
lp = sess.run(log_prob, feed_dict)
Here p is my probability output and lp is the log probability.
Thank you!

The functions .prob and .log_prob are the PDF and log-PDF of the normal distribution: https://en.wikipedia.org/wiki/Probability_density_function. Note that the PDF doesn't have to evaluate to a value between 0 and 1; its integral over a range (which is related to the CDF) has to be between 0 and 1.
Consider the case where mu = 0 and sigma = 1e-4. If we use the PDF of the normal distribution: https://en.wikipedia.org/wiki/Normal_distribution, then PDF(0) ~= 4000! However, if we were to integrate the PDF to get the CDF (or use the CDF directly), we would always get a value between 0 and 1.
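To see this numerically, here is a quick sketch (using scipy.stats rather than the TensorFlow graph above) of how a small sigma drives the density well above 1 while the CDF stays bounded:
import numpy as np
from scipy.stats import norm

mu, sigma = 0.0, 1e-4
# Density at the mean: 1 / (sigma * sqrt(2 * pi)) ~= 3989.4, far above 1
print(norm.pdf(0.0, loc=mu, scale=sigma))
# Its log is therefore positive
print(norm.logpdf(0.0, loc=mu, scale=sigma))   # ~= 8.29
# The CDF (the integral of the density) always stays within [0, 1]
print(norm.cdf(0.0, loc=mu, scale=sigma))      # 0.5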

Related

Why am I obtaining high MAE values when constructing my deep neural network?

I am trying to do hyperparameter optimization for a regression problem using a deep neural network.
I am getting somewhat high MAE values (5-7), whereas the literature reports around 0.9-5 using the same dataset to train the NN.
Any idea of what I could improve? I can provide the dataset if needed, but it's very large.
X is composed of 865432 rows (features) and 134 columns (samples), where the rows are ordered by decreasing Pearson correlation with the variable Y, which is the age of each sample (a float).
Each entry in X is a number between 0 and 1.
Since there are too many features for the number of samples, I decided to keep only the 40000 most important features (also because I don't know how to include feature selection within the training of the model; see the Pipeline sketch after the code below).
Here is the loss vs. epochs plot, and my code:
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
import keras
import keras.losses
from keras import backend as K
import keras_tuner as kt
from keras_tuner.tuners import RandomSearch
from sklearn.model_selection import cross_val_score

def dynamic_model(hp):
    # Hyperparameters that are independent of the number of layers
    # Number of hidden layers: tuned between 2 and 10
    hp_num_layers = hp.Int("num_layers", 2, 10)
    hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
    # Initialize sequential API and start building model.
    model = keras.Sequential()
    model.add(keras.layers.InputLayer(input_shape=(CpG_num,)))  # CpG_num = number of selected features, defined elsewhere
    # Tune the number of hidden layers and the units in each
    for i in range(1, hp_num_layers):
        # Hyperparameters that depend on num_layers:
        hp_units = hp.Int(f"units_{i}", min_value=100, max_value=400, step=100)
        hp_dropout_rate = hp.Float(f"dropout_{i}", 0, 0.5, step=0.1)
        hp_activation = hp.Choice(f"act_{i}", ['LeakyReLU', 'PReLU'])
        model.add(
            keras.layers.Dense(units=hp_units, kernel_initializer='normal', activation=hp_activation)
        )
        model.add(keras.layers.Dropout(hp_dropout_rate))
    # Add output layer
    model.add(keras.layers.Dense(units=1, kernel_initializer='normal'))
    # Define optimizer, loss, and metrics
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss=loss_function,
        metrics=metrics
    )
    return model

def correlation_coefficient_loss(y_true, y_pred):
    # 1 - r^2, where r is the Pearson correlation between predictions and targets
    x = y_true
    y = y_pred
    mx = K.mean(x)
    my = K.mean(y)
    xm, ym = x - mx, y - my
    r_num = K.sum(tf.multiply(xm, ym))
    r_den = K.sqrt(tf.multiply(K.sum(K.square(xm)), K.sum(K.square(ym))))
    r = r_num / r_den
    r = K.maximum(K.minimum(r, 1.0), -1.0)
    return 1 - K.square(r)

loss_function = 'mae'
metrics = ['mae', correlation_coefficient_loss]
scoring_function = 'neg_mean_absolute_error'

tuner_kind = 'random'
results_dir = '../tmp_files/Grid_Search_results/' + name + tuner_kind  # name = dataset identifier, defined elsewhere
hp = kt.HyperParameters()
if tuner_kind == 'random':
    tuner = kt.RandomSearch(
        dynamic_model,
        objective='mae',
        overwrite=True,
        directory=results_dir,
        project_name='random_tuner_trials',
        max_trials=500
    )
# X1/Y1 and X2/Y2 hold the training and validation splits, defined elsewhere
tuner.search(
    X1[name], Y1[name],
    epochs=50,
    validation_data=(X2[name], Y2[name]),
)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps_model = dynamic_model(best_hps)
best_hps_model.fit(X1_train[name], y1_train[name],
                   validation_data=(X2_train[name], y2_train[name]),
                   epochs=500, batch_size=10)
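On the feature-selection point mentioned above: one common way to fold selection into training (a sketch only, using SelectKBest with a plain scikit-learn regressor as a stand-in for the Keras model, and placeholder array names X_all/y_all) is to put both steps in a single Pipeline, so the selector is re-fit on each training fold and never sees the validation data:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge               # stand-in for the Keras model
from sklearn.model_selection import cross_val_score

# Placeholder data: 134 samples, many features (X_all / y_all are assumed names)
X_all = np.random.rand(134, 1000)
y_all = np.random.rand(134)

pipe = Pipeline([
    ("select", SelectKBest(score_func=f_regression, k=200)),  # keep the top-k features per fold
    ("reg", Ridge()),
])

scores = cross_val_score(pipe, X_all, y_all, cv=5,
                         scoring="neg_mean_absolute_error")
print(-scores.mean())
The scikeras KerasRegressor imported in the question is a scikit-learn-compatible estimator, so it could in principle replace Ridge as the final pipeline step.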

Wrong ROC curve for multiclass classification

I have trained a CNN to classify images into 5 classes. But when I try to plot the ROC curve for each class versus the rest, all 5 classes have an almost diagonal curve with an AUC of around 0.5. I have no idea what has gone wrong.
The model should have an accuracy of around 86%.
Here is the code:
import os, shutil
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, RocCurveDisplay
from sklearn.preprocessing import label_binarize
import random

model = tf.keras.models.load_model('G:/Myxoid lesion/Myxoid_EN3_finetune4b')
model.summary()

data_dir = 'G:/Myxoid lesion/Test/'
batch_size = 64
img_height = 300
img_width = 300
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)

model.compile(optimizer=optimizers.Adam(lr=0.00002),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['sparse_categorical_accuracy'])

# Get the labels of test_ds
correct = np.array([], dtype='int32')
for x, y in test_ds:
    correct = np.concatenate([correct, y.numpy()])

# Get the prediction probabilities for each class for each test image
prediction_prob = tf.nn.softmax(model.predict(test_ds))

# One-vs-rest ROC curve and AUC for each class
num_class = 5
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(num_class):
    fpr[i], tpr[i], _ = roc_curve(correct, prediction_prob[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure()
lw = 2
for i in range(num_class):
    plt.plot(fpr[i], tpr[i],
             color=(random.random(), random.random(), random.random()),
             label='{0} (AUC = {1:0.2f})'.format(labels[i], roc_auc[i]))  # labels = class names, defined elsewhere
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.legend(loc="lower right")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC analysis')
plt.show()
The "prediction_prob" variable contains:
array([[6.3877934e-09, 6.3617526e-06, 5.5736535e-07, 4.9789862e-05,
9.9994326e-01],
[6.5260068e-08, 8.8882577e-03, 3.9350948e-06, 9.9110776e-01,
4.0252076e-11],
[2.7514220e-04, 2.9315910e-05, 1.6688553e-04, 9.9952865e-01,
3.5938730e-10],
...,
[1.1131389e-09, 9.8325908e-01, 3.4283744e-06, 1.6737511e-02,
7.3243338e-12],
[1.4697845e-08, 4.7125661e-05, 1.4077022e-03, 6.4052530e-02,
9.3449265e-01],
[9.9999940e-01, 1.3071107e-07, 4.3149896e-07, 4.7902233e-08,
9.2861301e-09]], dtype=float32)>
While the "correct" variable contains the correct label for each test image:
array([0, 1, 4, ..., 4, 2, 4])
I think I am following what is described on the scikit-learn website.
The tpr[i] and fpr[i] values that come out are almost linearly related, so the AUC ends up at 0.5.
I think there is a problem in how tpr[i] and fpr[i] are generated. Could anyone figure out the problem?
Thanks!
If I generate the labels and predictions in the following way, then I can get the correct ROC curves:
prediction_prob = np.array([]).reshape(0, 5)
correct = np.array([], dtype='int32')
for x, y in test_ds:
    correct = np.concatenate([correct, y.numpy()])
    prediction_prob = np.vstack([prediction_prob, tf.nn.softmax(model.predict(x))])
However, if I get the predictions from model.predict(test_ds), somehow the order of the predictions is different from the order of the original dataset, so they do not match the original labels. I am not sure if this is a 'bug' in TensorFlow, or whether there is some other explanation.
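One likely explanation (an assumption on my part, not something confirmed in this thread): image_dataset_from_directory shuffles by default and re-shuffles on every pass over the dataset, so iterating once to collect the labels and again inside model.predict() yields two different orderings. A minimal sketch with shuffling disabled so that labels and predictions line up:
# Sketch: build the test dataset with a fixed order (shuffle=False is the key change)
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    shuffle=False,                      # default is True, which re-shuffles each iteration
    image_size=(img_height, img_width),
    batch_size=batch_size)

correct = np.concatenate([y.numpy() for _, y in test_ds])
prediction_prob = tf.nn.softmax(model.predict(test_ds)).numpy()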
Also, I cannot get the micro-averaging to work (though this is not that important for my goal):
fpr["micro"], tpr["micro"], _ = roc_curve(correct.ravel(), prediction_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
It gives the following error:
raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported
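For what it's worth, roc_curve only accepts binary targets, so the micro-average needs the integer labels one-hot encoded first. A short sketch, reusing the names from the code above:
# Sketch: binarize the integer labels so each (sample, class) pair becomes a binary decision
correct_bin = label_binarize(correct, classes=list(range(num_class)))   # shape (n_samples, 5)
fpr["micro"], tpr["micro"], _ = roc_curve(correct_bin.ravel(),
                                          np.asarray(prediction_prob).ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])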

Making a custom activation function in TensorFlow 2.0

I am trying to create a custom tanh() activation function in TensorFlow to work with a particular output range that I want. I want my network to output concentration multipliers, so I figured that if the output of tanh() were negative it should return a value between 0 and 1, and if it were positive it should return a value between 1 and 10.
Here is what I currently have:
def output_activation(x):
    # scale tanh activation to be 1-10 if x > 0, or 0-1 if x < 0
    return tf.cond(x >= 0, lambda: tf.math.tanh(x + 0.1) * 10, lambda: tf.math.tanh(x) + 1)
I believe this works with a single value, but I want to output a vector of values, for which Python throws a ValueError:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Tensors are immutable and, from my understanding, converting to a NumPy array and back will slow down network training if I am on a GPU. What is the best way to get around this error while still keeping the benefits of hardware acceleration?
I suggest you use tf.keras.backend.switch. Here is a dummy example:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras import backend as K
def output_activation(x):
    return K.switch(x >= 0, tf.math.tanh(x + 0.1) * 10, tf.math.tanh(x) + 1)
X = np.random.uniform(0,1, (100,10))
y = np.random.uniform(0,1, 100)
inp = Input((10,))
x = Dense(8, activation=output_activation)(inp)
out = Dense(1)(x)
model = Model(inp, out)
model.compile('adam', 'mse')
model.fit(X,y, epochs=3)
Here is the running notebook: https://colab.research.google.com/drive/1T_kRNUphJt9xTjiOheTgoIGpGDZaaRAg?usp=sharing
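An equivalent element-wise alternative (my own sketch, not part of the original answer) is tf.where, which also evaluates both branches per element and so works on tensors of any shape:
import tensorflow as tf

def output_activation(x):
    # Element-wise select: tanh(x+0.1)*10 where x >= 0, tanh(x)+1 elsewhere
    return tf.where(x >= 0, tf.math.tanh(x + 0.1) * 10, tf.math.tanh(x) + 1)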

How should I fix this: after I use a slice of a tensor, the TensorFlow optimizer breaks?

Here is a sample:
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
if __name__ == '__main__':
    embs = tf.Variable(np.random.random((40, 5)), dtype=tf.float32)
    X = np.array(np.array(range(1, 25)).reshape(4, 6))
    x0 = tf.placeholder(tf.int32, [None, None])
    x1 = tf.nn.embedding_lookup(embs, x0)
    lstm = tf.nn.rnn_cell.BasicLSTMCell(5, state_is_tuple=True)
    outputs, states = tf.nn.dynamic_rnn(lstm, x1, dtype=tf.float32, time_major=True)
    cost = tf.reduce_mean(outputs[:, -1, :])
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.12).minimize(cost)
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        result3, opt = sess.run([outputs, optimizer], {x0: X})
I use just one slice of outputs, namely outputs[:,-1,:], to build the cost function. When I run the code, I get the following error:
F ./tensorflow/core/framework/tensor.h:581] Check failed: new_num_elements == NumElements() (0 vs. 20)
How can I fix this? This is just a sample; I ran into this problem while implementing a hierarchical LSTM, in which the sentence representations computed by one LSTM are fed into another LSTM.
I confirmed that this is a bug in TensorFlow 0.10. Upgrading to TensorFlow 0.11 will fix the problem.

How can I improve numpy's broadcasting

I'm trying to implement k-NN with the Mahalanobis distance in Python with NumPy. However, the code below runs very slowly when I use broadcasting.
Please teach me how I can improve the NumPy speed, or how to implement this better.
from __future__ import division
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
import numpy as np
import matplotlib.pyplot as plt
mnist = fetch_mldata('MNIST original')
mnist_X, mnist_y = shuffle(mnist.data, mnist.target.astype('int32'))
mnist_X = mnist_X/255.0
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y, test_size=0.2)
k = 2
def data_gen(n):
    return train_X[train_y == n]

train_X_num = [data_gen(i) for i in range(10)]
# Inverse covariance matrix for each digit class (regularized so the inverse exists)
inv_cov = [np.linalg.inv(np.cov(train_X_num[i], rowvar=0) + np.eye(784) * 0.00001) for i in range(10)]

d = {}
for i in range(10):
    ivec = train_X_num[i]                    # ivec size is (number of 'i' data, 784)
    ivec = ivec - test_X[:, np.newaxis, :]   # This step is far too slow and uses huge memory
    iinv_cov = inv_cov[i]
    # Calculate x.T inverse(sigma) x, and extract the k+1 smallest distances
    d[i] = np.sort(np.add.reduce(np.dot(ivec, iinv_cov) * ivec, axis=2), axis=1)[:, :k+1]
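One way to avoid the huge (n_test, n_class_i, 784) intermediate entirely (a sketch of my own, under the assumption that memory traffic is the bottleneck) is to expand the quadratic form (x - y)^T S^-1 (x - y) = x^T S^-1 x - 2 x^T S^-1 y + y^T S^-1 y, so that only 2-D arrays are ever built:
import numpy as np

def mahalanobis_sq_dists(test_X, class_X, inv_cov):
    # Squared Mahalanobis distance between every row of test_X and every row of class_X,
    # computed from the expanded quadratic form (no 3-D broadcasting needed).
    XA = np.dot(test_X, inv_cov)                       # (n_test, 784)
    YA = np.dot(class_X, inv_cov)                      # (n_class, 784)
    x_sq = np.einsum('ij,ij->i', XA, test_X)           # x^T S^-1 x, shape (n_test,)
    y_sq = np.einsum('ij,ij->i', YA, class_X)          # y^T S^-1 y, shape (n_class,)
    cross = np.dot(XA, class_X.T)                      # x^T S^-1 y, shape (n_test, n_class)
    return x_sq[:, np.newaxis] - 2.0 * cross + y_sq[np.newaxis, :]

# Usage matching the loop in the question (same variable names):
# d = {}
# for i in range(10):
#     dists = mahalanobis_sq_dists(test_X, train_X_num[i], inv_cov[i])
#     d[i] = np.partition(dists, k, axis=1)[:, :k+1]   # k+1 smallest distances, unordered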