Related
I have trained a CNN to classify images into 5 classes. But when I try to plot ROC curve for each class versus the rest, all 5 classes have almost a diagonal curve with AUC of around 0.5. I have no idea what has gone wrong.
The model should have an accuracy of around 86%.
Here is the code:
import os, shutil
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, RocCurveDisplay
from sklearn.preprocessing import label_binarize
import random
model = tf.keras.models.load_model('G:/Myxoid lesion/Myxoid_EN3_finetune4b')
model.summary()
data_dir='G:/Myxoid lesion/Test/'
batch_size = 64
img_height = 300
img_width = 300
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
data_dir,
seed = 123,
image_size=(img_height, img_width),
batch_size=batch_size)
model.compile(optimizer = optimizers.Adam(lr=0.00002),
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics = ['sparse_categorical_accuracy'])
correct = np.array([], dtype='int32')
# Get the labels of test_ds
for x, y in test_ds:
correct = np.concatenate([correct, y.numpy()])
# Get the prediction probabilities for each class for each test image
prediction_prob = tf.nn.softmax(model.predict(test_ds))
num_class = 5
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(num_class):
fpr[i], tpr[i], _ = roc_curve(correct, prediction_prob[:,i], pos_label=i)
roc_auc[i] = auc(fpr[i], tpr[i])
plt.figure()
lw = 2
for i in range(num_class):
plt.plot(fpr[i],tpr[i],
color=(random.random(),random.random(),random.random()),
label='{0} (AUC = {1:0.2f})'''.format(labels[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.legend(loc="lower right")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC analysis')
plt.show()
The "prediction_prob" variable contains:
array([[6.3877934e-09, 6.3617526e-06, 5.5736535e-07, 4.9789862e-05,
9.9994326e-01],
[6.5260068e-08, 8.8882577e-03, 3.9350948e-06, 9.9110776e-01,
4.0252076e-11],
[2.7514220e-04, 2.9315910e-05, 1.6688553e-04, 9.9952865e-01,
3.5938730e-10],
...,
[1.1131389e-09, 9.8325908e-01, 3.4283744e-06, 1.6737511e-02,
7.3243338e-12],
[1.4697845e-08, 4.7125661e-05, 1.4077022e-03, 6.4052530e-02,
9.3449265e-01],
[9.9999940e-01, 1.3071107e-07, 4.3149896e-07, 4.7902233e-08,
9.2861301e-09]], dtype=float32)>
While the "correct" variable contains the correct label for each test image:
array([0, 1, 4, ..., 4, 2, 4])
I think I follow what is mentioned on the scikit-learn website.
The tpr[i] and fpr[i] variables generated becomes linear correlated, so the AUC becomes 0.5
I think there is a problem in generating tpr[i] and fpr[i]? Could anyone figure out the problem?
Thanks!
If I generate the labels and prediction in this way, then I can get the correct ROC curve:
prediction_prob = np.array([]).reshape(0,5)
correct = np.array([], dtype='int32')
for x, y in test_ds:
correct = np.concatenate([correct, y.numpy()])
prediction_prob = np.vstack([prediction_prob, tf.nn.softmax(model.predict(x))])
However, if I get the prediction from model.predict(test_ds), somehow the order the prediction is different from the original dataset, so that it does not match with the original label. I am not sure if this is the 'bug' in tensorflow, or there is other explanation to this.
Also I cannot get the micro-averaging (though this is not that important for my goal)
fpr["micro"], tpr["micro"], _ = roc_curve(correct.ravel(), prediction_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
It gives the following error:
raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported
I am currently receiving one of the following errors (depending on the sequence of data prep):
TypeError: Inputs to a layer should be tensors. Got: <tensorflow.python.data.ops.dataset_ops._NestedVariant object at 0x000001E02F62FB00>
TypeError: Inputs to a layer should be tensors. Got: <_VariantDataset shapes: OrderedDict
Background: I have some parquet files, where each file is a multi-variate time-series. Since I am using the files for a multivariate time-series classification problem, I am storing the labels in a single numpy array. I need to use tf.data.Dataset for reading the files, since I cannot fit them all in memory.
Here is a working example that reproduces my error:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Masking, LSTM, Dropout, Dense
#!pip install tensorflow-io
import tensorflow_io as tfio
num_files = 10
num_features = 3
num_timesteps = 50
num_classes = 2
batch_size = 2
for i in range(num_files):
df = pd.DataFrame({"A": np.random.rand(num_timesteps), "B": np.random.rand(num_timesteps), "C": np.random.rand(num_timesteps)})
df.to_parquet("file_{}.parquet".format(i))
columns_init = {"A": tf.TensorSpec(tf.TensorShape([]), tf.float32), "B": tf.TensorSpec(tf.TensorShape([]), tf.float32), "C": tf.TensorSpec(tf.TensorShape([]), tf.float32)}
labels = np.array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0])
train_split_size = 0.8
num_train_files = int(train_split_size * num_files)
train_names = ["file_{}.parquet".format(i) for i in range(num_train_files)]
val_names = ["file_{}.parquet".format(i) for i in range(num_train_files, num_files)]
y_train = labels[ : num_train_files]
y_val = labels[num_train_files : num_files]
def map_fn(file_names, label_ds):
return tfio.IODataset.from_parquet(file_names, columns=columns_init), label_ds
train_ds = tf.data.Dataset.from_tensor_slices((train_names, y_train))
train_ds = train_ds.shuffle(buffer_size = num_train_files)
train_ds = train_ds.map(map_fn)
train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(batch_size)
val_ds = tf.data.Dataset.from_tensor_slices((val_names, y_val))
# No need for shuffling the validation set
val_ds = val_ds.map(map_fn)
val_ds = val_ds.batch(batch_size)
val_ds = val_ds.prefetch(batch_size)
ip = Input(shape=(num_timesteps, num_features))
x = Masking()(ip)
x = LSTM(8)(x)
x = Dropout(0.8)(x)
out = Dense(1, activation='softmax')(x)
model = Model(ip, out)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=["accuracy"])
model.fit(train_ds, epochs=10, validation_data=val_ds)
How do I overcome this error? I would prefer to keep my files separate and shuffle only how they are batched, since I don't want to meddle with the time-series sequences within the files. Is there a similar solution for .csv files instead of .parquet. I prefer parquet files because they are lighter and easier to read, but I am happy to convert my files if there is no turnaround.
For anyone experiencing a similar issue, I found a workaround, which was not straightforward. In this case, I defined a common_ds function for reading all the data from the files. I applied batching, where the batch size is equal to the time-series length to split the observations as they were stored. (Note: this assumes that the files are already preprocessed and all the files have equal number of rows.) After combining the features with the labels, the data is shuffled and batched according to the desired batch size. The final step uses the pack_features_function to change the format into tensor shapes that can be fed to the model.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Masking, LSTM, Dropout, Dense, Input
#!pip install tensorflow-io
import tensorflow_io as tfio
num_files = 10
num_features = 3
num_timesteps = 50
num_classes = 2
batch_size = 2
for i in range(num_files):
df = pd.DataFrame({"A": np.random.rand(num_timesteps),
"B": np.random.rand(num_timesteps),
"C": np.random.rand(num_timesteps)})
df.to_parquet("file_{}.parquet".format(i))
columns_init = {"A": tf.TensorSpec(tf.TensorShape([]), tf.float32),
"B": tf.TensorSpec(tf.TensorShape([]), tf.float32),
"C": tf.TensorSpec(tf.TensorShape([]), tf.float32)}
labels = np.array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0])
train_split_size = 0.8
num_train_files = int(train_split_size * num_files)
train_names = ["file_{}.parquet".format(i) for i in range(num_train_files)]
val_names = ["file_{}.parquet".format(i) for i in range(num_train_files, num_files)]
y_train = labels[ : num_train_files]
y_val = labels[num_train_files : num_files]
def make_common_ds(files):
common_ds = tfio.IODataset.from_parquet(files[0], columns=columns_init)
for file_name in files[1:]:
ds = tfio.IODataset.from_parquet(file_name, columns=columns_init)
common_ds = common_ds.concatenate(ds)
return common_ds
def pack_features_vector(features, labels):
"""Pack the features into a single array."""
features = tf.stack(list(features.values()), axis=2)
return features, labels
train_names_ds = make_common_ds(train_names)
train_names_ds = train_names_ds.batch(num_timesteps)
train_label_ds = tf.data.Dataset.from_tensor_slices(y_train)
train_ds = tf.data.Dataset.zip((train_names_ds, train_label_ds))
train_ds = train_ds.shuffle(buffer_size = num_train_files)
train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(batch_size)
train_ds = train_ds.map(pack_features_vector)
val_names_ds = make_common_ds(val_names)
val_names_ds = val_names_ds.batch(num_timesteps)
val_label_ds = tf.data.Dataset.from_tensor_slices(y_val)
val_ds = tf.data.Dataset.zip((val_names_ds, val_label_ds))
# No need to shuffle the validation set
val_ds = val_ds.batch(batch_size)
val_ds = val_ds.prefetch(batch_size)
val_ds = val_ds.map(pack_features_vector)
ip = Input(shape=(num_timesteps, num_features))
x = Masking()(ip)
x = LSTM(8)(x)
x = Dropout(0.8)(x)
out = Dense(1, activation='softmax')(x)
model = Model(ip, out)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=["accuracy"])
model.fit(train_ds, epochs=10, validation_data=val_ds)
I have an issue with Tensorflow model that is converted from Pytorch -> Onnx -> Tensorflow. The issue is the converted Tensorflow model expects the input in Pytorch format that is (batch size, number channels, height, width) but not in Tensorflow format (batch size, height, width, number channel). Therefore, I cannot use the model to process further with Vitis AI.
So I would like to ask is there is any ways to convert this Pytorch input format to Tensorflow format by using tools from Onnx, Tensorflow 1, or others?
My code is as below:
Pytorch -> Onnx
from hardnet import hardnet
import torch
import onnx
ckpt = torch.load('../hardnet.pth')
model_state_dict = ckpt['model_state_dict']
optimizer_state_dict = ckpt['optimizer_state_dict']
model = hardnet(11)
model.load_state_dict(model_state_dict)
model.eval()
dummy_input = torch.randn(1, 3, 1080, 1920)
input_names = ['input0']
output_names = ['output0']
output_file = 'hardnet.onnx'
torch.onnx.export(model, dummy_input, output_file, verbose=True,
input_names=input_names, output_names=output_names,
opset_version=11, keep_initializers_as_inputs=True)
onnx_model = onnx.load(output_file)
onnx.checker.check_model(onnx_model)
print('Passed Onnx')
Onnx -> Tensorflow 1 (using Tensorflow 1.15)
import cv2
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import onnx
from onnx_tf.backend import prepare
output_file = 'hardnet.onnx'
onnx_model = onnx.load(output_file)
output = prepare(onnx_model)
output.export_graph('hardnet.pb')
tf.compat.v1.disable_eager_execution()
def load_pb(path_to_pb: str):
"""From: https://stackoverflow.com/questions/51278213/what-is-the-use-of-a-pb-file-in-tensorflow-and-how-does-it-work
"""
with tf.gfile.GFile(path_to_pb, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
with tf.Graph().as_default() as graph:
tf.import_graph_def(graph_def, name='')
return graph
graph = load_pb('hardnet.pb')
input = graph.get_tensor_by_name('input0:0')
output = graph.get_tensor_by_name('output0:0')
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
img = cv2.imread('train_0.jpg', cv2.IMREAD_COLOR)
img = cv2.resize(img, (1920, 1080))
img = img/255
img = img - mean
img = img/std
img = np.expand_dims(img, -1)
# To Pytorch format.
img = np.transpose(img, (3, 2, 0, 1))
img = img
with tf.Session(graph=graph) as sess:
pred = sess.run(output, {input: img})
You could wrap your Pytorch model into another one that would do the transpose you want to have in TensorFlow. See the following example:
Let's say you have the following toy NN:
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.rnn = nn.LSTM(10, 20, 2)
def forward(self, x):
h0 = torch.zeros(2, 3, 20)
c0 = torch.zeros(2, 3, 20)
return self.rnn(x, (h0, c0))
the exemplary pytorch/tensorflow input shape would be :
>> pytorch_input = torch.randn(5, 3, 10)
>> tf_input = torch.transpose(pytorch_input, 1, 2)
>> print("PyTorch input shape: ", pytorch_input.shape)
>> print("TensorFlow input shape: ", tf_input.shape)
PyTorch input shape: torch.Size([5, 3, 10])
TensorFlow input shape: torch.Size([5, 10, 3])
Now, the wrapper which will first transpose input and then pass transposed input to some model:
class NetTensorFlowWrapper(nn.Module):
def __init__(self, main_module: nn.Module):
super(NetTensorFlowWrapper, self).__init__()
self.main_module = main_module
def forward(self, x):
x = torch.transpose(x, 1, 2)
return self.main_module(x)
Then, this is possible:
net = Net()
net_wrapper = NetTensorFlowWrapper(net)
net(pytorch_input)
net_wrapper(tf_input)
and then, when you finally save your models like you did previously via torch.onnx.export and read their graph via onnx package (not torch.onnx) you will have...
for Net- input 5x3x10 and no transpose layer
graph torch-jit-export (
%input0[FLOAT, 5x3x10]
{
%76 = Shape(%input0)
%77 = Constant[value = <Scalar Tensor []>]()
for NetTensorFlowWrapper- input 5x10x3 and transpose layer
graph torch-jit-export (
%input0[FLOAT, 5x10x3]
{
%9 = Transpose[perm = [0, 2, 1]](%input0)
%77 = Shape(%9)
%78 = Constant[value = <Scalar Tensor []>]()
...
How can I see the value of final features that are being trained inside tensorflow model. Like in the below case I am trying to multi-hot my column 'x' and I want to see how the features are going to my model.
This is very easy to do in sklearn but being new to Tensorflow I dont understand how is it possible.
import tensorflow as tf
import pandas as pd
data = {'x':['a c', 'a b', 'b c'], 'y': [1, 1, 0]}
df = pd.DataFrame(data)
Y = df['y']
X = df.drop('y', axis=1)
indicator_features = [tf.feature_column.indicator_column(categorical_column=
tf.feature_column.categorical_column_with_vocabulary_list(key = 'x',
vocabulary_list = ['a','b','c']))]
model = tf.estimator.LinearClassifier(feature_columns=indicator_features,
model_dir = "/tmp/samplemodel")
training_input_fn = tf.estimator.inputs.pandas_input_fn(x = X,
y=Y,
batch_size=64,
shuffle= True,
num_epochs = None)
model.train(input_fn=training_input_fn,steps=1000)
I have been able to print the values by enabling the eager execution in tensorflow.
Posting my solution below. Welcome for any other ideas as well.
import tensorflow as tf
import tensorflow.feature_column as fc
import pandas as pd
PATH = "/tmp/sample.csv"
tf.enable_eager_execution()
COLUMNS = ['education','label']
train_df = pd.read_csv(PATH, header=None, names = COLUMNS)
#train_df['education'] = train_df['education'].str.split(" ")
def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
label = df[label_key]
ed = tf.string_split(df['education']," ")
df['education'] = ed
ds = tf.data.Dataset.from_tensor_slices((dict(df),label))
if shuffle:
ds = ds.shuffle(10000)
ds = ds.batch(batch_size).repeat(num_epochs)
return ds
ds = easy_input_function(train_df, label_key='label', num_epochs=5, shuffle=False, batch_size=5)
for feature_batch, label_batch in ds.take(1):
print('Some feature keys:', list(feature_batch.keys())[:5])
print()
print('A batch of education :', feature_batch['education'])
print()
print('A batch of Labels:', label_batch )
print(feature_batch)
education_vocabulary_list = [
'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
'5th-6th', '10th', '1st-4th', 'Preschool', '12th']
education = tf.feature_column.categorical_column_with_vocabulary_list('education', vocabulary_list=education_vocabulary_list)
fc.input_layer(feature_batch, [fc.indicator_column(education)])
I have a multi-class classification problem with 3 classes in total.
I am using LinearDiscriminantAnalysis for the classification and I want to plot the average ROC across KFolds (k = 5).
I am able to do this for a binary classification case but I cannot find a way to make it work for my multi-class case.
Below is my code for the binary case:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interp
from sklearn.datasets import make_classification
from sklearn.cross_validation import KFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle
from scipy import interp
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
plt.style.use('ggplot')
X, y = make_classification(n_samples=500, random_state=100, flip_y=0.3)
kf = KFold(n_splits = 5, shuffle = True, random_state= 0)
clf = LinearDiscriminantAnalysis()
pipe= Pipeline([('scaler', StandardScaler()), ('clf', clf)])
tprs = []
aucs = []
base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']
for i, (train, test) in enumerate(kf.split(X,y)):
model = pipe.fit(X[train], y[train])
y_score = model.predict_proba(X[test])
fpr, tpr, _ = roc_curve(y[test], y_score[:, 1])
roc_auc = auc(fpr, tpr)
aucs.append(roc_auc)
#plt.plot(fpr, tpr, lw=1, alpha=0.6, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc), c = colors[i])
tpr = interp(base_fpr, fpr, tpr)
tpr[0] = 0.0
tprs.append(tpr)
tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
std = tprs.std(axis=0)
mean_auc = auc(base_fpr, mean_tprs)
std_auc = np.std(aucs)
tprs_upper = np.minimum(mean_tprs + std, 1)
tprs_lower = mean_tprs - std
plt.figure(figsize=(12, 8))
plt.plot(base_fpr, mean_tprs, 'b', alpha = 0.8, label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),)
plt.fill_between(base_fpr, tprs_lower, tprs_upper, color = 'blue', alpha = 0.2)
plt.plot([0, 1], [0, 1], linestyle = '--', lw = 2, color = 'r', label = 'Luck', alpha= 0.8)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")
plt.title('Receiver operating characteristic (ROC) curve')
#plt.axes().set_aspect('equal', 'datalim')
plt.show()
EDIT 1:
My attempt to make it work for the multiclass case using OneVsRestClassifier:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interp
from sklearn.datasets import make_classification
from sklearn.cross_validation import KFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle
from scipy import interp
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import label_binarize
plt.style.use('ggplot')
plt.figure(figsize=(12, 8))
X, y = make_classification(n_samples=500, random_state=100, n_classes=3,n_clusters_per_class=1, flip_y=0.3)
kf = KFold(n_splits = 5, shuffle = True, random_state= 0)
clf = OneVsRestClassifier(LinearDiscriminantAnalysis())
pipe= Pipeline([('scaler', StandardScaler()), ('clf', clf)])
classes = np.unique(y)
y_true = label_binarize(y, classes=classes)
n_classes = y_true.shape[1]
base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']
fpr = dict()
tpr = dict()
roc_auc = dict()
fff=[]
ttt=[]
aucc=[]
# Fit the model for each fold
for i, (train, test) in enumerate(kf.split(X,y)):
model = pipe.fit(X[train], y[train])
y_score = model.predict_proba(X[test])
# Compute ROC curve and ROC area for each class PER FOLD
for j in range(n_classes):
fpr[j], tpr[j], _ = roc_curve(y_true[test][:, j], y_score[:, j])
roc_auc[j] = auc(fpr[j], tpr[j])
# First aggregate all false positive rates per classe for each fold
all_fpr = np.unique(np.concatenate([fpr[j] for j in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for j in range(n_classes):
mean_tpr += interp(all_fpr, fpr[j], tpr[j])
# Finally average it and compute AUC for EACH FOLD
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
fff.append(all_fpr)
ttt.append(mean_tpr)
aucc.append(roc_auc["macro"])
# Compute average across Folds
fff = np.array(fff)
ttt = np.array(ttt)
aucc = np.array(aucc)
all_fpr_folds = np.unique(np.concatenate([fff[j] for j in range(kf.get_n_splits())]))
# Then interpolate all ROC curves at this points
mean_tpr_folds = np.zeros_like(all_fpr_folds)
for j in range(kf.get_n_splits()):
mean_tpr_folds += interp(all_fpr_folds, fff[j], ttt[j])
# Finally average it and compute AUC
mean_tpr_folds /= float(kf.get_n_splits())
mean_mean_tpr_folds= mean_tpr_folds.mean(axis = 0)
std = mean_tpr_folds.std(axis=0)
tprs_upper = np.minimum(mean_mean_tpr_folds + std, 1)
tprs_lower = mean_mean_tpr_folds - std
plt.plot(all_fpr_folds, mean_tpr_folds, 'b', alpha = 0.8, label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (aucc.mean(), aucc.std()),)
plt.fill_between(all_fpr_folds, tprs_lower, tprs_upper, color = 'blue', alpha = 0.2)
plt.plot([0, 1], [0, 1], linestyle = '--', lw = 2, color = 'r', label = 'Luck', alpha= 0.8)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")
plt.title('Receiver operating characteristic (ROC) curve')
#plt.axes().set_aspect('equal', 'datalim')
plt.show()
I am missing something here...