Related
I've used kmeans clustering algorithm for training the data and then try to get accuracy with some Classification algorithms such as decision tree, random forest, KNN algorithm etc,. After training of data while running model accuracy it takes long time for running. I've attached the code below.
# lets import the warnings library so that we can avoid warnings
import warnings
warnings.filterwarnings('ignore')
# Lets select the Spending score, and Annual Income Columns from the Data
x = data.loc[:, ['Time', 'V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V18','V19','V20']].values
# let's check the shape of x
print(x.shape)
# lets convert this data into a dataframe
x_data = pd.DataFrame(x)
x_data.head()
km = KMeans(n_clusters = 2, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_means = km.fit_predict(x)
# lets find out the Results
a = data['Class']
y_means = pd.DataFrame(y_means)
z = pd.concat([y_means, a], axis = 1)
z = z.rename(columns = {0: 'cluster'})
# lets check the Clusters of each Crops
print("Lets check the Results After Applying the K Means Clustering Analysis \n")
print("First Cluster:", z[z['cluster'] == 0]['Class'].unique())
print("---------------------------------------------------------------")
print("Second Cluster:", z[z['cluster'] == 1]['Class'].unique())
print("---------------------------------------------------------------")
from sklearn.cluster import KMeans
hc=KMeans
hc= KMeans(n_clusters=2)
y_her= hc.fit_predict(x)
# lets find out the Results
b = data['Class']
y_herr = pd.DataFrame(y_her)
w = pd.concat([y_herr, b], axis = 1)
w= w.rename(columns = {0: 'cluster'})
# lets check the Clusters of each Crops
print("K-Means Clustering Analysis \n")
print("Zero Cluster:", w[w['cluster'] == 0]['Class'].unique())
print("---------------------------------------------------------------")
print("First Cluster:", w[w['cluster'] == 1]['Class'].unique())
print("---------------------------------------------------------------")
y = data['Class']
x = data.drop(['Class'], axis = 1)
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
print("The Shape of x train:", x_train.shape)
print("The Shape of x test:", x_test.shape)
print("The Shape of y train:", y_train.shape)
print("The Shape of y test:", y_test.shape)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from mlxtend.plotting import plot_confusion_matrix
def evaluator(y_test, y_pred):
# Accuracy:
print('Accuracy is: ', accuracy_score(y_test,y_pred))
print('')
# Classification Report:
print('Classification Report: \n',classification_report(y_test,y_pred))
print('Confusion Matrix: \n\n')
plt.style.use("ggplot")
cm = confusion_matrix(y_test,y_pred)
plot_confusion_matrix(conf_mat = cm,figsize=(10,10),show_normed=True)
plt.title('Confusion Matrix for Logistic Regression', fontsize = 15)
plt.show()
#In this below part where the code is running for a long time.
model_accuracy = pd.DataFrame(columns=['Model','Accuracy'])
models = {
"KNN" : KNeighborsClassifier(),
"DT" : DecisionTreeClassifier(),
'RFC' : RandomForestClassifier(),
'GBC' : GradientBoostingClassifier(),
'XGB' : XGBClassifier()
}
for test, clf in models.items():
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test,y_pred)
train_pred = clf.predict(x_train)
train_acc = accuracy_score(y_train, train_pred)
print("\n", test + ' scores')
print(acc)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print('*' * 100,"\n")
model_accuracy = model_accuracy.append({'Model': test, 'Accuracy': acc, 'Train_acc': train_acc}, ignore_index=True)
I want a detailed output as it mentioned in the code.
For KNN algorithm:
enter image description here
enter image description here
same it goes for other algorithms..
I have trained a CNN to classify images into 5 classes. But when I try to plot ROC curve for each class versus the rest, all 5 classes have almost a diagonal curve with AUC of around 0.5. I have no idea what has gone wrong.
The model should have an accuracy of around 86%.
Here is the code:
import os, shutil
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, RocCurveDisplay
from sklearn.preprocessing import label_binarize
import random
model = tf.keras.models.load_model('G:/Myxoid lesion/Myxoid_EN3_finetune4b')
model.summary()
data_dir='G:/Myxoid lesion/Test/'
batch_size = 64
img_height = 300
img_width = 300
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
data_dir,
seed = 123,
image_size=(img_height, img_width),
batch_size=batch_size)
model.compile(optimizer = optimizers.Adam(lr=0.00002),
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics = ['sparse_categorical_accuracy'])
correct = np.array([], dtype='int32')
# Get the labels of test_ds
for x, y in test_ds:
correct = np.concatenate([correct, y.numpy()])
# Get the prediction probabilities for each class for each test image
prediction_prob = tf.nn.softmax(model.predict(test_ds))
num_class = 5
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(num_class):
fpr[i], tpr[i], _ = roc_curve(correct, prediction_prob[:,i], pos_label=i)
roc_auc[i] = auc(fpr[i], tpr[i])
plt.figure()
lw = 2
for i in range(num_class):
plt.plot(fpr[i],tpr[i],
color=(random.random(),random.random(),random.random()),
label='{0} (AUC = {1:0.2f})'''.format(labels[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.legend(loc="lower right")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC analysis')
plt.show()
The "prediction_prob" variable contains:
array([[6.3877934e-09, 6.3617526e-06, 5.5736535e-07, 4.9789862e-05,
9.9994326e-01],
[6.5260068e-08, 8.8882577e-03, 3.9350948e-06, 9.9110776e-01,
4.0252076e-11],
[2.7514220e-04, 2.9315910e-05, 1.6688553e-04, 9.9952865e-01,
3.5938730e-10],
...,
[1.1131389e-09, 9.8325908e-01, 3.4283744e-06, 1.6737511e-02,
7.3243338e-12],
[1.4697845e-08, 4.7125661e-05, 1.4077022e-03, 6.4052530e-02,
9.3449265e-01],
[9.9999940e-01, 1.3071107e-07, 4.3149896e-07, 4.7902233e-08,
9.2861301e-09]], dtype=float32)>
While the "correct" variable contains the correct label for each test image:
array([0, 1, 4, ..., 4, 2, 4])
I think I follow what is mentioned on the scikit-learn website.
The tpr[i] and fpr[i] variables generated becomes linear correlated, so the AUC becomes 0.5
I think there is a problem in generating tpr[i] and fpr[i]? Could anyone figure out the problem?
Thanks!
If I generate the labels and prediction in this way, then I can get the correct ROC curve:
prediction_prob = np.array([]).reshape(0,5)
correct = np.array([], dtype='int32')
for x, y in test_ds:
correct = np.concatenate([correct, y.numpy()])
prediction_prob = np.vstack([prediction_prob, tf.nn.softmax(model.predict(x))])
However, if I get the prediction from model.predict(test_ds), somehow the order the prediction is different from the original dataset, so that it does not match with the original label. I am not sure if this is the 'bug' in tensorflow, or there is other explanation to this.
Also I cannot get the micro-averaging (though this is not that important for my goal)
fpr["micro"], tpr["micro"], _ = roc_curve(correct.ravel(), prediction_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
It gives the following error:
raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported
I have a DNNRegressor model and I want to calculate some metrics to understand how well my model is predicting. How can I calculate the mean absolute error (MAE), mean squared error (MSE) and the R squared coefficient?
So far I only have the loss so can someone help me calculate MAE, MSE and R2?
# Imports
import itertools
import pandas as pd
import tensorflow as tf
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
COLUMNS = ['Prot', 'Gra', 'Cen', 'Sal', 'TVN', 'Velocidad_Prensa']
FEATURES = ['Prot', 'Gra', 'Cen', 'Sal', 'TVN']
LABEL = ['Velocidad_Prensa']
def get_input_fn(data_set, num_epochs=None, shuffle=True):
return tf.compat.v1.estimator.inputs.pandas_input_fn(
x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
y=pd.Series(data_set[LABEL].values),
num_epochs=num_epochs,
shuffle=shuffle)
training_set = pd.read_csv("prensa train.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
test_set = pd.read_csv("prensa eval.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
training_set.head()
# Model
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
activation_fn = tf.nn.relu, hidden_units=[200, 100, 50, 25, 12])
# Reset the index of training
training_set.reset_index(drop = True, inplace =True)
def input_fn(data_set, pred = False):
if pred == False:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
labels = tf.constant(data_set[LABEL].values)
return feature_cols, labels
if pred == True:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
return feature_cols
# Deep Neural Network Regressor with the training set which contain the data split by train test split
regressor.train(input_fn=lambda: input_fn(training_set), steps=2000)
# Evaluation on the test set created by train_test_split
ev = regressor.evaluate(input_fn=lambda: input_fn(test_set), steps=1)
# Display the score on the testing set
loss_score1 = ev["loss"]
print("Final Loss on the testing set: {0:f}".format(loss_score1))
def input_fn(features, batch_size=256):
return tf.data.Dataset.from_tensor_slices(dict(features)).batch(batch_size)
features = ['Prot', 'Gra', 'Cen', 'Sal','TVN']
predict = {}
print("Ingresar caracterĂsticas quĂmicas de la materia prima")
for feature in features:
valid = True
while valid:
val = input(feature + ": ")
if not val.isdigit(): valid = False
predict[feature] = [float(val)]
predictions = regressor.predict(input_fn=lambda: input_fn(predict))
for pred_dict in predictions:
print(pred_dict)
sklearn.metrics has dedicated scoring methods for each of the metrics you are asking for.
Just to the following:
# Import metrics
from sklearn import metrics
# Make predictions
predictions = regressor.predict(input_fn=lambda: input_fn(predict))
# Calculate MAE, MSE, R2
print('MAE:', metrics.mean_absolute_error(y_true, predictions))
print('MSE:', metrics.mean_squared_error(y_true, predictions))
print('R2:', metrics.r2_score(y_true, predictions))
Note: This question is not a multiplication one and please ignore some of the import statements.
Now the details are as follows, I am using a curve_fit() to fit a periodic pandas dataset.
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import leastsq
#import matplotlib.pyplot as plt
import pylab as plt
from scipy.optimize import curve_fit
df = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")
df['holiday'].replace(to_replace = 'None', value = '0', inplace=True)
df.loc[df['holiday'] != '0', 'holiday'] = 1
print(df.shape)
df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%Y %H:%M')
df['date_time'] = (df['date_time']- dt.datetime(1970,1,1)).dt.total_seconds()
#print(df['date_time'].head())
non_dummy_cols = ['holiday','temp','rain_1h', 'snow_1h', 'clouds_all','date_time', 'traffic_volume']
dummy_cols = list(set(df.columns) - set(non_dummy_cols))
df = pd.get_dummies(df, columns=dummy_cols)
print(df.shape)
x = df[df.columns.values]
x = x.drop(['traffic_volume'], axis=1)
x = x.drop(['clouds_all'], axis = 1)
y = df['traffic_volume']
print(x.shape)
print(y.shape)
#plt.figure(figsize=(6,4))
#plt.scatter(df.date_time[0:100], df.traffic_volume[0:100], color = 'blue')
#plt.xlabel("Date Time")
#plt.ylabel("Traffic volume")
#plt.show()
x = StandardScaler().fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state= 4)
def my_sin(x, freq, amplitude, phase, offset):
return np.sin(x * freq + phase) * amplitude + offset
#x_train = np.array(x_train)
#y_train = np.array(y_train)
print(x_train)
popt, pcov = curve_fit(my_sin, x_train, y_train)
y_hat = my_sin(x_test, *popt)
Error:
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
Download dataset URL
The dataset before any programmatic changes is:
So how do i overcome this error? Is it not possible to use curve_fit for a m*n x_train?
I have also tried by reshaping the y_train to m*1 or [2,2,....[]] like this but that's also not working. So please help me to solve this issue.
The entire error message tells the story just above the last line:
Traceback (most recent call last):
File "temp.py", line 50, in <module>
popt, pcov = curve_fit(my_sin, x_train, y_train)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 736, in curve_fit
res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 377, in leastsq
shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 26, in _check_func
res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 454, in func_wrapped
return func(xdata, *params) - ydata
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
Curve_fit() is handing your function "my_sin()" data which has shape of (38563, 54) - this is x_train.shape() output - and is returning data with the same shape. The curve_fit code needs the function being fitted to instead return data with the same shape as y_train, so it can subtract the two and calculate error. Since the function does not return data with the same shape as y_train, the subtraction is giving an exception.
I suspect you should be using the linear regression in sklearn, and not the curve_fit routine.
I want to differentiate a vector with respect to another using TensorFlow. I am unable to write and visualize the output (just started my journey on TensorFlow)
I am attaching the code snippet I have tried.
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
x = np.linspace(-np.pi, np.pi, 120)
y = np.sinh(x)
plt.plot(x,y)
plt.axhline(color="gray", zorder=-1)
plt.axvline(color="gray", zorder=-1)
plt.show()
X = tf.constant(x, dtype=tf.float32)
Y = tf.constant(y, dtype=tf.float32)
gradient = tf.gradients(Y, X)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
I am unable to output the gradient. I also tried a placeholder for the gradients but cannot figure out how to go about.
Your Y doesn't depend on X. The way you have defined them they are just two independent tensors. This is probably what you want:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
x_data = np.linspace(-np.pi, np.pi, 120)
y_data = np.sinh(x_data)
plt.plot(x_data, y_data)
plt.axhline(color="gray", zorder=-1)
plt.axvline(color="gray", zorder=-1)
plt.show() # <-- shows image
x = tf.constant(x_data, dtype=tf.float32)
y = tf.math.sinh(x) # <-- `y` is a function of `x`
grads = tf.gradients(y, x)
# init = tf.global_variables_initializer() # <-- No need, you don't have variables here
with tf.Session() as sess:
print(sess.run(grads)) # <-- prints long array