How to format numbers in plot_confusion_matrix - matplotlib

How can I improve this strange, illegible number format in the matrix so that it shows only plain numbers?
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

NBC = GaussianNB()
LRE = LogisticRegression(solver='lbfgs')
GBC = GradientBoostingClassifier()
RFC = RandomForestClassifier()
LGBM = LGBMClassifier()
CBC = CatBoostClassifier(verbose=0, n_estimators=100)

classifiers = [NBC, LRE, GBC, RFC, LGBM, CBC]
for cls in classifiers:
    cls.fit(X_train, y_train)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 6))
target_names = ['0', '1']
for cls, ax in zip(classifiers, axes.flatten()):
    plot_confusion_matrix(cls,
                          X_test,
                          y_test,
                          ax=ax,
                          cmap='Reds',
                          display_labels=target_names)
    ax.title.set_text(type(cls).__name__)
plt.tight_layout()
plt.show()

Try passing a blank values format as the values_format argument to plot_confusion_matrix. The docs state:
values_format : str, default=None
Format specification for values in confusion matrix. If None, the format specification is 'd' or '.2g', whichever is shorter.
plot_confusion_matrix(cls, X_test, y_test, ax=ax, cmap='Reds',
                      display_labels=target_names,
                      values_format='')  # <--------- passed here
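If you'd rather show the counts as plain integers, passing 'd' (one of the defaults the docs mention) should work as well; a small variation of the same call:
plot_confusion_matrix(cls, X_test, y_test, ax=ax, cmap='Reds',
                      display_labels=target_names,
                      values_format='d')  # 'd' renders each cell as an integer count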

Related

I'm having problems with one-hot encoding

I am using logistic regression on a football dataset, but it seems that when I try to one-hot encode the home team names and away team names it gives the model 100% accuracy, and even when doing a train_test_split I still get 100%. What am I doing wrong?
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

df = pd.read_csv("FIN.csv")
df['Date'] = pd.to_datetime(df["Date"])
df = df[(df["Date"] > '2020/04/01')]
df['BTTS'] = np.where((df.HG > 0) & (df.AG > 0), 1, 0)
#print(df.to_string())
df.dropna(inplace=True)

x = df[['Home', 'Away', 'Res', 'HG', 'AG', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values
y = df['BTTS'].values

np.set_printoptions(threshold=np.inf)

model = LogisticRegression()
ohe = OneHotEncoder(categories=[df.Home, df.Away, df.Res], sparse=False)
x = ohe.fit_transform(x)
print(x)

model.fit(x, y)
print(model.score(x, y))

x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))

y_pred = model.predict(x_test)
print("accuracy:", accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))
print("f1 score:", f1_score(y_test, y_pred))
Overfitting would be a situation where your training accuracy is very high and your test accuracy is very low. It is "overfitting" because the model essentially memorizes the outcomes in the training data but doesn't fit well on new, unseen data.
The reason you are getting 100% accuracy is precisely what I stated in the comments: there is (for lack of a better term) data leakage. You are essentially allowing your model to "cheat". Your target variable y ('BTTS') is feature-engineered from the data itself: it is derived from 'HG' and 'AG', and is therefore 100% correlated with those columns. You define 'BTTS' as 1 when both 'HG' and 'AG' are greater than 0, and then you include those two columns in your training data, so the model simply picks up that obvious association (i.e., when home goals are 1 or more and away goals are 1 or more, both teams scored).
Once the model sees those two values greater than 0 it predicts 1; if one of those values is 0, it predicts 0.
Drop 'HG' and 'AG' from x (the features), as sketched below.
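A minimal sketch of that change, assuming the same DataFrame df as above:
# leave out 'HG' and 'AG' so the target 'BTTS' can no longer be read off directly
x = df[['Home', 'Away', 'Res', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values
y = df['BTTS'].values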
Once we remove those two columns, you'll see a more realistic performance (albeit poor - only slightly better than a coin flip) here:
1.0
0.5625
accuracy: 0.5625
precision: 0.6666666666666666
recall: 0.4444444444444444
f1 score: 0.5333333333333333
With the confusion matrix:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

labels = np.unique(y).tolist()
cf_matrixGNB = confusion_matrix(y_test, y_pred, labels=labels)

ax = sns.heatmap(cf_matrixGNB, annot=True, cmap='Blues')
ax.set_title('Confusion Matrix\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
plt.show()
Another option would be to create a calculated field 'Total_Goals' and see if the model can predict from that. Obviously it again gets a little help from the obvious cases (if 'Total_Goals' is 0 or 1, then 'BTTS' will be 0), but when 'Total_Goals' is 2 or more it has to rely on the other features to work out whether one of the teams got shut out.
Here's that example:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
df = pd.read_csv("FIN.csv")
df['Date'] = pd.to_datetime(df["Date"])
df = df[(df["Date"] > '2020/04/01')]
df['BTTS'] = np.where((df.HG > 0) & (df.AG > 0), 1, 0)
#print(df.to_string())
df.dropna(inplace=True)
df['Total_Goals'] = df['HG'] + df['AG']
x = df[['Home', 'Away', 'Res', 'Total_Goals', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values
y = df['BTTS'].values
np.set_printoptions(threshold=np.inf)
model = LogisticRegression()
ohe = OneHotEncoder(sparse=False)
x = ohe.fit_transform(x)
#print(x)
model.fit(x, y)
print(model.score(x, y))
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))
y_pred = model.predict(x_test)
print("accuracy:",
accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))
print("f1 score:", f1_score(y_test, y_pred))
from sklearn.metrics import confusion_matrix
labels = np.unique(y).tolist()
cf_matrixGNB = confusion_matrix(y_test, y_pred, labels=labels)
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(cf_matrixGNB, annot=True, cmap='Blues')
ax.set_title('Confusion Matrix\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
plt.show()
Output:
1.0
0.8
accuracy: 0.8
precision: 0.8536585365853658
recall: 0.7777777777777778
f1 score: 0.8139534883720929
To predict on new data, the new data needs to be in the same form as the training data. You then also need to apply any transformations you fit on the training data to the new data:
new_data = pd.DataFrame(
    data=[['Haka', 'Mariehamn', 3.05, 3.66, 2.35, 3.05, 3.66, 2.52, 2.88, 3.48, 2.32]],
    columns=['Home', 'Away', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']
)
to_predict = new_data[['Home', 'Away', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']]
to_predict_encoded = ohe.transform(to_predict)
prediction = model.predict(to_predict_encoded)
prediction_prob = model.predict_proba(to_predict_encoded)
print(f'Predict: {prediction[0]} with {prediction_prob[0][0]} probability.')
Output:
Predict: 0 with 0.8204957018099501 probability.
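As a side note, predict_proba returns one probability per class in the order of model.classes_, so prediction_prob[0][0] is always the probability of class 0. If you want the probability of whichever class was actually predicted, something along these lines should work (assuming the classes are the integers 0 and 1):
# probability of the predicted class rather than always class 0
predicted_class = prediction[0]
predicted_prob = prediction_prob[0][list(model.classes_).index(predicted_class)]
print(f'Predict: {predicted_class} with {predicted_prob:.4f} probability.')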

Show evaluation metrics in decision boundary plot

I am working on imbalanced classification. I wanted to add the g-mean and accuracy to my decision boundary plot; it would be nice to see the differences between these scoring metrics in the plot. I don't see any option to compute these scores within this decision boundary plot. Is there a way I can add this extra information to my decision boundary plot? I appreciate your time. Thanks!
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from mlxtend.plotting import plot_decision_regions
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score

gmean = make_scorer(geometric_mean_score, greater_is_better=True)
scoring = {'G-mean': gmean, 'Accuracy': 'accuracy'}

X, y = make_blobs(n_samples=[1000, 10], centers=[[0.0, 0.0], [2.0, 2.0]],
                  cluster_std=[1.5, 0.5], random_state=0, shuffle=False)

clf1 = LogisticRegression(max_iter=100000)
clf2 = LogisticRegression(class_weight="balanced", max_iter=100000)

gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
labels = ['Logistic Regression', 'Weighted Logistic Regression']
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()
You can use plt.text() to add the g-mean and accuracy to your decision boundary plot. For example:
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(15, 8))
labels = ['Logistic Regression', 'Weighted Logistic Regression']
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    ax.text(6, 4, "gmean : ", fontsize=10)
    ax.text(6, 2, "accuracy : ", fontsize=10)
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()
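The placeholders above only show static text; a rough sketch of filling them in with the actual scores (computed here on the same data the classifiers were fitted on, since the question doesn't use a separate test set) could look like this:
from sklearn.metrics import accuracy_score
from imblearn.metrics import geometric_mean_score

gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(15, 8))
labels = ['Logistic Regression', 'Weighted Logistic Regression']
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    y_pred = clf.predict(X)                  # in-sample predictions
    gm = geometric_mean_score(y, y_pred)     # g-mean from imblearn
    acc = accuracy_score(y, y_pred)
    ax = plt.subplot(gs[grd[0], grd[1]])
    ax.text(6, 4, f"gmean : {gm:.3f}", fontsize=10)
    ax.text(6, 2, f"accuracy : {acc:.3f}", fontsize=10)
    plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()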

Plotting 3D Decision Boundary From MLPClassifier By Using make_classification Dataset

I used make_classification and MLPClassifier from sklearn. However, I could not get my points separated the way they are in this screenshot, and this screenshot is what my plot shows instead. Could you help me separate the points, or tell me what the problem is?
My code is:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import numpy as np
import matplotlib.pyplot as plt

X, y = make_classification(n_samples=550, n_features=10, n_informative=2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

mlp = MLPClassifier(hidden_layer_sizes=(), max_iter=300, random_state=0)
clf = mlp.fit(X_test, y_test)

z = lambda x, y: (-clf.intercepts_[0] - clf.coefs_[0][0]*x - clf.coefs_[0][1]*y) / clf.coefs_[0][2]

tmp = np.linspace(-5, 5, 30)
x, y = np.meshgrid(tmp, tmp)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_test[y_test==0, 0]+2, X_test[y_test==0, 1]-2, X_test[y_test==0, 2]-5, c='b', marker='^')
ax.scatter(X_test[y_test==1, 0]-2, X_test[y_test==1, 1]+2, X_test[y_test==1, 2]+5, c='r', marker='o')
ax.plot_surface(x, y, z(x, y))
ax.view_init(30, 60)
plt.show()
There is nothing (seriously) wrong with your code. You just need to set a few parameters to get what you want. First, you may need a larger figure.
# To set figure size other than the default value, specify width and height
fig = plt.figure( figsize=(8,8) )
Secondly, for the size of markers in scatter() function:
# To set marker size, use `s=number` as an option
ax.scatter(X_test[y_test==0, 0]+2, X_test[y_test==0, 1]-2, X_test[y_test==0, 2]-5,
           c='b', marker='^', s=3)
ax.scatter(X_test[y_test==1, 0]-2, X_test[y_test==1, 1]+2, X_test[y_test==1, 2]+5,
           c='r', marker='o', s=3)
The plot should be similar to this:

ValueError: operands could not be broadcast together with shapes (38563,54) (38563,) while using curve_fit()

Note: this question is not about multiplication; please ignore some of the (unused) import statements.
The details are as follows: I am using curve_fit() to fit a periodic pandas dataset.
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import leastsq
#import matplotlib.pyplot as plt
import pylab as plt
from scipy.optimize import curve_fit
df = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")
df['holiday'].replace(to_replace = 'None', value = '0', inplace=True)
df.loc[df['holiday'] != '0', 'holiday'] = 1
print(df.shape)
df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%Y %H:%M')
df['date_time'] = (df['date_time']- dt.datetime(1970,1,1)).dt.total_seconds()
#print(df['date_time'].head())
non_dummy_cols = ['holiday','temp','rain_1h', 'snow_1h', 'clouds_all','date_time', 'traffic_volume']
dummy_cols = list(set(df.columns) - set(non_dummy_cols))
df = pd.get_dummies(df, columns=dummy_cols)
print(df.shape)
x = df[df.columns.values]
x = x.drop(['traffic_volume'], axis=1)
x = x.drop(['clouds_all'], axis = 1)
y = df['traffic_volume']
print(x.shape)
print(y.shape)
#plt.figure(figsize=(6,4))
#plt.scatter(df.date_time[0:100], df.traffic_volume[0:100], color = 'blue')
#plt.xlabel("Date Time")
#plt.ylabel("Traffic volume")
#plt.show()
x = StandardScaler().fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state= 4)
def my_sin(x, freq, amplitude, phase, offset):
    return np.sin(x * freq + phase) * amplitude + offset
#x_train = np.array(x_train)
#y_train = np.array(y_train)
print(x_train)
popt, pcov = curve_fit(my_sin, x_train, y_train)
y_hat = my_sin(x_test, *popt)
Error:
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
Download dataset URL
The dataset before any programmatic changes is:
So how do I overcome this error? Is it not possible to use curve_fit for an m*n x_train?
I have also tried reshaping y_train to m*1 or [2,2,....[]] like this, but that's not working either. So please help me solve this issue.
The entire error message tells the story just above the last line:
Traceback (most recent call last):
  File "temp.py", line 50, in <module>
    popt, pcov = curve_fit(my_sin, x_train, y_train)
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 736, in curve_fit
    res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 377, in leastsq
    shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 26, in _check_func
    res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 454, in func_wrapped
    return func(xdata, *params) - ydata
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
curve_fit() is handing your function my_sin() data with shape (38563, 54) - that is the shape of x_train - and my_sin() returns data with the same shape. curve_fit needs the fitted function to return data with the same shape as y_train so it can subtract the two and compute the error; since the function does not return data with the same shape as y_train, the subtraction raises the exception.
I suspect you should be using linear regression from sklearn, not the curve_fit routine.
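For what it's worth, curve_fit does work when the model function maps the x data to something with the same shape as y. A rough sketch, under the assumption that you fit the sine against a single 1-D feature (here, hypothetically, the unscaled 'date_time' column in seconds) rather than the full (38563, 54) feature matrix:
import numpy as np
from scipy.optimize import curve_fit

def my_sin(x, freq, amplitude, phase, offset):
    return np.sin(x * freq + phase) * amplitude + offset

# one 1-D feature, so my_sin(x_1d, ...) has the same shape as y_1d
x_1d = df['date_time'].to_numpy(dtype=float)
y_1d = df['traffic_volume'].to_numpy(dtype=float)

# an initial guess usually helps curve_fit converge on periodic data;
# guessing a daily period (in seconds) here - an assumption, not a given
guess = [2 * np.pi / (24 * 3600), y_1d.std(), 0.0, y_1d.mean()]
popt, pcov = curve_fit(my_sin, x_1d, y_1d, p0=guess)
y_hat = my_sin(x_1d, *popt)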

Regression on large dataset: Why does accuracy drop?

I am trying to predict the views on OLX's ads. I wrote a scraper to scrape all the data (50,000 ads). When I perform linear regression on 1,400 samples I get 66% accuracy, but when I perform it on 52,000 samples it drops to 8%. Here are the ImgCount vs Views and Price vs Views stats.
Is there any problem with my data, or how should I perform regression on this? I know that this data is very polarized.
I want to know why my accuracy dropped when I used the larger dataset.
Thank you for the help.
CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
url = '/home/msz/olx/olx/with_images.csv'
df = pd.read_csv(url, index_col='url')
df['price'] = df['price'].str.replace('.', '')
df['price'] = df['price'].str.replace(',', '')
df['price'] = df['price'].str.replace('Rs', '')
df['price'] = df['price'].astype(int)
df['text'] = df['text'].str.replace(',', ' ')
df['text'] = df['text'].str.replace('\t', '')
df['text'] = df['text'].str.replace('\n', '')
X = df[['price', 'img']]
y = df['views']
print ("X is like ", X.shape)
print ("Y is like ", y.shape)
df.plot(y='views', x='img', style='x')
plt.title('ImgCount vs Views')
plt.xlabel('ImgCount')
plt.ylabel('Views')
plt.show()
df.plot(y='views', x='price', style='x')
plt.title('Price vs Views')
plt.xlabel('Price')
plt.ylabel('Views')
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.451, random_state=0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
score = regressor.score(X_test, y_test)
print('Accuracy is : ',score*100)
Regression is a basic algorithm that mostly works on linear datasets, but if you have a large and non-linear dataset you have to use another algorithm, such as k-nearest neighbours or perhaps a decision tree. Personally, I prefer to use a Naive Bayes classifier, among others.
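A rough sketch of that suggestion, using the regressor counterparts of k-nearest neighbours and decision trees (since 'views' is a continuous target), reusing the X and y from the question's code; the split size here is arbitrary:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

for model in (LinearRegression(),
              KNeighborsRegressor(n_neighbors=10),
              DecisionTreeRegressor(max_depth=6, random_state=0)):
    model.fit(X_train, y_train)
    # for regressors, .score() returns R^2, not classification accuracy
    print(type(model).__name__, model.score(X_test, y_test))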