I can't get the output I want with Lasso regression using the scikit-learn library

I'm trying to set up a regression model on a random sample data set, but whatever alpha value I try, the predicted output is a straight line every time. Below you can see my code and a comparison of the outputs. Where do you think I am going wrong?
#Importing libraries.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
#Define input array with angles from 60deg to 300deg converted to radians
x = np.array([i*np.pi/180 for i in range(60,300,4)])
np.random.seed(10) #Setting seed for reproducibility
y = np.sin(x) + np.random.normal(0,0.15,len(x))
data = pd.DataFrame(np.column_stack([x,y]),columns=['x','y'])
X = data['x']
Y = data['y']
X = X.values.reshape(-1,1)
Y = Y.values.reshape(-1,1)
#Lasso regression
model = Lasso(alpha=0.001) #Alpha = 0.001
model.fit(X,Y)
Y_predicted_lasso = model.predict(X)
#Plot
plt.scatter(X,Y)
plt.plot(X,Y_predicted_lasso,'r')
plt.show()
[Comparison plots: the data scatter with the fitted straight line, for two different alpha values]

Lasso is a linear model: with x as the only input feature, the prediction is a linear function of x, so you get a straight line no matter what alpha is. If you add polynomial features (x², x³, x⁴), the model can follow the sine curve:
#Importing libraries.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
#Define input array with angles from 60deg to 300deg converted to radians
x = np.array([i*np.pi/180 for i in range(60,300,4)])
np.random.seed(10) #Setting seed for reproducibility
y = np.sin(x) + np.random.normal(0,0.15,len(x))
x2 = np.power(x, 2)
x3 = np.power(x, 3)
x4 = np.power(x, 4)
data = pd.DataFrame(np.column_stack([x, x2, x3, x4, y]), columns=["x", "x2", "x3", "x4", "y"])
X = data[["x", "x2", "x3", "x4"]]
Y = data["y"]
# X = X.values.reshape(-1,1)
Y = Y.values.reshape(-1, 1)
# Lasso regression
model = Lasso(alpha=0.00001, max_iter=30_000)  # alpha = 1e-5
model.fit(X, Y)
Y_predicted_lasso = model.predict(X)
# Plot
plt.scatter(X.x, Y)
plt.plot(X.x, Y_predicted_lasso, "r")
plt.show()
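For reference, the same feature expansion can be done with scikit-learn's PolynomialFeatures instead of building the powers by hand. A minimal sketch (degree 4 to match the manual x2/x3/x4 columns above; the pipeline and names here are illustrative, not part of the original code):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso

# PolynomialFeatures(4) expands x into [x, x^2, x^3, x^4];
# include_bias=False leaves the intercept to Lasso itself.
poly_lasso = make_pipeline(
    PolynomialFeatures(degree=4, include_bias=False),
    Lasso(alpha=0.00001, max_iter=30_000),
)
poly_lasso.fit(x.reshape(-1, 1), y)
Y_predicted_poly = poly_lasso.predict(x.reshape(-1, 1))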

Related

Show evaluation metrics in decision boundary plot

I am working on an imbalanced classification problem. I want to add the g-mean and accuracy to my decision boundary plot, so the differences between these scoring metrics are visible in the plot. I don't see any option to compute these scores within the decision boundary plot itself. Is there a way I can add this extra information to my decision boundary plot? I appreciate your time. Thanks!
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from mlxtend.plotting import plot_decision_regions
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score

gmean = make_scorer(geometric_mean_score, greater_is_better=True)
scoring = {'G-mean': gmean, 'Accuracy': 'accuracy'}

X, y = make_blobs(n_samples=[1000, 10], centers=[[0.0, 0.0], [2.0, 2.0]],
                  cluster_std=[1.5, 0.5], random_state=0, shuffle=False)
clf1 = LogisticRegression(max_iter=100000)
clf2 = LogisticRegression(class_weight="balanced", max_iter=100000)

gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
labels = ['Logistic Regression', 'Weighted Logistic Regression']
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()
You can use plt.text() to add the g-mean and accuracy to your decision boundary plot. Compute each score after fitting and format it into the text. For example:
from sklearn.metrics import accuracy_score

gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(15, 8))
labels = ['Logistic Regression', 'Weighted Logistic Regression']
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    ax = plt.subplot(gs[grd[0], grd[1]])
    # Place the scores inside the axes; tweak the coordinates to your data range.
    ax.text(6, 4, "gmean : %.3f" % geometric_mean_score(y, y_pred), fontsize=10)
    ax.text(6, 2, "accuracy : %.3f" % accuracy_score(y, y_pred), fontsize=10)
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()

Triangular surface plot matplotlib

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')  # fig.gca(projection=...) was removed in newer matplotlib
# Make data.
X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)
ax.plot_trisurf(X, Y, Z, cmap='viridis', edgecolor='none')
plt.show()
I tried to plot this data as a triangular surface, but I get this error:
ValueError: x and y must be equal-length 1-D arrays
Can someone help me with it?
In a triangular surface plot, X, Y, and Z must be one-dimensional arrays rather than the two-dimensional ones used for wireframe and surface plots. Don't use np.meshgrid().
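A minimal sketch of that fix for the same data: either build 1-D arrays directly, or keep the meshgrid but flatten its output with .ravel() before calling plot_trisurf (which triangulates the points itself):
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
Z = np.sin(np.sqrt(X**2 + Y**2))

# Flatten to equal-length 1-D arrays, as plot_trisurf requires.
ax.plot_trisurf(X.ravel(), Y.ravel(), Z.ravel(), cmap='viridis', edgecolor='none')
plt.show()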

Problem in computing the gradient of a function

I want to differentiate a vector with respect to another using TensorFlow, but I am unable to compute and visualize the output (I've just started my journey with TensorFlow).
I am attaching the code snippet I have tried.
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
x = np.linspace(-np.pi, np.pi, 120)
y = np.sinh(x)
plt.plot(x,y)
plt.axhline(color="gray", zorder=-1)
plt.axvline(color="gray", zorder=-1)
plt.show()
X = tf.constant(x, dtype=tf.float32)
Y = tf.constant(y, dtype=tf.float32)
gradient = tf.gradients(Y, X)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
I am unable to output the gradient. I also tried a placeholder for the gradients, but I cannot figure out how to make it work.
Your Y doesn't depend on X. The way you have defined them, they are just two independent tensors. This is probably what you want:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
x_data = np.linspace(-np.pi, np.pi, 120)
y_data = np.sinh(x_data)
plt.plot(x_data, y_data)
plt.axhline(color="gray", zorder=-1)
plt.axvline(color="gray", zorder=-1)
plt.show() # <-- shows image
x = tf.constant(x_data, dtype=tf.float32)
y = tf.math.sinh(x) # <-- `y` is a function of `x`
grads = tf.gradients(y, x)
# init = tf.global_variables_initializer() # <-- No need, you don't have variables here
with tf.Session() as sess:
    print(sess.run(grads)) # <-- prints long array
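For completeness, a minimal sketch of the same computation on TensorFlow 2.x, where tf.Session and tf.gradients are gone in the default eager mode and gradients are taken with tf.GradientTape (an addition for newer TF versions, not part of the original answer):
import numpy as np
import tensorflow as tf

x = tf.constant(np.linspace(-np.pi, np.pi, 120), dtype=tf.float32)
with tf.GradientTape() as tape:
    tape.watch(x)          # constants are not watched automatically
    y = tf.math.sinh(x)    # y must be computed from x inside the tape
grads = tape.gradient(y, x)
print(grads.numpy())       # should match cosh(x), the derivative of sinh(x)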

Average ROC curve across folds for multi-class classification case in sklearn

I have a multi-class classification problem with 3 classes in total.
I am using LinearDiscriminantAnalysis for the classification and I want to plot the average ROC across KFolds (k = 5).
I am able to do this for a binary classification case but I cannot find a way to make it work for my multi-class case.
Below is my code for the binary case:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interp
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc

plt.style.use('ggplot')

X, y = make_classification(n_samples=500, random_state=100, flip_y=0.3)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
clf = LinearDiscriminantAnalysis()
pipe = Pipeline([('scaler', StandardScaler()), ('clf', clf)])

tprs = []
aucs = []
base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']

for i, (train, test) in enumerate(kf.split(X, y)):
    model = pipe.fit(X[train], y[train])
    y_score = model.predict_proba(X[test])
    fpr, tpr, _ = roc_curve(y[test], y_score[:, 1])
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    #plt.plot(fpr, tpr, lw=1, alpha=0.6, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc), c=colors[i])
    tpr = interp(base_fpr, fpr, tpr)
    tpr[0] = 0.0
    tprs.append(tpr)
tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
std = tprs.std(axis=0)
mean_auc = auc(base_fpr, mean_tprs)
std_auc = np.std(aucs)
tprs_upper = np.minimum(mean_tprs + std, 1)
tprs_lower = mean_tprs - std
plt.figure(figsize=(12, 8))
plt.plot(base_fpr, mean_tprs, 'b', alpha = 0.8, label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),)
plt.fill_between(base_fpr, tprs_lower, tprs_upper, color = 'blue', alpha = 0.2)
plt.plot([0, 1], [0, 1], linestyle = '--', lw = 2, color = 'r', label = 'Luck', alpha= 0.8)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")
plt.title('Receiver operating characteristic (ROC) curve')
#plt.axes().set_aspect('equal', 'datalim')
plt.show()
EDIT 1:
My attempt to make it work for the multiclass case using OneVsRestClassifier:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interp
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

plt.style.use('ggplot')
plt.figure(figsize=(12, 8))

X, y = make_classification(n_samples=500, random_state=100, n_classes=3, n_clusters_per_class=1, flip_y=0.3)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
clf = OneVsRestClassifier(LinearDiscriminantAnalysis())
pipe = Pipeline([('scaler', StandardScaler()), ('clf', clf)])

classes = np.unique(y)
y_true = label_binarize(y, classes=classes)
n_classes = y_true.shape[1]

base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']

fpr = dict()
tpr = dict()
roc_auc = dict()
fff = []
ttt = []
aucc = []

# Fit the model for each fold
for i, (train, test) in enumerate(kf.split(X, y)):
    model = pipe.fit(X[train], y[train])
    y_score = model.predict_proba(X[test])
    # Compute ROC curve and ROC area for each class PER FOLD
    for j in range(n_classes):
        fpr[j], tpr[j], _ = roc_curve(y_true[test][:, j], y_score[:, j])
        roc_auc[j] = auc(fpr[j], tpr[j])
    # First aggregate all false positive rates per class for each fold
    all_fpr = np.unique(np.concatenate([fpr[j] for j in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for j in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[j], tpr[j])
    # Finally average it and compute AUC for EACH FOLD
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    fff.append(all_fpr)
    ttt.append(mean_tpr)
    aucc.append(roc_auc["macro"])
# Compute average across Folds
fff = np.array(fff)
ttt = np.array(ttt)
aucc = np.array(aucc)
all_fpr_folds = np.unique(np.concatenate([fff[j] for j in range(kf.get_n_splits())]))
# Then interpolate all ROC curves at these points
mean_tpr_folds = np.zeros_like(all_fpr_folds)
for j in range(kf.get_n_splits()):
    mean_tpr_folds += interp(all_fpr_folds, fff[j], ttt[j])
# Finally average it and compute AUC
mean_tpr_folds /= float(kf.get_n_splits())
mean_mean_tpr_folds= mean_tpr_folds.mean(axis = 0)
std = mean_tpr_folds.std(axis=0)
tprs_upper = np.minimum(mean_mean_tpr_folds + std, 1)
tprs_lower = mean_mean_tpr_folds - std
plt.plot(all_fpr_folds, mean_tpr_folds, 'b', alpha = 0.8, label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (aucc.mean(), aucc.std()),)
plt.fill_between(all_fpr_folds, tprs_lower, tprs_upper, color = 'blue', alpha = 0.2)
plt.plot([0, 1], [0, 1], linestyle = '--', lw = 2, color = 'r', label = 'Luck', alpha= 0.8)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")
plt.title('Receiver operating characteristic (ROC) curve')
#plt.axes().set_aspect('equal', 'datalim')
plt.show()
I am missing something here...
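One possible culprit in the averaging step (a sketch under the assumption that the intent matches the binary case above, not a verified fix): mean_tpr_folds is already a single 1-D curve, so calling .mean(axis=0) on it collapses the curve to a scalar. Interpolating each fold's macro-average curve onto the fixed base_fpr grid first, exactly as the binary version does, gives fold curves that can be averaged and banded point-wise:
# Sketch: put every fold's macro-average curve on the common base_fpr grid.
fold_tprs = np.array([interp(base_fpr, fff[j], ttt[j])
                      for j in range(kf.get_n_splits())])
mean_curve = fold_tprs.mean(axis=0)  # point-wise mean across folds
std_curve = fold_tprs.std(axis=0)    # point-wise std for the shaded band

plt.plot(base_fpr, mean_curve, 'b', alpha=0.8,
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (aucc.mean(), aucc.std()))
plt.fill_between(base_fpr, mean_curve - std_curve,
                 np.minimum(mean_curve + std_curve, 1), color='blue', alpha=0.2)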

Plotting 3D Decision Boundary From Linear SVM

I've fit a 3-feature data set using sklearn.svm.SVC(). I can plot the point for each observation using matplotlib and Axes3D. I want to plot the decision boundary to see the fit. I've tried adapting the 2D examples for plotting the decision boundary, to no avail. I understand that clf.coef_ is a vector normal to the decision boundary. How can I plot this to see where it divides the points?
Here is an example on a toy dataset. Note that plotting in 3D is funky with matplotlib. Sometimes points that are behind the plane might appear as though they are in front of it, so you may have to fiddle with rotating the plot to ascertain what's going on.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.svm import SVC
rs = np.random.RandomState(1234)
# Generate some fake data.
n_samples = 200
# X is the input features by row.
X = np.zeros((n_samples, 3))
X[:n_samples//2] = rs.multivariate_normal( np.ones(3), np.eye(3), size=n_samples//2)
X[n_samples//2:] = rs.multivariate_normal(-np.ones(3), np.eye(3), size=n_samples//2)
# Y is the class labels for each row of X.
Y = np.zeros(n_samples); Y[n_samples//2:] = 1  # integer division so Python 3 indexing works
# Fit the data with an svm
svc = SVC(kernel='linear')
svc.fit(X,Y)
# The equation of the separating plane is given by all x in R^3 such that:
# np.dot(svc.coef_[0], x) + b = 0. We should solve for the last coordinate
# to plot the plane in terms of x and y.
z = lambda x,y: (-svc.intercept_[0]-svc.coef_[0][0]*x-svc.coef_[0][1]*y) / svc.coef_[0][2]
tmp = np.linspace(-2,2,51)
x,y = np.meshgrid(tmp,tmp)
# Plot stuff.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(x, y, z(x,y))
ax.plot3D(X[Y==0,0], X[Y==0,1], X[Y==0,2],'ob')
ax.plot3D(X[Y==1,0], X[Y==1,1], X[Y==1,2],'sr')
plt.show()
[Output: 3D scatter of the two classes with the fitted separating plane]
EDIT (key linear-algebra statement from the comment above):
The separating plane is the set of points x in R^3 satisfying np.dot(svc.coef_[0], x) + svc.intercept_[0] == 0. Solving for the last coordinate x[2] (the z axis) lets us plot the plane as a function of x and y.
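Written as a worked equation, with w = svc.coef_[0] and b = svc.intercept_[0]:
w_0 x + w_1 y + w_2 z + b = 0 \;\Longrightarrow\; z(x, y) = -\frac{b + w_0 x + w_1 y}{w_2}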
You cannot visualize the decision surface for many features: the dimensionality is too high, and there is no way to visualize an N-dimensional surface.
However, you can use 2 features and plot nice decision surfaces as follows.
I have also written an article about this here:
https://towardsdatascience.com/support-vector-machines-svm-clearly-explained-a-python-tutorial-for-classification-problems-29c539f3ad8?source=friends_link&sk=80f72ab272550d76a0cc3730d7c8af35
Case 1: 2D plot for 2 features and using the iris dataset
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

model = svm.SVC(kernel='linear')
clf = model.fit(X, y)
fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set-up grid for plotting.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()
Case 2: 3D plot for 2 features and using the iris dataset
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from mpl_toolkits.mplot3d import Axes3D
iris = datasets.load_iris()
X = iris.data[:, :3] # we only take the first three features.
Y = iris.target
#make it binary classification problem
X = X[np.logical_or(Y==0,Y==1)]
Y = Y[np.logical_or(Y==0,Y==1)]
model = svm.SVC(kernel='linear')
clf = model.fit(X, Y)
# The equation of the separating plane is given by all x so that np.dot(svc.coef_[0], x) + b = 0.
# Solve for w3 (z)
z = lambda x,y: (-clf.intercept_[0]-clf.coef_[0][0]*x -clf.coef_[0][1]*y) / clf.coef_[0][2]
tmp = np.linspace(-5,5,30)
x,y = np.meshgrid(tmp,tmp)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot3D(X[Y==0,0], X[Y==0,1], X[Y==0,2],'ob')
ax.plot3D(X[Y==1,0], X[Y==1,1], X[Y==1,2],'sr')
ax.plot_surface(x, y, z(x,y))
ax.view_init(30, 60)
plt.show()