Average ROC curve across folds for multi-class classification case in sklearn - matplotlib

I have a multi-class classification problem with 3 classes in total.
I am using LinearDiscriminantAnalysis for the classification and I want to plot the average ROC across KFolds (k = 5).
I am able to do this for a binary classification case but I cannot find a way to make it work for my multi-class case.
Below is my code for the binary case:
import matplotlib.pyplot as plt
import numpy as np
from itertools import cycle
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
plt.style.use('ggplot')
X, y = make_classification(n_samples=500, random_state=100, flip_y=0.3)
kf = KFold(n_splits = 5, shuffle = True, random_state= 0)
clf = LinearDiscriminantAnalysis()
pipe= Pipeline([('scaler', StandardScaler()), ('clf', clf)])
tprs = []
aucs = []
base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']
for i, (train, test) in enumerate(kf.split(X, y)):
    model = pipe.fit(X[train], y[train])
    y_score = model.predict_proba(X[test])
    fpr, tpr, _ = roc_curve(y[test], y_score[:, 1])
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    #plt.plot(fpr, tpr, lw=1, alpha=0.6, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc), c=colors[i])
    tpr = np.interp(base_fpr, fpr, tpr)
    tpr[0] = 0.0
    tprs.append(tpr)
tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
std = tprs.std(axis=0)
mean_auc = auc(base_fpr, mean_tprs)
std_auc = np.std(aucs)
tprs_upper = np.minimum(mean_tprs + std, 1)
tprs_lower = mean_tprs - std
plt.figure(figsize=(12, 8))
plt.plot(base_fpr, mean_tprs, 'b', alpha = 0.8, label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),)
plt.fill_between(base_fpr, tprs_lower, tprs_upper, color = 'blue', alpha = 0.2)
plt.plot([0, 1], [0, 1], linestyle = '--', lw = 2, color = 'r', label = 'Luck', alpha= 0.8)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")
plt.title('Receiver operating characteristic (ROC) curve')
#plt.axes().set_aspect('equal', 'datalim')
plt.show()
EDIT 1:
My attempt to make it work for the multiclass case using OneVsRestClassifier:
import matplotlib.pyplot as plt
import numpy as np
from itertools import cycle
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
plt.style.use('ggplot')
plt.figure(figsize=(12, 8))
X, y = make_classification(n_samples=500, random_state=100, n_classes=3,n_clusters_per_class=1, flip_y=0.3)
kf = KFold(n_splits = 5, shuffle = True, random_state= 0)
clf = OneVsRestClassifier(LinearDiscriminantAnalysis())
pipe= Pipeline([('scaler', StandardScaler()), ('clf', clf)])
classes = np.unique(y)
y_true = label_binarize(y, classes=classes)
n_classes = y_true.shape[1]
base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']
fpr = dict()
tpr = dict()
roc_auc = dict()
fff=[]
ttt=[]
aucc=[]
# Fit the model for each fold
for i, (train, test) in enumerate(kf.split(X, y)):
    model = pipe.fit(X[train], y[train])
    y_score = model.predict_proba(X[test])
    # Compute ROC curve and ROC area for each class PER FOLD
    for j in range(n_classes):
        fpr[j], tpr[j], _ = roc_curve(y_true[test][:, j], y_score[:, j])
        roc_auc[j] = auc(fpr[j], tpr[j])
    # First aggregate all false positive rates per class for this fold
    all_fpr = np.unique(np.concatenate([fpr[j] for j in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for j in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[j], tpr[j])
    # Finally average it and compute the macro AUC for EACH FOLD
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    fff.append(all_fpr)
    ttt.append(mean_tpr)
    aucc.append(roc_auc["macro"])
# Compute average across Folds
fff = np.array(fff)
ttt = np.array(ttt)
aucc = np.array(aucc)
all_fpr_folds = np.unique(np.concatenate([fff[j] for j in range(kf.get_n_splits())]))
# Then interpolate all ROC curves at these points
mean_tpr_folds = np.zeros_like(all_fpr_folds)
for j in range(kf.get_n_splits()):
    mean_tpr_folds += np.interp(all_fpr_folds, fff[j], ttt[j])
# Finally average it and compute AUC
mean_tpr_folds /= float(kf.get_n_splits())
mean_mean_tpr_folds= mean_tpr_folds.mean(axis = 0)
std = mean_tpr_folds.std(axis=0)
tprs_upper = np.minimum(mean_mean_tpr_folds + std, 1)
tprs_lower = mean_mean_tpr_folds - std
plt.plot(all_fpr_folds, mean_tpr_folds, 'b', alpha = 0.8, label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (aucc.mean(), aucc.std()),)
plt.fill_between(all_fpr_folds, tprs_lower, tprs_upper, color = 'blue', alpha = 0.2)
plt.plot([0, 1], [0, 1], linestyle = '--', lw = 2, color = 'r', label = 'Luck', alpha= 0.8)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")
plt.title('Receiver operating characteristic (ROC) curve')
#plt.axes().set_aspect('equal', 'datalim')
plt.show()
I am missing something here...
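EDIT 2: My current guess at what is missing, by analogy with the binary case: each fold's macro-average curve has to be interpolated onto the same fixed base_fpr grid first, so that the mean and standard deviation can be taken across folds (axis 0) rather than along a single averaged curve. An untested sketch:
# Resample every fold's macro-average curve onto the common base_fpr grid
macro_tprs = np.array([np.interp(base_fpr, f, t) for f, t in zip(fff, ttt)])
mean_macro_tpr = macro_tprs.mean(axis=0)  # average across folds
std_macro_tpr = macro_tprs.std(axis=0)    # spread across folds
mean_macro_auc = auc(base_fpr, mean_macro_tpr)
plt.plot(base_fpr, mean_macro_tpr, 'b', alpha=0.8,
         label=r'Mean macro ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_macro_auc, aucc.std()))
plt.fill_between(base_fpr,
                 mean_macro_tpr - std_macro_tpr,
                 np.minimum(mean_macro_tpr + std_macro_tpr, 1),
                 color='blue', alpha=0.2)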

Related

The code for running the model accuracy for kmeans clustering takes a long time to execute

I've used the k-means clustering algorithm to cluster the data, and then I try to measure accuracy with some classification algorithms such as decision tree, random forest, KNN, etc. After training, the model-accuracy step takes a long time to run. I've attached the code below.
# lets import the warnings library so that we can avoid warnings
import warnings
warnings.filterwarnings('ignore')
# Let's select the feature columns from the data
x = data.loc[:, ['Time', 'V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V18','V19','V20']].values
# let's check the shape of x
print(x.shape)
# lets convert this data into a dataframe
x_data = pd.DataFrame(x)
x_data.head()
km = KMeans(n_clusters = 2, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_means = km.fit_predict(x)
# lets find out the Results
a = data['Class']
y_means = pd.DataFrame(y_means)
z = pd.concat([y_means, a], axis = 1)
z = z.rename(columns = {0: 'cluster'})
# let's check the classes present in each cluster
print("Lets check the Results After Applying the K Means Clustering Analysis \n")
print("First Cluster:", z[z['cluster'] == 0]['Class'].unique())
print("---------------------------------------------------------------")
print("Second Cluster:", z[z['cluster'] == 1]['Class'].unique())
print("---------------------------------------------------------------")
from sklearn.cluster import KMeans
hc = KMeans(n_clusters=2)
y_her = hc.fit_predict(x)
# lets find out the Results
b = data['Class']
y_herr = pd.DataFrame(y_her)
w = pd.concat([y_herr, b], axis = 1)
w= w.rename(columns = {0: 'cluster'})
# let's check the classes present in each cluster
print("K-Means Clustering Analysis \n")
print("Zero Cluster:", w[w['cluster'] == 0]['Class'].unique())
print("---------------------------------------------------------------")
print("First Cluster:", w[w['cluster'] == 1]['Class'].unique())
print("---------------------------------------------------------------")
y = data['Class']
x = data.drop(['Class'], axis = 1)
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
print("The Shape of x train:", x_train.shape)
print("The Shape of x test:", x_test.shape)
print("The Shape of y train:", y_train.shape)
print("The Shape of y test:", y_test.shape)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from mlxtend.plotting import plot_confusion_matrix
def evaluator(y_test, y_pred):
    # Accuracy:
    print('Accuracy is: ', accuracy_score(y_test, y_pred))
    print('')
    # Classification Report:
    print('Classification Report: \n', classification_report(y_test, y_pred))
    print('Confusion Matrix: \n\n')
    plt.style.use("ggplot")
    cm = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(conf_mat=cm, figsize=(10, 10), show_normed=True)
    plt.title('Confusion Matrix for Logistic Regression', fontsize=15)
    plt.show()
# The part below is where the code runs for a long time.
model_accuracy = pd.DataFrame(columns=['Model', 'Accuracy'])
models = {
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    'RFC': RandomForestClassifier(),
    'GBC': GradientBoostingClassifier(),
    'XGB': XGBClassifier()
}
for test, clf in models.items():
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    train_pred = clf.predict(x_train)
    train_acc = accuracy_score(y_train, train_pred)
    print("\n", test + ' scores')
    print(acc)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print('*' * 100, "\n")
    # DataFrame.append was removed in pandas 2.0, so use pd.concat instead
    model_accuracy = pd.concat(
        [model_accuracy, pd.DataFrame([{'Model': test, 'Accuracy': acc, 'Train_acc': train_acc}])],
        ignore_index=True)
I want the detailed output as printed in the code. For the KNN algorithm the expected output is the accuracy, classification report, and confusion matrix (screenshots omitted here), and the same goes for the other algorithms.

I'm having problems with one-hot encoding

I am using logistic regression on a football dataset, but it seems that when I one-hot encode the home team names and away team names the model gets 100% accuracy; even with a train_test_split I still get 100%. What am I doing wrong?
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
df = pd.read_csv("FIN.csv")
df['Date'] = pd.to_datetime(df["Date"])
df = df[(df["Date"] > '2020/04/01')]
df['BTTS'] = np.where((df.HG > 0) & (df.AG > 0), 1, 0)
#print(df.to_string())
df.dropna(inplace=True)
x = df[['Home', 'Away', 'Res', 'HG', 'AG', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values
y = df['BTTS'].values
np.set_printoptions(threshold=np.inf)
model = LogisticRegression()
ohe = OneHotEncoder(categories=[df.Home, df.Away, df.Res], sparse=False)
x = ohe.fit_transform(x)
print(x)
model.fit(x, y)
print(model.score(x, y))
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))
y_pred = model.predict(x_test)
print("accuracy:",
accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))
print("f1 score:", f1_score(y_test, y_pred))
Overfitting would be a situation where your training accuracy is very high and your test accuracy is very low. It's "overfitting" because the model essentially just memorizes the training outcomes and doesn't fit well on new, unseen data.
The reason you are getting 100% accuracy is precisely as I stated in the comments: there is (for lack of a better term) data leakage, and you are essentially allowing your model to "cheat". Your target variable y ('BTTS') is feature engineered from the data itself. It is derived from 'HG' and 'AG', and is therefore 100% determined by your features. You define 'BTTS' as 1 when both 'HG' and 'AG' are greater than 0, and then you include those two columns in your training data. So the model simply picks up that obvious association (i.e., when the home goals are 1 or more and the away goals are 1 or more, both teams scored).
Once the model sees those two values greater than 0, it predicts 1; if one of those values is 0, it predicts 0.
Drop 'HG' and 'AG' from the x (features).
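For example, a minimal sketch of the feature selection without the leaking columns (the same columns as in your code, minus 'HG' and 'AG'):
x = df[['Home', 'Away', 'Res', 'PH', 'PD', 'PA',
        'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values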
Once we remove those 2 columns, you'll see a more realistic performance (albeit poor - slightly better than a flip of the coin) here:
1.0
0.5625
accuracy: 0.5625
precision: 0.6666666666666666
recall: 0.4444444444444444
f1 score: 0.5333333333333333
With the Confusion Matrix:
from sklearn.metrics import confusion_matrix
labels = np.unique(y).tolist()
cf_matrixGNB = confusion_matrix(y_test, y_pred, labels=labels)
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(cf_matrixGNB, annot=True, cmap='Blues')
ax.set_title('Confusion Matrix\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
plt.show()
Another option would be to make a calculated field 'Total_Goals' and see whether it can predict from that. Obviously it again gets some help from the obvious cases (if 'Total_Goals' is 0 or 1, then 'BTTS' must be 0), but when 'Total_Goals' is 2 or more, it has to rely on the other features to work out whether one of the teams was shut out.
Here's that example:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
df = pd.read_csv("FIN.csv")
df['Date'] = pd.to_datetime(df["Date"])
df = df[(df["Date"] > '2020/04/01')]
df['BTTS'] = np.where((df.HG > 0) & (df.AG > 0), 1, 0)
#print(df.to_string())
df.dropna(inplace=True)
df['Total_Goals'] = df['HG'] + df['AG']
x = df[['Home', 'Away', 'Res', 'Total_Goals', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values
y = df['BTTS'].values
np.set_printoptions(threshold=np.inf)
model = LogisticRegression()
ohe = OneHotEncoder(sparse=False)
x = ohe.fit_transform(x)
#print(x)
model.fit(x, y)
print(model.score(x, y))
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))
y_pred = model.predict(x_test)
print("accuracy:",
accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))
print("f1 score:", f1_score(y_test, y_pred))
from sklearn.metrics import confusion_matrix
labels = np.unique(y).tolist()
cf_matrixGNB = confusion_matrix(y_test, y_pred, labels=labels)
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(cf_matrixGNB, annot=True, cmap='Blues')
ax.set_title('Confusion Matrix\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
plt.show()
Output:
1.0
0.8
accuracy: 0.8
precision: 0.8536585365853658
recall: 0.7777777777777778
f1 score: 0.8139534883720929
To predict on new data, you need the new data in the same form as the training data, and you also need to apply any transformations fit on the training data to the new data:
new_data = pd.DataFrame(
    data=[['Haka', 'Mariehamn', 3.05, 3.66, 2.35, 3.05, 3.66, 2.52, 2.88, 3.48, 2.32]],
    columns=['Home', 'Away', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']
)
to_predict = new_data[['Home', 'Away', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']]
to_predict_encoded = ohe.transform(to_predict)
prediction = model.predict(to_predict_encoded)
prediction_prob = model.predict_proba(to_predict_encoded)
print(f'Predict: {prediction[0]} with {prediction_prob[0][0]} probability.')
Output:
Predict: 0 with 0.8204957018099501 probability.
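One caveat worth flagging (an assumption on my part, not something from the original post): if a new fixture contains a team name the encoder never saw during fit, ohe.transform will raise an error. Creating the encoder with handle_unknown='ignore' makes it encode unseen categories as all zeros instead:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')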

Show evaluation metrics in decision boundary plot

I am working on imbalanced classification. I want to add the g-mean and accuracy to my decision boundary plot; it would be nice to see how these scoring metrics differ across the plots. I don't see any option to compute these scores within the decision boundary plot itself. Is there a way I can add this extra information to my decision boundary plot? I appreciate your time. Thanks!
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from mlxtend.plotting import plot_decision_regions
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score
gmean = make_scorer(geometric_mean_score, greater_is_better=True)
scoring = {'G-mean': gmean, 'Accuracy':'accuracy'}
X, y = make_blobs(n_samples=[1000, 10],centers=[[0.0, 0.0], [2.0, 2.0]],cluster_std= [1.5, 0.5],random_state=0, shuffle=False)
clf1 = LogisticRegression(max_iter=100000)
clf2 = LogisticRegression(class_weight="balanced",max_iter=100000)
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10,8))
labels = ['Logistic Regression', 'Weighted Logistic Regression']
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()
You can use plt.text() to add the g-mean and accuracy to your decision boundary plot.
For example:
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(15, 8))
labels = ['Logistic Regression', 'Weighted Logistic Regression']
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    ax.text(6, 4, "gmean : ", fontsize=10)
    ax.text(6, 2, "accuracy : ", fontsize=10)
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.show()
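To fill in the actual numbers, one option (a sketch, assuming the scores you want shown are computed on the same data the classifiers were fit on) is to compute them per classifier inside the loop and format them into the text:
from sklearn.metrics import accuracy_score

gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(15, 8))
for clf, lab, grd in zip([clf1, clf2],
                         labels,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    gm = geometric_mean_score(y, y_pred)  # from imblearn, imported in the question
    acc = accuracy_score(y, y_pred)
    ax = plt.subplot(gs[grd[0], grd[1]])
    plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    ax.text(6, 4, "gmean: %.3f" % gm, fontsize=10)
    ax.text(6, 2, "accuracy: %.3f" % acc, fontsize=10)
    plt.title(lab)
plt.show()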

Calculate MAE, MSE and R2 metrics for a DNNRegressor model

I have a DNNRegressor model and I want to calculate some metrics to understand how well my model is predicting. How can I calculate the mean absolute error (MAE), mean squared error (MSE) and the R squared coefficient?
So far I only have the loss so can someone help me calculate MAE, MSE and R2?
# Imports
import itertools
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sns
from matplotlib import pyplot as plt
import statsmodels.api as sm
COLUMNS = ['Prot', 'Gra', 'Cen', 'Sal', 'TVN', 'Velocidad_Prensa']
FEATURES = ['Prot', 'Gra', 'Cen', 'Sal', 'TVN']
LABEL = ['Velocidad_Prensa']
def get_input_fn(data_set, num_epochs=None, shuffle=True):
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
        y=pd.Series(data_set[LABEL].values),
        num_epochs=num_epochs,
        shuffle=shuffle)
training_set = pd.read_csv("prensa train.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
test_set = pd.read_csv("prensa eval.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
training_set.head()
# Model
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
                                      activation_fn=tf.nn.relu,
                                      hidden_units=[200, 100, 50, 25, 12])
# Reset the index of training
training_set.reset_index(drop = True, inplace =True)
def input_fn(data_set, pred=False):
    if pred == False:
        feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
        labels = tf.constant(data_set[LABEL].values)
        return feature_cols, labels
    if pred == True:
        feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
        return feature_cols
# Deep Neural Network Regressor with the training set which contain the data split by train test split
regressor.train(input_fn=lambda: input_fn(training_set), steps=2000)
# Evaluation on the test set created by train_test_split
ev = regressor.evaluate(input_fn=lambda: input_fn(test_set), steps=1)
# Display the score on the testing set
loss_score1 = ev["loss"]
print("Final Loss on the testing set: {0:f}".format(loss_score1))
# NOTE: this redefines the input_fn above; this version is used for the
# interactive prediction below
def input_fn(features, batch_size=256):
    return tf.data.Dataset.from_tensor_slices(dict(features)).batch(batch_size)
features = ['Prot', 'Gra', 'Cen', 'Sal','TVN']
predict = {}
print("Ingresar caracterĂ­sticas quĂ­micas de la materia prima")
for feature in features:
    valid = True
    while valid:
        val = input(feature + ": ")
        if not val.isdigit(): valid = False
    predict[feature] = [float(val)]
predictions = regressor.predict(input_fn=lambda: input_fn(predict))
for pred_dict in predictions:
    print(pred_dict)
sklearn.metrics has dedicated scoring functions for each of the metrics you are asking for.
Just do the following (a sketch: it assumes the batched input_fn and the test_set from your question, and it extracts the numeric values from the generator of dicts that DNNRegressor.predict returns):
# Import metrics
from sklearn import metrics
# Make predictions on the held-out features and pull out the numeric values
predictions = regressor.predict(input_fn=lambda: input_fn(test_set[FEATURES]))
y_pred = np.array([p['predictions'][0] for p in predictions])
# Ground-truth labels for the same rows
y_true = test_set[LABEL].values.ravel()
# Calculate MAE, MSE, R2
print('MAE:', metrics.mean_absolute_error(y_true, y_pred))
print('MSE:', metrics.mean_squared_error(y_true, y_pred))
print('R2:', metrics.r2_score(y_true, y_pred))

I can't get the output I want with Lasso Regression using the Sklearn library

I'm trying to set up a regression model on a random sample data set, but whatever alpha value I try, the predicted output comes out as a straight line every time. Below you can see my code and a comparison of the outputs. Where do you think I am going wrong?
#Importing libraries.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
#Define input array with angles from 60deg to 300deg converted to radians
x = np.array([i*np.pi/180 for i in range(60,300,4)])
np.random.seed(10) #Setting seed for reproducibility
y = np.sin(x) + np.random.normal(0,0.15,len(x))
data = pd.DataFrame(np.column_stack([x,y]),columns=['x','y'])
X = data['x']
Y = data['y']
X = X.values.reshape(-1,1)
Y = Y.values.reshape(-1,1)
#Lasso regression
model = Lasso(alpha=0.001) #Alpha = 0.001
model.fit(X,Y)
Y_predicted_lasso = model.predict(X)
#Plot
plt.scatter(X,Y)
plt.plot(X,Y_predicted_lasso,'r')
plt.show()
(Images: comparison plots of the scatter data against the straight-line Lasso fit.)
Here is what I mean by my comment above: with only x as a feature, a linear model such as Lasso can only produce a straight line, so you need to add polynomial terms of x for it to follow the sine shape:
#Importing libraries.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
#Define input array with angles from 60deg to 300deg converted to radians
x = np.array([i*np.pi/180 for i in range(60,300,4)])
np.random.seed(10) #Setting seed for reproducibility
y = np.sin(x) + np.random.normal(0,0.15,len(x))
x2 = np.power(x, 2)
x3 = np.power(x, 3)
x4 = np.power(x, 4)
data = pd.DataFrame(np.column_stack([x, x2, x3, x4, y]), columns=["x", "x2", "x3", "x4", "y"])
X = data[["x", "x2", "x3", "x4"]]
Y = data["y"]
# X = X.values.reshape(-1,1)
Y = Y.values.reshape(-1, 1)
# Lasso regression
model = Lasso(alpha=0.00001, max_iter=30_000)  # much smaller alpha, more iterations so coordinate descent converges
model.fit(X, Y)
Y_predicted_lasso = model.predict(X)
# Plot
plt.scatter(X.x, Y)
plt.plot(X.x, Y_predicted_lasso, "r")
plt.show()
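The same idea can be written more compactly with a pipeline. A sketch (PolynomialFeatures is my choice here, not something from the original post):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree-4 polynomial expansion of x, then the same Lasso as above
poly_model = make_pipeline(PolynomialFeatures(degree=4),
                           Lasso(alpha=0.00001, max_iter=30_000))
poly_model.fit(x.reshape(-1, 1), y)
y_poly_pred = poly_model.predict(x.reshape(-1, 1))

plt.scatter(x, y)
plt.plot(x, y_poly_pred, "r")
plt.show()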