Couldn't convert string to float - pandas

I have been trying out this code:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("UsedCarsSA_Unclean_EN.csv")
data.head()
print(data)
data.isnull().sum()
data.info()
data.describe()
data.describe(include="all")
data.Carname.unique()
sns.set_style("darkgrid")
plt.figure(figsize=(15, 10))
sns.distplot(data.Price)
# plt.show()
print(data.corr())
plt.figure(figsize=(20, 15))
correlations = data.corr()
sns.heatmap(correlations, cmap="coolwarm", annot=True)
# plt.show()
predict = "Price"
data = data[["Make", "Carname", "Year",
"Origin", "Color", "Options",
"Engine_Size", "Fuel_Type", "Gear_Type",
"Condition", "Mileage", "Region",
"Price", "0" ]]
x = np.array(data.drop([predict], axis=1))
y = np.array(data[predict])
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
from sklearn.tree import DecisionTreeRegressor
print(type(xtrain))
print(type(ytrain))
model = DecisionTreeRegressor()
model.fit(xtrain, ytrain)
predictions = model.predict(xtest)
from sklearn.metrics import mean_absolute_error
model.score(xtest, predictions)
and the error in
model.fit(xtrain, ytrain)
says "could not convert string to float: 'Hyundai'".
It is basically meant to train on the data to get the best price.
I tried the float() function and it doesn't work; pd.to_numeric doesn't work either.
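A likely cause is that columns such as Make, Carname, Origin, Color, Options, Fuel_Type, Gear_Type, Condition and Region contain strings, which scikit-learn estimators cannot consume directly. A minimal sketch of one common workaround, assuming the column names from the question, is to one-hot encode the categorical columns with pd.get_dummies before splitting:
# Hypothetical sketch: encode string columns before fitting (column names assumed from the question).
categorical_cols = ["Make", "Carname", "Origin", "Color", "Options",
                    "Fuel_Type", "Gear_Type", "Condition", "Region"]
encoded = pd.get_dummies(data, columns=categorical_cols)   # one indicator column per category
x = encoded.drop(columns=[predict]).to_numpy()             # all remaining features are numeric
y = encoded[predict].to_numpy()
If Mileage or Engine_Size also contain non-numeric strings in the unclean CSV, they would need cleaning first, e.g. with pd.to_numeric(..., errors='coerce').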

Related

The code for running the model accuracy for kmeans clustering takes a long time to execute

I've used the k-means clustering algorithm for training the data and then try to get accuracy with some classification algorithms such as decision tree, random forest, KNN, etc. After training, the model-accuracy step takes a long time to run. I've attached the code below.
# lets import the warnings library so that we can avoid warnings
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Lets select the Time and V1-V20 columns from the data
x = data.loc[:, ['Time', 'V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V18','V19','V20']].values
# let's check the shape of x
print(x.shape)
# lets convert this data into a dataframe
x_data = pd.DataFrame(x)
x_data.head()
km = KMeans(n_clusters = 2, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_means = km.fit_predict(x)
# lets find out the Results
a = data['Class']
y_means = pd.DataFrame(y_means)
z = pd.concat([y_means, a], axis = 1)
z = z.rename(columns = {0: 'cluster'})
# lets check which classes fall into each cluster
print("Lets check the Results After Applying the K Means Clustering Analysis \n")
print("First Cluster:", z[z['cluster'] == 0]['Class'].unique())
print("---------------------------------------------------------------")
print("Second Cluster:", z[z['cluster'] == 1]['Class'].unique())
print("---------------------------------------------------------------")
hc = KMeans(n_clusters = 2)
y_her= hc.fit_predict(x)
# lets find out the Results
b = data['Class']
y_herr = pd.DataFrame(y_her)
w = pd.concat([y_herr, b], axis = 1)
w= w.rename(columns = {0: 'cluster'})
# lets check which classes fall into each cluster
print("K-Means Clustering Analysis \n")
print("Zero Cluster:", w[w['cluster'] == 0]['Class'].unique())
print("---------------------------------------------------------------")
print("First Cluster:", w[w['cluster'] == 1]['Class'].unique())
print("---------------------------------------------------------------")
y = data['Class']
x = data.drop(['Class'], axis = 1)
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
print("The Shape of x train:", x_train.shape)
print("The Shape of x test:", x_test.shape)
print("The Shape of y train:", y_train.shape)
print("The Shape of y test:", y_test.shape)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from mlxtend.plotting import plot_confusion_matrix
def evaluator(y_test, y_pred):
    # Accuracy:
    print('Accuracy is: ', accuracy_score(y_test, y_pred))
    print('')
    # Classification Report:
    print('Classification Report: \n', classification_report(y_test, y_pred))
    print('Confusion Matrix: \n\n')
    plt.style.use("ggplot")
    cm = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(conf_mat = cm, figsize=(10, 10), show_normed=True)
    plt.title('Confusion Matrix for Logistic Regression', fontsize = 15)
    plt.show()
# The part below is where the code runs for a long time.
model_accuracy = pd.DataFrame(columns=['Model','Accuracy'])
models = {
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RFC": RandomForestClassifier(),
    "GBC": GradientBoostingClassifier(),
    "XGB": XGBClassifier()
}
for test, clf in models.items():
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    train_pred = clf.predict(x_train)
    train_acc = accuracy_score(y_train, train_pred)
    print("\n", test + ' scores')
    print(acc)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print('*' * 100, "\n")
    model_accuracy = model_accuracy.append({'Model': test, 'Accuracy': acc, 'Train_acc': train_acc}, ignore_index=True)
I want the detailed output described in the code for the KNN algorithm, and likewise for the other algorithms.
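The training loop itself is the slow part: GradientBoostingClassifier and XGBClassifier in particular can be expensive on a large dataset. A minimal sketch of two common ways to cut the runtime, assuming the same x_train/x_test/y_train/y_test as above, is to parallelise the estimators that support it and to time each fit so the slowest model is easy to spot (illustrative only, not taken from the question):
import time
# Hypothetical sketch: use all CPU cores where the estimator supports n_jobs,
# and time each fit to see which model dominates the runtime.
timed_models = {
    "KNN": KNeighborsClassifier(n_jobs=-1),
    "RFC": RandomForestClassifier(n_jobs=-1),
    "XGB": XGBClassifier(n_jobs=-1),
}
for name, clf in timed_models.items():
    start = time.time()
    clf.fit(x_train, y_train)
    print(f"{name}: trained in {time.time() - start:.1f} s, "
          f"test accuracy {clf.score(x_test, y_test):.3f}")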

I'm having problems with one-hot encoding

I am using logistic regression on a football dataset, but it seems that when I one-hot encode the home and away team names, the model gets 100% accuracy; even with a train_test_split I still get 100%. What am I doing wrong?
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
df = pd.read_csv("FIN.csv")
df['Date'] = pd.to_datetime(df["Date"])
df = df[(df["Date"] > '2020/04/01')]
df['BTTS'] = np.where((df.HG > 0) & (df.AG > 0), 1, 0)
#print(df.to_string())
df.dropna(inplace=True)
x = df[['Home', 'Away', 'Res', 'HG', 'AG', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values
y = df['BTTS'].values
np.set_printoptions(threshold=np.inf)
model = LogisticRegression()
ohe = OneHotEncoder(categories=[df.Home, df.Away, df.Res], sparse=False)
x = ohe.fit_transform(x)
print(x)
model.fit(x, y)
print(model.score(x, y))
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))
y_pred = model.predict(x_test)
print("accuracy:",
accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))
print("f1 score:", f1_score(y_test, y_pred))
Overfitting would be a situation where your training accuracy is very high and your test accuracy is very low. That means it is "overfitting" because it essentially just learns the outcomes in the training set but doesn't fit well on new, unseen data.
The reason you are getting 100% accuracy is precisely as I stated in the comments: there's (for lack of a better term) data leakage. You are essentially allowing your model to "cheat". Your target variable y ('BTTS') is feature engineered from the data: it is derived from 'HG' and 'AG', which are therefore perfectly (100%) correlated/associated with your target. You define 'BTTS' as 1 when both 'HG' and 'AG' are greater than 0, and then you include those 2 columns in your training data. So the model simply picked up that obvious association (i.e., when home goals are 1 or more and away goals are 1 or more -> both teams scored).
Once the model sees both of those values greater than 0, it predicts 1; if either of those values is 0, it predicts 0.
Drop 'HG' and 'AG' from the x (features).
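A minimal sketch of that change, assuming the same DataFrame and feature list as in the question:
# Sketch: same feature selection as before, minus the leaking 'HG' and 'AG' columns.
x = df[['Home', 'Away', 'Res', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values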
Once we remove those 2 columns, you'll see a more realistic performance (albeit poor - slightly better than a flip of the coin) here:
1.0
0.5625
accuracy: 0.5625
precision: 0.6666666666666666
recall: 0.4444444444444444
f1 score: 0.5333333333333333
With the Confusion Matrix:
from sklearn.metrics import confusion_matrix
labels = np.unique(y).tolist()
cf_matrixGNB = confusion_matrix(y_test, y_pred, labels=labels)
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(cf_matrixGNB, annot=True, cmap='Blues')
ax.set_title('Confusion Matrix\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
plt.show()
Another option would be to add a calculated field 'Total_Goals' and see if the model can predict from that. Obviously it again gets a little help in the obvious cases (if 'Total_Goals' is 0 or 1, then 'BTTS' must be 0), but if 'Total_Goals' is 2 or more, it has to rely on the other features to work out whether one of the teams was shut out.
Here's that example:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
df = pd.read_csv("FIN.csv")
df['Date'] = pd.to_datetime(df["Date"])
df = df[(df["Date"] > '2020/04/01')]
df['BTTS'] = np.where((df.HG > 0) & (df.AG > 0), 1, 0)
#print(df.to_string())
df.dropna(inplace=True)
df['Total_Goals'] = df['HG'] + df['AG']
x = df[['Home', 'Away', 'Res', 'Total_Goals', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']].values
y = df['BTTS'].values
np.set_printoptions(threshold=np.inf)
model = LogisticRegression()
ohe = OneHotEncoder(sparse=False)
x = ohe.fit_transform(x)
#print(x)
model.fit(x, y)
print(model.score(x, y))
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))
y_pred = model.predict(x_test)
print("accuracy:",
accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))
print("f1 score:", f1_score(y_test, y_pred))
from sklearn.metrics import confusion_matrix
labels = np.unique(y).tolist()
cf_matrixGNB = confusion_matrix(y_test, y_pred, labels=labels)
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(cf_matrixGNB, annot=True, cmap='Blues')
ax.set_title('Confusion Matrix\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
plt.show()
Output:
1.0
0.8
accuracy: 0.8
precision: 0.8536585365853658
recall: 0.7777777777777778
f1 score: 0.8139534883720929
To predict on new data, you need the new data in the same form as the training data. You then also need to apply any transformations you fit on the training data to the new data:
new_data = pd.DataFrame(
    data = [['Haka', 'Mariehamn', 3.05, 3.66, 2.35, 3.05, 3.66, 2.52, 2.88, 3.48, 2.32]],
    columns = ['Home', 'Away', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']
)
to_predict = new_data[['Home', 'Away', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA']]
to_predict_encoded = ohe.transform(to_predict)
prediction = model.predict(to_predict_encoded)
prediction_prob = model.predict_proba(to_predict_encoded)
print(f'Predict: {prediction[0]} with {prediction_prob[0][0]} probability.')
Output:
Predict: 0 with 0.8204957018099501 probability.

How to match dimensions in CNN

I'm trying to build a CNN where the goal is to predict the label from 3 features, but it is giving a dimension error.
Could someone help me?
Updated after comments from @M.Innat.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
from tensorflow.keras.models import Sequential, load_model
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
import tensorflow as tf
import random
# Create data
n = 8500
l = [2, 3, 4, 5,6]
k = int(np.ceil(n/len(l)))
labels = [item for item in l for i in range(k)]
random.shuffle(labels)
labels =np.array(labels)
label_unique = np.unique(labels)
x = np.linspace(613000, 615000, num=n) + np.random.uniform(-5, 5, size=n)
y = np.linspace(7763800, 7765800, num=n) + np.random.uniform(-5, 5, size=n)
z = np.linspace(1230, 1260, num=n) + np.random.uniform(-5, 5, size=n)
X = np.column_stack((x,y,z))
Y = labels
# Split the dataset into training and testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)
seq_len=len(X_train)
n_features=len(X_train[0])
droprate=0.1
exit_un=len(label_unique)
print('n_features: {} \n seq_len: {} \n exit_un: {}'.format(n_features,seq_len,exit_un))
X_train = X_train[..., None][None, ...] # add channel axis + batch axis
Y_train = pd.get_dummies(Y_train) # transform to one-hot encoded
drop_prob = 0.5
my_model = Sequential()
my_model.add(Conv2D(input_shape=(seq_len,n_features,1),filters=32,kernel_size=(3,3),padding='same',activation="relu")) # 1 channel of grayscale.
my_model.add(MaxPooling2D(pool_size=(2,1)))
my_model.add(Conv2D(filters=64,kernel_size=(5,5), padding='same',activation="relu"))
my_model.add(MaxPooling2D(pool_size=(2,1)))
my_model.add(Flatten())
my_model.add(Dense(units = 1024, activation="relu"))
my_model.add(Dropout(rate=drop_prob))
my_model.add(Dense(units = exit_un, activation="softmax"))
n_epochs = 100
batch_size = 10
learn_rate = 0.005
# Define the optimizer and then compile.
my_optimizer = Adam(learning_rate=learn_rate)
my_model.compile(loss = "categorical_crossentropy", optimizer = my_optimizer, metrics=['categorical_crossentropy','accuracy'])
my_summary = my_model.fit(X_train, Y_train, epochs=n_epochs, batch_size = batch_size, verbose = 1)
The error I have is:
ValueError: Data cardinality is ambiguous:
x sizes: 1
y sizes: 5950
Make sure all arrays contain the same number of samples.
You're passing the input samples without the channel axis and also without the batch axis. Also, according to your loss function, you should transform your integer labels into one-hot encoding.
exit_un=len(label_unique)
drop_prob = 0.5
X_train = X_train[..., None][None, ...] # add channel axis + batch axis
X_train = np.repeat(X_train, repeats=100, axis=0) # batch-ing
Y_train = np.repeat(Y_train, repeats=100, axis=0) # batch-ing
Y_train = pd.get_dummies(Y_train) # transform to one-hot encoded
print(X_train.shape, Y_train.shape)
my_model = Sequential()
...
Update
Based on the discussion, it seems you need a Conv1D operation in the model and need to reshape your samples as mentioned in the comment. Here is the colab; it should work now.
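For reference, a minimal Conv1D sketch under the assumption that X_train and Y_train are the raw arrays from train_test_split, with each sample treated as a length-3 sequence with one channel (the hyperparameters are illustrative and not taken from the linked colab):
# Hypothetical sketch: treat each sample as (n_features, 1) and use Conv1D instead of Conv2D.
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential

X_train_1d = X_train.reshape(-1, n_features, 1)                 # (samples, 3, 1)
Y_train_oh = pd.get_dummies(Y_train).to_numpy(dtype="float32")  # one-hot labels, shape (samples, 5)

model_1d = Sequential([
    Conv1D(filters=32, kernel_size=2, padding='same', activation='relu',
           input_shape=(n_features, 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(droprate),
    Dense(exit_un, activation='softmax'),
])
model_1d.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_1d.fit(X_train_1d, Y_train_oh, epochs=10, batch_size=32, verbose=1)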

What format do these lists have to be in to be accepted by Keras Tuners Search function?

This code reads in a set of training and testing guitar JPG images for the neural net to learn from and test on.
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import random
DATADIR = "C:/Users/TheKid/Data/DataMiningProject/DataSet"
CATEGORIES = ["Fender_Jazzmaster", "Gibson_ES"]
CATEGORIES2 = ["Test"]
for category in CATEGORIES:
    path = os.path.join(DATADIR, category)
    for img in os.listdir(path):
        img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
IMG_SIZE = 70
new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
training_data = []
def create_training_data():
    for category in CATEGORIES:
        path = os.path.join(DATADIR, category)
        class_num = CATEGORIES.index(category)
        for img in os.listdir(path):
            img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
            new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            training_data.append([new_array, class_num])
create_training_data()
print(len(training_data))
random.shuffle(training_data)
X = []
y = []
for features, label in training_data:
    X.append(features)
    y.append(label)
X = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
for category in CATEGORIES2:
    path2 = os.path.join(DATADIR, category)
    for img in os.listdir(path2):
        img_array2 = cv2.imread(os.path.join(path2, img), cv2.IMREAD_GRAYSCALE)
IMG_SIZE = 70
new_array2 = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
testing_data = []
def create_testing_data():
    for category in CATEGORIES2:
        path2 = os.path.join(DATADIR, category)
        class_num2 = CATEGORIES2.index(category)
        for img in os.listdir(path2):
            img_array2 = cv2.imread(os.path.join(path2, img), cv2.IMREAD_GRAYSCALE)
            new_array2 = cv2.resize(img_array2, (IMG_SIZE, IMG_SIZE))
            testing_data.append([new_array2, class_num2])
create_testing_data()
print(len(testing_data))
random.shuffle(testing_data)
X2 = []
y2 = []
for features, label in testing_data:
    X2.append(features)
    y2.append(label)
X2 = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
import pickle
pickle_out = open("X.pickle" , "wb")
pickle.dump(X, pickle_out)
pickle_out.close()
pickle_out = open("y.pickle" , "wb")
pickle.dump(y, pickle_out)
pickle_out.close()
pickle_in = open("X.pickle", "rb")
X = pickle.load(pickle_in)
pickle_out = open("X2.pickle" , "wb")
pickle.dump(X2, pickle_out)
pickle_out.close()
pickle_out = open("y2.pickle" , "wb")
pickle.dump(y2, pickle_out)
pickle_out.close()
pickle_in = open("X2.pickle", "rb")
X = pickle.load(pickle_in)
This next bit of code takes in the pickle files saved by the previous code and is supposed to use Keras Tuner's search function to run different variants of the neural net with different numbers of conv layers, layer sizes, etc., so I can choose the most efficient version. But when run, this error gets thrown:
ValueError: Data cardinality is ambiguous:
x sizes: 1312
y sizes: 12
Please provide data which shares the same first dimension.
The shapes of all the variables are:
x_train = (1312, 70, 70, 1)
y_train = (1312,)
x_test = (1312, 70, 70, 1)
y_test = (12,)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import TensorBoard
import numpy as np
import time
import pickle
import matplotlib.pyplot as plt
from tensorflow import keras
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
pickle_in = open("X.pickle","rb")
x_train = pickle.load(pickle_in)
pickle_in = open("y.pickle","rb")
y_train = pickle.load(pickle_in)
pickle_in = open("X2.pickle","rb")
x_test = pickle.load(pickle_in)
pickle_in = open("y2.pickle","rb")
y_test = pickle.load(pickle_in)
x_train=np.array(x_train/255.0)
y_train=np.array(y_train)
x_test=np.array(x_test/255.0)
y_test=np.array(y_test)
LOG_DIR = f"{int(time.time())}"
def build_model(hp):
    model = keras.models.Sequential()
    model.add(Conv2D(hp.Int("input_units", 32, 256, 32), (3, 3), input_shape=x_train.shape[1:]))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    for i in range(hp.Int("n_layers", 1, 4)):
        model.add(Conv2D(hp.Int(f"conv-{i}_units", 32, 256, 32), (3, 3)))
        model.add(Activation('relu'))
    model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
    model.add(Dense(10))
    model.add(Activation("softmax"))
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model
tuner = RandomSearch(
    build_model,
    objective = "val_accuracy",
    max_trials = 1,
    executions_per_trial = 1,
    directory = LOG_DIR)
tuner.search(x=x_train,
             y=y_train,
             epochs=1,
             batch_size=64,
             validation_data=(x_test, y_test))
with open(f"tuner_{int(time.time())}.pkl", "wb") as f:
pickle.dump(tuner, f)
tuner = pickle.load(open(""))
print(tuner.get_best_hyperparameters()[0].values)
How would I go about resolving this error? It seems like a matrix formatting issue to me, but I have little experience dealing with a problem like this.
As the error message and the shapes of the data (x_test and y_test) clearly suggest, you have 1312 rows in x_test and 12 rows in y_test, and you are feeding this data to validation_data=(x_test, y_test).
Pass the same number of rows of data for x_test and y_test in validation_data and this should fix your error.
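A likely source of the mismatch in the question's preprocessing code is that X2 is reshaped from X and X2.pickle is later loaded back into X, so the test features never actually come from the test images. A minimal sketch of the consistent version, assuming the same variable names as in the question (this is an assumed fix, not part of the original answer):
# Sketch: build, pickle and reload the test set from X2/y2 throughout.
X2 = np.array(X2).reshape(-1, IMG_SIZE, IMG_SIZE, 1)   # was np.array(X) in the question

pickle_out = open("X2.pickle", "wb")
pickle.dump(X2, pickle_out)
pickle_out.close()

pickle_out = open("y2.pickle", "wb")
pickle.dump(y2, pickle_out)
pickle_out.close()

pickle_in = open("X2.pickle", "rb")
X2 = pickle.load(pickle_in)                             # was loaded into X in the question
With that change x_test and y_test both describe the 12 test images, so they share the same first dimension.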

Getting Errors with StandardScaler Python

I am trying to scale my training and test data for a logistic regression, but an error popped up.
I implemented the answer from this Stack Overflow question: How to standard scale a 3D matrix?
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
df = pd.read_csv('GonzagaTakers.csv')
x_df=df.loc[:, df.columns !='Remarks_P']
features = x_df.keys()
target = 'Remarks_P'
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
scalers ={}
for i in range(X_train.shape[1]):
    scalers[i] = StandardScaler()
    X_train[:, :, i] = scalers[i].fit_transform(X_train[:, :, i])
for i in range(y_test.shape[1]):
    y_test[:, :, i] = scalers[i].fit_transform(y_test[:, :, i])
_model = LogisticRegression(class_weight='balanced')
_model.fit(X_train, y_train)
accuracy = _model.score(X_test, y_test) * 100
The error occurs in this line:
X_train[:, :, i]=scalers[i].fit_transform(X_train[:, :, i])
TypeError: '(slice(None, None, None), slice(None, None, None), 0)' is an invalid key
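The error itself comes from indexing a 2-D pandas DataFrame with a 3-D slice (X_train[:, :, i]); the linked answer applies to 3-D arrays, while data read with pd.read_csv is 2-D. A minimal sketch of the usual 2-D approach, assuming the same X_train/X_test/y_train/y_test from the question:
# Sketch: scale 2-D tabular data with a single StandardScaler (assumed approach, not from the question).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on the training features only
X_test_scaled = scaler.transform(X_test)         # reuse the training statistics on the test set

_model = LogisticRegression(class_weight='balanced')
_model.fit(X_train_scaled, y_train)
accuracy = _model.score(X_test_scaled, y_test) * 100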