Error on code: TypeError: '<' not supported between instances of 'NoneType' and 'int' - data-science

This is the code I use, and every time I run it, it raises the error "TypeError: '<' not supported between instances of 'NoneType' and 'int'":
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Flatten each sample's (points, pos) landmark grid into a single feature row.
pic, points, pos = numpy_landmarks.shape
x = numpy_landmarks.reshape((pic, points*pos))
# Standardize using the mean/std taken over the whole matrix (no axis given),
# i.e. one scalar each rather than one value per feature.
# NOTE(review): `numpy` is referenced here but no `import numpy` is visible in this snippet.
x = (x - numpy.mean(x)) / numpy.std(x)
# NOTE(review): this scaler is created but never used in the visible code.
scaler = StandardScaler()
y = numpy_labels
# NOTE(review): test_size=9 is an integer, which scikit-learn documents as an
# absolute number of test samples rather than a fraction — confirm that is intended.
train_x, test_x, train_y, test_y = train_test_split (x, y, test_size= 9, random_state =
42, stratify=y)

Related

Couldn't convert string to float

I have been trying out this code:
# Load the used-car listings and run some exploratory inspection.
# NOTE(review): pd, sns, plt and np are used below but their imports are not visible in this snippet.
data = pd.read_csv("UsedCarsSA_Unclean_EN.csv")
data.head()
print(data)
data.isnull().sum()
data.info()
data.describe()
data.describe(include="all")
data.Carname.unique()
# Plot the price distribution and a heatmap of the pairwise correlations.
sns.set_style("darkgrid")
plt.figure(figsize=(15, 10))
sns.distplot(data.Price)
# plt.show()
print(data.corr())
plt.figure(figsize=(20, 15))
correlations = data.corr()
sns.heatmap(correlations, cmap="coolwarm", annot=True)
# plt.show()
# Target column to predict; every other selected column becomes a feature.
predict = "Price"
data = data[["Make", "Carname", "Year",
"Origin", "Color", "Options",
"Engine_Size", "Fuel_Type", "Gear_Type",
"Condition", "Mileage", "Region",
"Price", "0" ]]
# NOTE(review): the feature matrix still holds raw string values (the reported
# error names 'Hyundai'), which the tree model cannot convert to float — these
# columns presumably need encoding before fitting.
x = np.array(data.drop([predict], 1))
y = np.array(data[predict])
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
from sklearn.tree import DecisionTreeRegressor
print(type(xtrain))
print(type(ytrain))
model = DecisionTreeRegressor()
model.fit(xtrain, ytrain)
predictions = model.predict(xtest)
from sklearn.metrics import mean_absolute_error
# NOTE(review): this scores the model against its own predictions rather than
# ytest, and mean_absolute_error is imported but not used in the visible code —
# confirm intent.
model.score(xtest, predictions)
The error is raised at
model.fit(xtrain, ytrain)
and says "could not convert string to float: 'Hyundai'".
The goal is basically to train a model on the data to predict the best price.
I tried the float function, and it doesn't work.
I tried pd.to_numeric, and that doesn't work either.

pd.scatter_matrix not working on pandas version 1.4.2

Here is my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the fruit measurements table.
fruits = pd.read_table('readonly/fruit_data_with_colors.txt')
from matplotlib import cm
# Four measurement columns as features, the fruit label as the target.
X = fruits[['height', 'width', 'mass', 'color_score']]
y = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
cmap = cm.get_cmap('gnuplot')
# Scatter matrix of the training features, coloured by label.
# NOTE(review): pd.scatter_matrix is reported to raise AttributeError on pandas
# 1.4.2; the function is available as pandas.plotting.scatter_matrix.
scatter = pd.scatter_matrix(X_train, c= y_train, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9), cmap=cmap)
The course I learned this from used pandas version '0.19.2', where pd.scatter_matrix works fine. But I get the error message below when I run it in my Jupyter Notebook with pandas '1.4.2':
AttributeError: module 'pandas' has no attribute 'scatter_matrix'
How can I make it run on my Jupyter Notebook?
The function has moved to pandas.plotting.scatter_matrix, so call pd.plotting.scatter_matrix(...) with the same arguments.
Have a look at the documentation below.
https://pandas.pydata.org/docs/reference/api/pandas.plotting.scatter_matrix.html

Getting Errors with StandardScaler Python

I am trying to scale my training and test data for a Logistic Regression, but an error popped up.
I implemented the answer from this Stack Overflow question: How to standard scale a 3D matrix?
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load the data and separate the 'Remarks_P' target from all other columns.
df = pd.read_csv('GonzagaTakers.csv')
x_df=df.loc[:, df.columns !='Remarks_P']
features = x_df.keys()
target = 'Remarks_P'
X = df[features]
y = df[target]
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
# One StandardScaler per index i, kept in a dict.
scalers ={}
for i in range(X_train.shape[1]):
scalers[i] = StandardScaler()
# NOTE(review): X_train comes from df[features] (a DataFrame), and this
# three-axis slice is the line reported to raise "invalid key" — the 3-D
# indexing pattern does not appear to fit this data.
X_train[:, :, i]=scalers[i].fit_transform(X_train[:, :, i])
# NOTE(review): this loop scales y_test (the target) and re-fits the scalers on
# it; presumably X_test with transform() was intended — confirm.
for i in range(y_test.shape[1]):
y_test[:,:,i]=scalers[i].fit_transform(y_test[:,:,i])
_model = LogisticRegression(class_weight='balanced')
_model.fit(X_train, y_train)
# Score on the held-out split, expressed as a percentage.
accuracy = _model.score(X_test, y_test) * 100
Error occurs in this line
X_train[:, :, i]=scalers[i].fit_transform(X_train[:, :, i])
TypeError: '(slice(None, None, None), slice(None, None, None), 0)' is
an invalid key

ValueError: operands could not be broadcast together with shapes (38563,54) (38563,) while using curve_fit()

Note: This question is not a multiplication one and please ignore some of the import statements.
Now the details are as follows, I am using a curve_fit() to fit a periodic pandas dataset.
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import leastsq
#import matplotlib.pyplot as plt
import pylab as plt
from scipy.optimize import curve_fit
# Load the traffic dataset.
df = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")
# Encode holiday: the string 'None' becomes '0', every other value becomes 1.
df['holiday'].replace(to_replace = 'None', value = '0', inplace=True)
df.loc[df['holiday'] != '0', 'holiday'] = 1
print(df.shape)
# Parse the timestamps, then convert them to seconds since 1970-01-01.
df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%Y %H:%M')
df['date_time'] = (df['date_time']- dt.datetime(1970,1,1)).dt.total_seconds()
#print(df['date_time'].head())
# One-hot encode every column that is not listed in non_dummy_cols.
non_dummy_cols = ['holiday','temp','rain_1h', 'snow_1h', 'clouds_all','date_time', 'traffic_volume']
dummy_cols = list(set(df.columns) - set(non_dummy_cols))
df = pd.get_dummies(df, columns=dummy_cols)
print(df.shape)
# Features: all columns except traffic_volume and clouds_all; target: traffic_volume.
x = df[df.columns.values]
x = x.drop(['traffic_volume'], axis=1)
x = x.drop(['clouds_all'], axis = 1)
y = df['traffic_volume']
print(x.shape)
print(y.shape)
#plt.figure(figsize=(6,4))
#plt.scatter(df.date_time[0:100], df.traffic_volume[0:100], color = 'blue')
#plt.xlabel("Date Time")
#plt.ylabel("Traffic volume")
#plt.show()
# Standardize the features, then hold out 20% of the rows for testing.
x = StandardScaler().fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state= 4)
def my_sin(x, freq, amplitude, phase, offset):
    """Evaluate the sinusoid ``amplitude * sin(freq * x + phase) + offset``.

    Args:
        x: Input value(s); a scalar or a NumPy array.
        freq: Multiplier applied to ``x`` inside the sine.
        amplitude: Scale applied to the sine value.
        phase: Shift added inside the sine.
        offset: Constant added to the scaled sine.

    Returns:
        The sinusoid evaluated element-wise. With scalar parameters the
        result has the same shape as ``x``.
    """
    return np.sin(x * freq + phase) * amplitude + offset
#x_train = np.array(x_train)
#y_train = np.array(y_train)
print(x_train)
# NOTE(review): x_train is 2-D here ((38563, 54) per the reported error) while
# y_train is 1-D, so my_sin's output cannot be subtracted from y_train inside
# curve_fit.
popt, pcov = curve_fit(my_sin, x_train, y_train)
# Evaluate the fitted sinusoid on the held-out features.
y_hat = my_sin(x_test, *popt)
Error:
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
Download dataset URL
The dataset before any programmatic changes is:
So how do I overcome this error? Is it not possible to use curve_fit with an m×n x_train?
I have also tried reshaping y_train to m×1, or to a nested form like [2,2,....[]], but that does not work either. So please help me solve this issue.
The entire error message tells the story just above the last line:
Traceback (most recent call last):
File "temp.py", line 50, in <module>
popt, pcov = curve_fit(my_sin, x_train, y_train)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 736, in curve_fit
res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 377, in leastsq
shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 26, in _check_func
res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 454, in func_wrapped
return func(xdata, *params) - ydata
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
curve_fit() is handing your function "my_sin()" data which has a shape of (38563, 54) - this is the value of x_train.shape - and the function is returning data with that same shape. The curve_fit code needs the function being fitted to instead return data with the same shape as y_train, so that it can subtract the two and calculate the error. Since the function does not return data with the same shape as y_train, the subtraction raises an exception.
I suspect you should be using the linear regression in sklearn, and not the curve_fit routine.

not able to convert string to float in python and how to train the model with this dataset

I have a dataset with columns: age (float type), gender (str type), regions (str type) and charges(float type).
I want to predict charges using age gender and region as features, how can I do that in scikit learn?
I have tried something but it shows "ValueError: could not convert string to float: 'northwest' "
import pandas as pd
import numpy as np
# Load the insurance data.
df = pd.read_csv('Desktop/insurance.csv')
# NOTE(review): 'sex' and 'region' are described as string columns, so X holds
# raw strings — this matches the reported "could not convert string to float".
X = df.loc[:,['age','sex','region']].values
y = df.loc[:,['charges']].values
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn import svm
# NOTE(review): SVC is a classifier while charges is described as a float
# column; a regressor may be what is wanted — confirm.
clf = svm.SVC(C=1.0, cache_size=200,decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf')
clf.fit(X_train, y_train)
The region column contains strings, which cannot be used as-is by the SVM classifier because they are not numeric values.
Therefore, you have to turn this column into something that is usable by the SVM. Here is an example that converts region into a categorical series:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
# Toy stand-in for the insurance data: one numeric feature, two string
# features, and the target.
df = pd.DataFrame({
    'age': [20, 30, 40, 50],
    'sex': ['male', 'female', 'female', 'male'],
    'region': ['northwest', 'southwest', 'northeast', 'southeast'],
    'charges': [1000, 1000, 2000, 2000],
})
# Binary feature: True for 'female', False otherwise.
df['sex'] = df['sex'] == 'female'
# Replace each region name with its integer category code (categories are
# sorted, so the codes follow alphabetical order of the region names).
df['region'] = pd.Categorical(df['region']).codes
X = df[['age', 'sex', 'region']]
# Select the target as a 1-D Series: scikit-learn estimators expect a 1-D y and
# emit a DataConversionWarning when handed a single-column DataFrame.
y = df['charges']
# Hold out 33% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# RBF-kernel support vector classifier, fitted on the training split.
clf = svm.SVC(C=1.0, cache_size=200,decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf')
clf.fit(X_train, y_train)
Another way to approach this problem is to use one-hot vector encoding:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
# Toy stand-in for the insurance data: one numeric feature, two string
# features, and the target.
df = pd.DataFrame({
    'age': [20, 30, 40, 50],
    'sex': ['male', 'female', 'female', 'male'],
    'region': ['northwest', 'southwest', 'northeast', 'southeast'],
    'charges': [1000, 1000, 2000, 2000],
})
# Binary feature: True for 'female', False otherwise.
df['sex'] = df['sex'] == 'female'
# One indicator column per distinct region, replacing the original string
# column. `columns=` is used instead of a positional axis argument, which
# newer pandas releases no longer accept in DataFrame.drop.
region_dummies = pd.get_dummies(df['region'])
df = pd.concat([df, region_dummies], axis=1).drop(columns='region')
X = df.drop(columns='charges')
y = df['charges']
# Hold out 33% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# RBF-kernel support vector classifier, fitted on the training split.
clf = svm.SVC(C=1.0, cache_size=200,decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf')
clf.fit(X_train, y_train)
Yet another approach is to perform label encoding:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the region column and overwrite the column with the
# encoded labels it returns.
le = LabelEncoder()
df.region = le.fit_transform(df.region)
This list of methods is of course non-exhaustive, and they perform differently according to your problem.
Handling non-numeric data is non-trivial and requires some knowledge of the existing techniques (I encourage you to search Kaggle's forums, where you can find valuable information).