pd.scatter_matrix not working on pandas version 1.4.2 - pandas

Here is my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
fruits = pd.read_table('readonly/fruit_data_with_colors.txt')
from matplotlib import cm
X = fruits[['height', 'width', 'mass', 'color_score']]
y = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
cmap = cm.get_cmap('gnuplot')
scatter = pd.scatter_matrix(X_train, c= y_train, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9), cmap=cmap)
The environment I learned with had pandas version '0.19.2', and pd.scatter_matrix works fine there. But I get the error message below when I run the code in my Jupyter Notebook with pandas '1.4.2':
AttributeError: module 'pandas' has no attribute 'scatter_matrix'
How can I make it run on my Jupyter Notebook?

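The function has moved to pandas.plotting.scatter_matrix, so call pd.plotting.scatter_matrix(X_train, ...) with the same arguments instead of pd.scatter_matrix(...).
Have a look at the documentation below.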
https://pandas.pydata.org/docs/reference/api/pandas.plotting.scatter_matrix.html

Related

Running multiple machine learning models using scikit learn

I am trying to run some machine learning code. However, I run out of RAM or the kernel dies. I tried using Dask and dropping lots of data, but the result is the same. I want to run the data through multiple models. Does anyone know a fix?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
%matplotlib inline
data_path = "/Users/natowei/Documents/Youtube Data/YouTubeDataset_withChannelElapsed.csv"
data = pd.read_csv(data_path)
data = data.iloc[500000:]
data.head()
# Predicting the video view count: drop the target itself and the columns
# that are not valuable for the prediction.
X = data.drop(
    [
        'videoViewCount',
        'index',
        'channelId',
        'videoId',
        'videoPublished',
        'dislikes/views',
        'likes/views',
        'comments/views',
        'views/subscribers',
        'views/elapsedtime',
    ],
    axis=1,
)
# Target column.
Y = data['videoViewCount']
from dask_ml.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
train_data = X_train.join(Y_train)
from sklearn.naive_bayes import GaussianNB
bayes = GaussianNB()
import joblib
from dask.distributed import Client
client = Client(processes=False)
# Run the fit under joblib's 'dask' parallel backend.
# NOTE(review): X_train_s / X_test_s are not defined in the visible snippet —
# presumably scaled copies of X_train / X_test; confirm.
with joblib.parallel_backend('dask'):
    bayes.fit(X_train_s, Y_train)
bayes.score(X_test_s, Y_test)
from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
# Run the fit under joblib's 'dask' parallel backend.
# NOTE(review): X_train_s / X_test_s are not defined in the visible snippet —
# presumably scaled copies of X_train / X_test; confirm.
with joblib.parallel_backend('dask'):
    decision.fit(X_train_s, Y_train)
decision.score(X_test_s, Y_test)
I have also tried to chunk the data, but it does not seem to help much. Basically, all I need is a result score for different machine learning models.

ValueError: operands could not be broadcast together with shapes (38563,54) (38563,) while using curve_fit()

Note: This question is not a multiplication one and please ignore some of the import statements.
Now the details are as follows, I am using a curve_fit() to fit a periodic pandas dataset.
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import leastsq
#import matplotlib.pyplot as plt
import pylab as plt
from scipy.optimize import curve_fit
df = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")
# Encode the holiday column as a flag: '0' for the 'None' entries, 1 for
# everything else.
# The result of replace() is assigned back rather than using inplace=True on
# the column selection, which can operate on a temporary object and leave
# df itself unchanged.
df['holiday'] = df['holiday'].replace(to_replace='None', value='0')
df.loc[df['holiday'] != '0', 'holiday'] = 1
print(df.shape)
df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%Y %H:%M')
df['date_time'] = (df['date_time']- dt.datetime(1970,1,1)).dt.total_seconds()
#print(df['date_time'].head())
non_dummy_cols = ['holiday','temp','rain_1h', 'snow_1h', 'clouds_all','date_time', 'traffic_volume']
dummy_cols = list(set(df.columns) - set(non_dummy_cols))
df = pd.get_dummies(df, columns=dummy_cols)
print(df.shape)
x = df[df.columns.values]
x = x.drop(['traffic_volume'], axis=1)
x = x.drop(['clouds_all'], axis = 1)
y = df['traffic_volume']
print(x.shape)
print(y.shape)
#plt.figure(figsize=(6,4))
#plt.scatter(df.date_time[0:100], df.traffic_volume[0:100], color = 'blue')
#plt.xlabel("Date Time")
#plt.ylabel("Traffic volume")
#plt.show()
x = StandardScaler().fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state= 4)
def my_sin(x, freq, amplitude, phase, offset):
    """Evaluate a sinusoid at ``x``.

    Computes ``sin(x * freq + phase) * amplitude + offset`` using
    ``np.sin``, so ``x`` may be a scalar or a NumPy array; the arithmetic
    is elementwise.

    Args:
        x: Point(s) at which to evaluate the sinusoid.
        freq: Multiplier applied to ``x`` inside the sine.
        amplitude: Multiplier applied to the sine value.
        phase: Value added to ``x * freq`` inside the sine.
        offset: Value added to the scaled sine.

    Returns:
        The evaluated sinusoid.
    """
    return np.sin(x * freq + phase) * amplitude + offset
#x_train = np.array(x_train)
#y_train = np.array(y_train)
print(x_train)
popt, pcov = curve_fit(my_sin, x_train, y_train)
y_hat = my_sin(x_test, *popt)
Error:
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
Download dataset URL
The dataset before any programmatic changes is:
So how do I overcome this error? Is it not possible to use curve_fit with an m*n x_train?
I have also tried reshaping y_train to m*1, or [2,2,....[]] like this, but that does not work either. Please help me solve this issue.
The entire error message tells the story just above the last line:
Traceback (most recent call last):
File "temp.py", line 50, in <module>
popt, pcov = curve_fit(my_sin, x_train, y_train)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 736, in curve_fit
res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 377, in leastsq
shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 26, in _check_func
res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 454, in func_wrapped
return func(xdata, *params) - ydata
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
curve_fit() is handing your function "my_sin()" data which has a shape of (38563, 54) - this is the value of x_train.shape - and the function is returning data with the same shape. The curve_fit code needs the function being fitted to instead return data with the same shape as y_train, so it can subtract the two and calculate the error. Since the function does not return data with the same shape as y_train, the subtraction raises an exception.
I suspect you should be using the linear regression in sklearn, and not the curve_fit routine.

Why I can't draw a chart? TypeError: unhashable type: 'numpy.ndarray'

I want to see the results of the regression on a graph, but I get a blank chart.
I also tried using just the values instead of the DataFrame, but the result was the same. The dataset has 537577 rows.
TypeError: unhashable type: 'numpy.ndarray'
# 1. libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# 2. data preprocessing
veriler = pd.read_csv("BlackFriday.csv")
print(veriler)
# missing data: mean-impute the columns selected by iloc[:, 9:11]
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
pro2 = veriler.iloc[:,9:11].values
pro2 = imputer.fit_transform(pro2)
# NOTE(review): the imputed array pro2 is not written back into veriler in
# this snippet, so this prints the frame as it was loaded.
print(veriler)
# test/train split
# NOTE(review): s and y are not defined in the visible snippet — confirm
# where they come from.
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(s,y,test_size=0.33,
random_state=0)
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
# The model is fitted on all of (s, y), not on the x_train/y_train split.
lin_reg.fit(s.values,y.values)
# Scatter the data, then draw the model's predictions over s.
plt.scatter(s.values,y.values)
plt.plot(s,lin_reg.predict(s.values))
try this:
plt.scatter([s.values],[y.values])
It should work with lists.

not able to convert string to float in python and how to train the model with this dataset

I have a dataset with columns: age (float type), gender (str type), regions (str type) and charges(float type).
I want to predict charges using age gender and region as features, how can I do that in scikit learn?
I have tried something but it shows "ValueError: could not convert string to float: 'northwest' "
import pandas as pd
import numpy as np
df = pd.read_csv('Desktop/insurance.csv')
X = df.loc[:,['age','sex','region']].values
y = df.loc[:,['charges']].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn import svm
clf = svm.SVC(C=1.0, cache_size=200,decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf')
clf.fit(X_train, y_train)
The column region contains strings, which cannot be used as-is in the SVM classifier because they are not numeric values.
Therefore you have to turn this column into something that is usable by the SVM. Here is an example that changes region into a categorical series:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
# Small example frame with the same columns as the insurance dataset.
df = pd.DataFrame(
    {
        'age': [20, 30, 40, 50],
        'sex': ['male', 'female', 'female', 'male'],
        'region': ['northwest', 'southwest', 'northeast', 'southeast'],
        'charges': [1000, 1000, 2000, 2000],
    }
)
# Replace the sex strings with a boolean "is female" flag.
df['sex'] = df['sex'].eq('female')
# Replace the region strings with their integer category codes.
df['region'] = pd.Categorical(df['region']).codes
# Feature frame and single-column target frame.
X = df[['age', 'sex', 'region']]
y = df[['charges']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = svm.SVC(C=1.0, cache_size=200,decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf')
clf.fit(X_train, y_train)
Another way to approach this problem is to use one-hot vector encoding:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
# Small example frame with the same columns as the insurance dataset.
df = pd.DataFrame(
    {
        'age': [20, 30, 40, 50],
        'sex': ['male', 'female', 'female', 'male'],
        'region': ['northwest', 'southwest', 'northeast', 'southeast'],
        'charges': [1000, 1000, 2000, 2000],
    }
)
# Replace the sex strings with a boolean "is female" flag.
df['sex'] = df['sex'] == 'female'
# One-hot encode region: append one indicator column per region value, then
# drop the original string column. The columns to drop are passed by keyword
# because drop() no longer accepts the axis as a positional argument in
# pandas 2.0+.
df = pd.concat([df, pd.get_dummies(df['region'])], axis=1).drop(columns='region')
# Features are everything except the target column.
X = df.drop(columns='charges')
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = svm.SVC(C=1.0, cache_size=200,decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf')
clf.fit(X_train, y_train)
Yet another approach is to perform label encoding:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.region = le.fit_transform(df.region)
This list of methods is of course non-exhaustive, and they perform differently according to your problem.
Handling non-numeric data is a non-trivial problem and requires some knowledge of the existing techniques (I encourage you to search Kaggle's forums, where you can find valuable information).

Regression on large dataset: Why does accuracy drop?

I am trying to predict the views on OLX ads. I wrote a scraper to collect the data for all the ads (about 50,000). When I performed linear regression on 1,400 samples I got 66% accuracy, but when I ran it on 52,000 samples the accuracy dropped to 8%. Here are the Imgcount vs Views and Price vs Views stats.
Is there any problem with my data? Or how can I perform regression on it? I know that this data is very polarized.
I want to know why my accuracy dropped when I used the larger dataset.
Thank you for the help.
CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
url = '/home/msz/olx/olx/with_images.csv'
df = pd.read_csv(url, index_col='url')
# Strip the '.' and ',' separators and the 'Rs' prefix from the price strings,
# then convert them to integers.
# regex=False pins literal matching: the default of `regex` has changed
# across pandas versions, and '.' interpreted as a regular expression would
# match every character.
df['price'] = df['price'].str.replace('.', '', regex=False)
df['price'] = df['price'].str.replace(',', '', regex=False)
df['price'] = df['price'].str.replace('Rs', '', regex=False)
df['price'] = df['price'].astype(int)
# Replace commas in the ad text with spaces and remove tabs and newlines.
df['text'] = df['text'].str.replace(',', ' ', regex=False)
df['text'] = df['text'].str.replace('\t', '', regex=False)
df['text'] = df['text'].str.replace('\n', '', regex=False)
X = df[['price', 'img']]
y = df['views']
print ("X is like ", X.shape)
print ("Y is like ", y.shape)
df.plot(y='views', x='img', style='x')
plt.title('ImgCount vs Views')
plt.xlabel('ImgCount')
plt.ylabel('Views')
plt.show()
df.plot(y='views', x='price', style='x')
plt.title('Price vs Views')
plt.xlabel('Price')
plt.ylabel('Views')
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.451, random_state=0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
score = regressor.score(X_test, y_test)
print('Accuracy is : ',score*100)
Linear regression is a basic algorithm that mostly works on linear datasets, but if you have a large, non-linear dataset you have to use another algorithm such as k-nearest neighbours or maybe a decision tree. I prefer to use a Naive Bayes classifier and others.