Running multiple machine learning models using scikit-learn on a DataFrame

I am trying to run machine learning models on some data, but I run out of RAM or the kernel dies. I have tried using dask and dropping a large part of the data, but the result is the same. I want to run the data through multiple models. Does anyone know a fix?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
%matplotlib inline
data_path = "/Users/natowei/Documents/Youtube Data/YouTubeDataset_withChannelElapsed.csv"
data = pd.read_csv(data_path)
data = data.iloc[500000:]
data.head()
# Predict the total channel view count, dropping columns that are not valuable for prediction
X = data.drop(['videoViewCount', 'index', 'channelId', 'videoId', 'videoPublished',
               'dislikes/views', 'likes/views', 'comments/views',
               'views/subscribers', 'views/elapsedtime'], axis=1)
Y = data['videoViewCount']
from dask_ml.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
train_data = X_train.join(Y_train)
from sklearn.naive_bayes import GaussianNB
bayes = GaussianNB()
import joblib
from dask.distributed import Client
client = Client(processes=False)
with joblib.parallel_backend('dask'):
    bayes.fit(X_train, Y_train)  # X_train_s was never defined; using the split from above
    bayes.score(X_test, Y_test)
from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
with joblib.parallel_backend('dask'):
    decision.fit(X_train, Y_train)
    decision.score(X_test, Y_test)
I have also tried chunking the data, but it does not seem to help much. Basically, all I need is a score for each of several machine learning models.
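One pattern that may help, as a sketch rather than a drop-in fix, is out-of-core learning: stream the CSV in chunks and update a model incrementally with partial_fit, so the full DataFrame never has to fit in RAM. SGDRegressor is a stand-in here that supports incremental fitting and suits a continuous target like videoViewCount; the column list is copied from the question, and in practice you would scale the features first.
import pandas as pd
from sklearn.linear_model import SGDRegressor

drop_cols = ['index', 'channelId', 'videoId', 'videoPublished',
             'dislikes/views', 'likes/views', 'comments/views',
             'views/subscribers', 'views/elapsedtime']
model = SGDRegressor()
for chunk in pd.read_csv(data_path, chunksize=100000):
    X_chunk = chunk.drop(columns=['videoViewCount'] + drop_cols)
    y_chunk = chunk['videoViewCount']
    model.partial_fit(X_chunk, y_chunk)  # incremental update, bounded memory
GaussianNB exposes partial_fit as well (for classification targets), so the same loop covers several of the models you listed.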

Related

pd.scatter_matrix not working on pandas version 1.4.2

Here is my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
fruits = pd.read_table('readonly/fruit_data_with_colors.txt')
from matplotlib import cm
X = fruits[['height', 'width', 'mass', 'color_score']]
y = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
cmap = cm.get_cmap('gnuplot')
scatter = pd.scatter_matrix(X_train, c= y_train, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9), cmap=cmap)
The course I took used pandas version '0.19.2', where pd.scatter_matrix works fine. But I get the error message below when I run it in my Jupyter Notebook with pandas '1.4.2'.
AttributeError: module 'pandas' has no attribute 'scatter_matrix'
How can I make it run on my Jupyter Notebook?
I guess it has now changed to pandas.plotting.scatter_matrix
Have a look at the documentation below.
https://pandas.pydata.org/docs/reference/api/pandas.plotting.scatter_matrix.html
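For reference, the call with the relocated function might look like this, keeping the same arguments as in the question:
from pandas.plotting import scatter_matrix

scatter = scatter_matrix(X_train, c=y_train, marker='o', s=40,
                         hist_kwds={'bins': 15}, figsize=(9, 9), cmap=cmap)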

How to create tf.feature_columns when the data has no header (CSV file)?

I am working through the multi-class classification of handwritten digits example in the following Google Colab link.
I then tried to rewrite the code in my own way to feed and train the DNN.
Because the CSV file has no header, I am not able to create my feature columns, so I cannot train my model.
Can you please help me figure out how it was done in the link, or how it needs to be done for my code? Thanks in advance.
import pandas as pd
import seaborn as sns
import tensorflow as tf
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)
mnist_df.columns
hand_df = mnist_df[0]
hand_df.head()
matrix_df = mnist_df.drop([0],axis=1)
matrix_df.head()
mnist_df = mnist_df.head(10000)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
                         columns=matrix_df.columns,
                         index=matrix_df.index)
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train,
                                                 batch_size=10,
                                                 num_epochs=1000,
                                                 shuffle=True)
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
model = tf.estimator.LinearClassifier(feature_columns=feat_cols,  # feat_cols is never defined -- this is the problem
                                      n_classes=10,
                                      optimizer=my_optimizer,
                                      config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=input_func, steps=1000)
The example code is already splitting the dataset into training and validation sets.
And I don't think this has anything to do with the header in the CSV.
training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])
validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])
The training part of the code is shown here separately.
import pandas as pd
import tensorflow as tf
from tensorflow.python.data import Dataset
import numpy as np
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",sep=",",header=None)
mnist_df = mnist_df.head(10000)
dataset = mnist_df[:7500]
labels = dataset[0]
print ( labels.shape )
# DataFrame.loc index ranges are inclusive at both ends.
features = dataset.loc[:, 1:784]
print ( features.shape )
# Scale the data to [0, 1] by dividing out the max value, 255.
features = features / 255
def create_training_input_fn(features, labels, batch_size, num_epochs=None, shuffle=True):
    """A custom input_fn for sending MNIST data to the estimator for training.

    Args:
      features: The training features.
      labels: The training labels.
      batch_size: Batch size to use during training.

    Returns:
      A function that returns batches of training features and labels during
      training.
    """
    def _input_fn(num_epochs=None, shuffle=True):
        # Input pipelines are reset with each call to .train(). To ensure the
        # model gets a good sampling of data, even when the number of steps is
        # small, we shuffle all the data before creating the Dataset object.
        idx = np.random.permutation(features.index)
        raw_features = {"pixels": features.reindex(idx)}
        raw_targets = np.array(labels[idx])
        ds = Dataset.from_tensor_slices((raw_features, raw_targets))  # warning: 2GB limit
        ds = ds.batch(batch_size).repeat(num_epochs)
        if shuffle:
            ds = ds.shuffle(10000)
        # Return the next batch of data.
        feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
        return feature_batch, label_batch
    return _input_fn
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
model = tf.estimator.LinearClassifier(feature_columns=set([tf.feature_column.numeric_column('pixels', shape=784)]),
                                      n_classes=10,
                                      optimizer=my_optimizer,
                                      config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=create_training_input_fn(features, labels, batch_size=10), steps=1000)
Similarly, there is a function for preparing the validation set for prediction. You could use this pattern as it is.
But if you are splitting the dataframe using train_test_split, you can try this:
X_train, X_test = train_test_split(mnist_df, test_size=0.2)
You have to repeat the following procedure for X_test as well to get the validation features and labels.
X_train_labels = X_train[0]
print ( X_train_labels.shape )
# DataFrame.loc index ranges are inclusive at both ends.
X_train_features = X_train.loc[:, 1:784]
print ( X_train_features.shape )
# Scale the data to [0, 1] by dividing out the max value, 255.
X_train_features = X_train_features / 255
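For completeness, the mirrored steps for the held-out split might look like this (same pattern, applied to X_test):
X_test_labels = X_test[0]
# DataFrame.loc index ranges are inclusive at both ends.
X_test_features = X_test.loc[:, 1:784]
# Scale the data to [0, 1] as before.
X_test_features = X_test_features / 255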
Rather than trying to find a way to use the data without any column names, I had an idea: I named all the columns myself and appended them to cols=[], and then it was easy to assign and use them via feature_columns=cols.
Here is my full working code for my own question.
Thanks.
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import metrics
from tensorflow.python.data import Dataset
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)
mnist_df.describe()
mnist_df.columns
hand_df = mnist_df[0]
matrix_df = mnist_df.drop([0],axis=1)
matrix_df.head()
hand_df.head()
# create a cols list and append a1 to a784 in order to name the columns
cols = []
for i in range(785):
    if i != 0:
        a = '{}{}'.format('a', i)
        cols.append(a)
matrix_df.columns = cols
mnist_df = mnist_df.head(10000)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
                         columns=matrix_df.columns,
                         index=matrix_df.index)
# build numeric feature columns from the names so assigning feature_columns does not raise an error
for i in range(len(cols)):
    a = i + 1
    b = '{}{}'.format('a', a)
    cols[i] = tf.feature_column.numeric_column(str(b))
matrix_df.head()
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train,
                                                 batch_size=10, num_epochs=1000,
                                                 shuffle=True)
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
model = tf.estimator.DNNClassifier(feature_columns=cols,
                                   hidden_units=[32, 64],
                                   n_classes=10,
                                   optimizer=my_optimizer,
                                   config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=input_func, steps=1000)
predict_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                         batch_size=50,
                                                         num_epochs=1,
                                                         shuffle=False)
pred_gen = model.predict(predict_input_func)
predictions = list(pred_gen)
predictions[0]
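To turn those prediction dicts into an accuracy number (the metrics module imported above is otherwise unused), one option, assuming the standard tf.estimator output key 'class_ids', is:
pred_classes = [int(p['class_ids'][0]) for p in predictions]
print(metrics.accuracy_score(y_test, pred_classes))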

Regression on large dataset: Why does accuracy drop?

I am trying to predict the views on OLX's ads. I wrote a scraper to collect all the data (50,000 ads). When I perform linear regression on 1,400 samples I get 66% accuracy, but after I run it on 52,000 samples it drops to 8%. Here are the ImgCount vs Views and Price vs Views stats.
Is there a problem with my data, or how can I perform regression on this? I know that this data is very polarized.
I want to know why my accuracy dropped when I used the larger dataset.
Thank you for the help.
CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
url = '/home/msz/olx/olx/with_images.csv'
df = pd.read_csv(url, index_col='url')
df['price'] = df['price'].str.replace('.', '', regex=False)  # literal '.', not a regex
df['price'] = df['price'].str.replace(',', '')
df['price'] = df['price'].str.replace('Rs', '')
df['price'] = df['price'].astype(int)
df['text'] = df['text'].str.replace(',', ' ')
df['text'] = df['text'].str.replace('\t', '')
df['text'] = df['text'].str.replace('\n', '')
X = df[['price', 'img']]
y = df['views']
print ("X is like ", X.shape)
print ("Y is like ", y.shape)
df.plot(y='views', x='img', style='x')
plt.title('ImgCount vs Views')
plt.xlabel('ImgCount')
plt.ylabel('Views')
plt.show()
df.plot(y='views', x='price', style='x')
plt.title('Price vs Views')
plt.xlabel('Price')
plt.ylabel('Views')
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.451, random_state=0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
score = regressor.score(X_test, y_test)
print('Accuracy is : ',score*100)
Regression is a basic algorithm that works mostly on linear datasets, but if you have a large and non-linear dataset you have to use another algorithm, such as k-nearest neighbours or perhaps a decision tree. Personally, I prefer a Naive Bayes classifier, among others.
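As a side note, LinearRegression.score returns R², not classification accuracy, so the dropping number here is really a poor R² fit on the larger sample. A minimal sketch of trying non-linear regressors on the same split (KNeighborsRegressor and DecisionTreeRegressor are illustrative stand-ins, and the hyperparameters are arbitrary):
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Compare held-out R^2 of two non-linear regressors on the question's split.
for name, model in [('knn', KNeighborsRegressor(n_neighbors=5)),
                    ('tree', DecisionTreeRegressor(max_depth=8))]:
    model.fit(X_train, y_train)
    print(name, model.score(X_test, y_test))  # R^2 on the test set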

Possible compatibility issue with Keras, TensorFlow and scikit (tf.global_variables())

I'm trying to do a small test with my dataset on Keras Regressor (using TensorFlow), but I'm having a small issue. The error seems to be in scikit-learn's cross_val_score function. It starts there, and the last error message is:
File "/usr/local/lib/python2.7/dist-packages/Keras-2.0.2-py2.7.egg/keras/backend/tensorflow_backend.py", line 298, in _initialize_variables
variables = tf.global_variables()
AttributeError: 'module' object has no attribute 'global_variables'
My full code is basically the example found in http://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/ with small changes.
I've looked up the "'module' object has no attribute 'global_variables'" error and it seems to be about the TensorFlow version, but I'm using the most recent one (1.0) and there is no function in the code that works directly with tf that I could change. Below is my full code; is there any way I can change it so it works? Thanks for the help.
import numpy
import pandas
import sys
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_svmlight_file
# define base mode
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(68, activation="relu", kernel_initializer="normal", input_dim=68))
    model.add(Dense(1, kernel_initializer="normal"))
    # compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
X, y, query_id = load_svmlight_file(str(sys.argv[1]), query_id=True)
scaler = StandardScaler()
X = scaler.fit_transform(X.toarray())
# fix random seed for reproducibility
seed = 1
numpy.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(estimator, X, y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))
You are probably using an older TensorFlow version. Install tensorflow 1.2.0rc2 and you should be fine.
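A quick way to confirm which version is actually being imported (the traceback paths suggest a system-wide Python 2.7 install that may shadow a newer one):
import tensorflow as tf
print(tf.__version__)  # tf.global_variables() appeared around TF 0.12

# If this prints something older than expected, upgrade, e.g.:
#   pip install --upgrade tensorflow==1.2.0rc2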

How can I improve numpy's broadcasting speed?

I'm trying to implement k-NN with the Mahalanobis distance in Python with numpy. However, the code below runs very slowly when I use broadcasting.
Please show me how I can improve the numpy code's speed or implement this better.
from __future__ import division
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
import numpy as np
import matplotlib.pyplot as plt
mnist = fetch_mldata('MNIST original')
mnist_X, mnist_y = shuffle(mnist.data, mnist.target.astype('int32'))
mnist_X = mnist_X/255.0
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y, test_size=0.2)
k = 2
def data_gen(n):
    return train_X[train_y == n]
train_X_num = [data_gen(i) for i in range(10)]
# Build inverse covariance matrices, regularized so they are invertible
inv_cov = [np.linalg.inv(np.cov(train_X_num[i], rowvar=0) + np.eye(784) * 0.00001)
           for i in range(10)]
d = {}  # note: d was never initialized in the original code
for i in range(10):
    ivec = train_X_num[i]  # ivec size is (number of 'i' data, 784)
    ivec = ivec - test_X[:, np.newaxis, :]  # this broadcast is very slow and uses huge memory
    iinv_cov = inv_cov[i]
    # Calculate x.T inverse(sigma) x and extract the k+1 smallest distances.
    # ndarray.sort() sorts in place and returns None, so np.sort is used instead.
    d[i] = np.sort(np.add.reduce(np.dot(ivec, iinv_cov) * ivec, axis=2), axis=1)[:, :k+1]
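One standard way to avoid materializing the (n_test, n_i, 784) difference array is to expand the quadratic form: since the inverse covariance A is symmetric, (x - y)^T A (x - y) = x^T A x - 2 x^T A y + y^T A y, which needs only matrix products. A sketch (mahalanobis_sq is a hypothetical helper, not part of the question's code):
import numpy as np

def mahalanobis_sq(X, Y, A):
    # Squared Mahalanobis distances between rows of X (m, d) and rows of
    # Y (n, d) under a symmetric inverse covariance A (d, d), computed as
    # x.T A x - 2 x.T A y + y.T A y with no (m, n, d) intermediate array.
    XA = np.dot(X, A)                    # (m, d)
    YA = np.dot(Y, A)                    # (n, d)
    x_sq = np.einsum('ij,ij->i', XA, X)  # (m,)
    y_sq = np.einsum('ij,ij->i', YA, Y)  # (n,)
    return x_sq[:, None] - 2.0 * np.dot(XA, Y.T) + y_sq[None, :]
With this, the loop body becomes
d[i] = np.sort(mahalanobis_sq(test_X, train_X_num[i], inv_cov[i]), axis=1)[:, :k+1]
and never allocates more than the (n_test, n_i) result.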