How can I improve numpy's broadcast - numpy

I'm trying implementing k-NN with Mahalanobis's distance in python with numpy. However, the code below works very slowly when I use broadcasting.
Please teach me how can I improve numpy speed or implement this better.
from __future__ import division
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
import numpy as np
import matplotlib.pyplot as plt
mnist = fetch_mldata('MNIST original')
mnist_X, mnist_y = shuffle(mnist.data, mnist.target.astype('int32'))
mnist_X = mnist_X/255.0
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y, test_size=0.2)
k = 2
def data_gen(n):
return train_X[train_y == n]
train_X_num = [data_gen(i) for i in range(10)]
inv_cov = [np.linalg.inv(np.cov(train_X_num[i], rowvar=0)+np.eye(784)*0.00001) for i in range(10)] # Making Inverse covariance matrices
for i in range(10):
ivec = train_X_num[i] # ivec size is (number of 'i' data, 784)
ivec = ivec - test_X[:, np.newaxis, :] # This code is too much slowly, and using huge memory
iinv_cov = inv_cov[i]
d[i] = np.add.reduce(np.dot(ivec, iinv_cov)*ivec, axis=2).sort(1)[:, :k+1] # Calculate x.T inverse(sigma) x, and extract k-minimal distance

Related

Running multiple machine learning models using scikit learn

I am trying to run machine learning on some code. However, I run out of ram or the kernel dies. I tried using dask and dropping lots of data, but the result is the same. I want to run the data on multiple models. Does anyone know a fix?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
%matplotlib inline
data_path = "/Users/natowei/Documents/Youtube Data/YouTubeDataset_withChannelElapsed.csv"
data = pd.read_csv(data_path)
data = data.iloc[500000:]
data.head()
#Predicting the total channel View Count, eliminating datasets that are not valuable in prediction
X = data.drop(['videoViewCount','index','channelId','videoId','videoPublished','dislikes/views','likes/views','comments/views','views/subscribers','views/elapsedtime'\], axis = 1)
Y = data['videoViewCount']
from dask_ml.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
train_data = X_train.join(Y_train)
from sklearn.naive_bayes import GaussianNB
bayes = GaussianNB()
import joblib
from dask.distributed import Client
client = Client(processes=False)
with joblib.parallel_backend('dask'):
bayes.fit(X_train_s, Y_train)
bayes.score(X_test_s, Y_test)
from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
with joblib.parallel_backend('dask'):
decision.fit(X_train_s, Y_train)
decision.score(X_test_s, Y_test)
I have also tried to chunk the data but it does not seem to help much. Basically I all need is a result score for different machine learning models.

How to match dimensions in CNN

I'm trying to build a CNN, where the goal is from 3 features to predict the label, but is giving an error of dimension.
Could someone help me?
updated after comments from #M.Innat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
from tensorflow.keras.models import Sequential, load_model
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
import tensorflow as tf
import random
# Create data
n = 8500
l = [2, 3, 4, 5,6]
k = int(np.ceil(n/len(l)))
labels = [item for item in l for i in range(k)]
random.shuffle(labels,random.random)
labels =np.array(labels)
label_unique = np.unique(labels)
x = np.linspace(613000, 615000, num=n) + np.random.uniform(-5, 5, size=n)
y = np.linspace(7763800, 7765800, num=n) + np.random.uniform(-5, 5, size=n)
z = np.linspace(1230, 1260, num=n) + np.random.uniform(-5, 5, size=n)
X = np.column_stack((x,y,z))
Y = labels
# Split the dataset into training and testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)
seq_len=len(X_train)
n_features=len(X_train[0])
droprate=0.1
exit_un=len(label_unique)
seq_len=len(X_train)
n_features=len(X_train[0])
droprate=0.1
exit_un=len(label_unique)
print('n_features: {} \n seq_len: {} \n exit_un: {}'.format(n_features,seq_len,exit_un))
X_train = X_train[..., None][None, ...] # add channel axis+batch aix
Y_train = pd.get_dummies(Y_train) # transform to one-hot encoded
drop_prob = 0.5
my_model = Sequential()
my_model.add(Conv2D(input_shape=(seq_len,n_features,1),filters=32,kernel_size=(3,3),padding='same',activation="relu")) # 1 channel of grayscale.
my_model.add(MaxPooling2D(pool_size=(2,1)))
my_model.add(Conv2D(filters=64,kernel_size=(5,5), padding='same',activation="relu"))
my_model.add(MaxPooling2D(pool_size=(2,1)))
my_model.add(Flatten())
my_model.add(Dense(units = 1024, activation="relu"))
my_model.add(Dropout(rate=drop_prob))
my_model.add(Dense(units = exit_un, activation="softmax"))
n_epochs = 100
batch_size = 10
learn_rate = 0.005
# Define the optimizer and then compile.
my_optimizer=Adam(lr=learn_rate)
my_model.compile(loss = "categorical_crossentropy", optimizer = my_optimizer, metrics=['categorical_crossentropy','accuracy'])
my_summary = my_model.fit(X_train, Y_train, epochs=n_epochs, batch_size = batch_size, verbose = 1)
The error I have is:
ValueError: Data cardinality is ambiguous:
x sizes: 1
y sizes: 5950
Make sure all arrays contain the same number of samples.
You're passing the input sample without the channel axis and also the batch axis. Also, according to your loss function, you should transform your integer label to one-hot encoded.
exit_un=len(label_unique)
drop_prob = 0.5
X_train = X_train[..., None][None, ...] # add channel axis+batch aix
X_train = np.repeat(X_train, repeats=100, axis=0) # batch-ing
Y_train = np.repeat(Y_train, repeats=100, axis=0) # batch-ing
Y_train = pd.get_dummies(Y_train) # transform to one-hot encoded
print(X_train.shape, Y_train.shape)
my_model = Sequential()
...
update
Based on the discussion, it seems like you need the conv1d operation in the modeling time and need to reshape your sample as mentioned in the comment. Here is the colab, it should work now.

Calculate MAE, MSE and R2 metrics for a DNNRegressor model

I have a DNNRegressor model and I want to calculate some metrics to understand how well my model is predicting. How can I calculate the mean absolute error (MAE), mean squared error (MSE) and the R squared coefficient?
So far I only have the loss so can someone help me calculate MAE, MSE and R2?
# Imports
import itertools
import pandas as pd
import tensorflow as tf
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
COLUMNS = ['Prot', 'Gra', 'Cen', 'Sal', 'TVN', 'Velocidad_Prensa']
FEATURES = ['Prot', 'Gra', 'Cen', 'Sal', 'TVN']
LABEL = ['Velocidad_Prensa']
def get_input_fn(data_set, num_epochs=None, shuffle=True):
return tf.compat.v1.estimator.inputs.pandas_input_fn(
x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
y=pd.Series(data_set[LABEL].values),
num_epochs=num_epochs,
shuffle=shuffle)
training_set = pd.read_csv("prensa train.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
test_set = pd.read_csv("prensa eval.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
training_set.head()
# Model
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
activation_fn = tf.nn.relu, hidden_units=[200, 100, 50, 25, 12])
# Reset the index of training
training_set.reset_index(drop = True, inplace =True)
def input_fn(data_set, pred = False):
if pred == False:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
labels = tf.constant(data_set[LABEL].values)
return feature_cols, labels
if pred == True:
feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
return feature_cols
# Deep Neural Network Regressor with the training set which contain the data split by train test split
regressor.train(input_fn=lambda: input_fn(training_set), steps=2000)
# Evaluation on the test set created by train_test_split
ev = regressor.evaluate(input_fn=lambda: input_fn(test_set), steps=1)
# Display the score on the testing set
loss_score1 = ev["loss"]
print("Final Loss on the testing set: {0:f}".format(loss_score1))
def input_fn(features, batch_size=256):
return tf.data.Dataset.from_tensor_slices(dict(features)).batch(batch_size)
features = ['Prot', 'Gra', 'Cen', 'Sal','TVN']
predict = {}
print("Ingresar caracterĂ­sticas quĂ­micas de la materia prima")
for feature in features:
valid = True
while valid:
val = input(feature + ": ")
if not val.isdigit(): valid = False
predict[feature] = [float(val)]
predictions = regressor.predict(input_fn=lambda: input_fn(predict))
for pred_dict in predictions:
print(pred_dict)
sklearn.metrics has dedicated scoring methods for each of the metrics you are asking for.
Just to the following:
# Import metrics
from sklearn import metrics
# Make predictions
predictions = regressor.predict(input_fn=lambda: input_fn(predict))
# Calculate MAE, MSE, R2
print('MAE:', metrics.mean_absolute_error(y_true, predictions))
print('MSE:', metrics.mean_squared_error(y_true, predictions))
print('R2:', metrics.r2_score(y_true, predictions))

how to create tf.feature_columns with data have no header(csv file)?

I am dealing with multi-class_classification_of_handwritten_digits in the following link google colab
Then I tried to put the code in my way to re write, feed and train the DNN.
Due to the csv file has no header I am not able to create my feature columns, so I cannot train my model.
Can you please help me to figure out how it has been done in the link or how it need to be for my code? Thanks in advance.
import pandas as pd
import seaborn as sns
import tensorflow as tf
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)
mnist_df.columns
hand_df = mnist_df[0]
hand_df.head()
matrix_df = mnist_df.drop([0],axis=1)
matrix_df.head()
mnist_df = mnist_df.head(10000)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
columns=matrix_df.columns,
index=matrix_df.index)
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,
batch_size=10,
num_epochs=1000,
shuffle=True)
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
model = tf.estimator.LinearClassifier(feature_columns=feat_cols,
n_classes=10,
optimizer=my_optimizer,
config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=input_func,steps=1000)
The example code is already splitting the dataset into training and validation sets.
And I don't think this has anything to do with the header in the CSV.
training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])
validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])
So the training code is here separately.
import pandas as pd
import tensorflow as tf
from tensorflow.python.data import Dataset
import numpy as np
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",sep=",",header=None)
mnist_df = mnist_df.head(10000)
dataset = mnist_df[:7500]
labels = dataset[0]
print ( labels.shape )
# DataFrame.loc index ranges are inclusive at both ends.
features = dataset.loc[:, 1:784]
print ( features.shape )
# Scale the data to [0, 1] by dividing out the max value, 255.
features = features / 255
def create_training_input_fn(feature, label, batch_size, num_epochs=None, shuffle=True):
"""A custom input_fn for sending MNIST data to the estimator for training.
Args:
features: The training features.
labels: The training labels.
batch_size: Batch size to use during training.
Returns:
A function that returns batches of training features and labels during
training.
"""
def _input_fn(num_epochs=None, shuffle=True):
# Input pipelines are reset with each call to .train(). To ensure model
# gets a good sampling of data, even when number of steps is small, we
# shuffle all the data before creating the Dataset object
idx = np.random.permutation(feature.index)
raw_features = {"pixels": feature.reindex(idx)}
raw_targets = np.array(label[idx])
ds = Dataset.from_tensor_slices((raw_features, raw_targets)) # warning: 2GB limit
ds = ds.batch(batch_size).repeat(num_epochs)
if shuffle:
ds = ds.shuffle(10000)
# Return the next batch of data.
feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
return feature_batch, label_batch
return _input_fn
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
model = tf.estimator.LinearClassifier(feature_columns=set([tf.feature_column.numeric_column('pixels', shape=784)]),
n_classes=10,
optimizer=my_optimizer,
config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=create_training_input_fn(features, labels, batch_size=10),steps=1000)
Similarly you have a function for preparing the validation set for prediction. You could use this pattern as it is.
But if you are splitting the dataframe using train_test_split you can try this.
X_train, X_test = train_test_split(mnist_df, test_size=0.2)
You have to repeat the following procedure for X_test as well to get the validation features and labels.
X_train_labels = X_train[0]
print ( X_train_labels.shape )
# DataFrame.loc index ranges are inclusive at both ends.
X_train_features = X_train.loc[:, 1:784]
print ( X_train_features.shape )
# Scale the data to [0, 1] by dividing out the max value, 255.
X_train_features = X_train_features / 255
Rather than trying to find a way to use data without any column names, I have had an idea that :) I have named all my columns and append them into cols=[] then it was easy to assign and use by feature_columns = cols.
Here is my full working code for my own question.
Thanks.
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import metrics
from tensorflow.python.data import Dataset
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)
mnist_df.describe()
mnist_df.columns
hand_df = mnist_df[0]
matrix_df = mnist_df.drop([0],axis=1)
matrix_df.head()
hand_df.head()
#creating cols array and append a1 to a784 in order to name columns
cols=[]
for i in range(785):
if i!=0:
a = '{}{}'.format('a',i)
cols.append(a)
matrix_df.columns = cols
mnist_df = mnist_df.head(10000)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
columns=matrix_df.columns,
index=matrix_df.index)
#naming columns so I will not get error while assigning feature_columns
for i in range(len(cols)):
a=i+1
b='{}{}'.format('a',a)
cols[i] = tf.feature_column.numeric_column(str(b))
matrix_df.head()
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,
batch_size=10,num_epochs=1000,
shuffle=True)
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
model = tf.estimator.DNNClassifier(feature_columns=cols,
hidden_units=[32,64],
n_classes=10,
optimizer=my_optimizer,
config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=input_func,steps=1000)
predict_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,
batch_size=50,
num_epochs=1,
shuffle=False)
pred_gen = model.predict(predict_input_func)
predictions = list(pred_gen)
predictions[0]

Regression on large dataset: Why does accuracy drop?

I am trying to predict the views on olx's ads. I write a scraper to scrape all the data(50000) ads. When I perform linear regression (on 1400 samples) I got 66% accuracy.But after that I perform on 52000 samples it dropped to 8%. Here is the Imgcount vs Views and Price vs Views stats.
Is there any problem with my data? or How can I perform regression on this. I know that this data is very polarized.
I wanted to know what's the problem why my accuracy dropped when I used large dataset.
Thank you for the help.`
CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
url = '/home/msz/olx/olx/with_images.csv'
df = pd.read_csv(url, index_col='url')
df['price'] = df['price'].str.replace('.', '')
df['price'] = df['price'].str.replace(',', '')
df['price'] = df['price'].str.replace('Rs', '')
df['price'] = df['price'].astype(int)
df['text'] = df['text'].str.replace(',', ' ')
df['text'] = df['text'].str.replace('\t', '')
df['text'] = df['text'].str.replace('\n', '')
X = df[['price', 'img']]
y = df['views']
print ("X is like ", X.shape)
print ("Y is like ", y.shape)
df.plot(y='views', x='img', style='x')
plt.title('ImgCount vs Views')
plt.xlabel('ImgCount')
plt.ylabel('Views')
plt.show()
df.plot(y='views', x='price', style='x')
plt.title('Price vs Views')
plt.xlabel('Price')
plt.ylabel('Views')
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.451, random_state=0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
score = regressor.score(X_test, y_test)
print('Accuracy is : ',score*100)
Regression is the basic algorithm which works on linear datasets mostly but if you have a large and non liner dataset you have to use another algorithm like k-nearest neighbour or may be decision tree. But I prefer to use Naives Bayes classifier and others.