Convert KMeans Labels into Source data in Sklearn - numpy

import numpy as np
from sklearn.cluster import DBSCAN, MiniBatchKMeans
data = np.random.rand(5,5)
print(data)
km = MiniBatchKMeans(n_clusters=3, n_init=10, max_iter=5)
km.fit(data)
labels = km.labels_
print(labels)
[1 2 0 2 2]
How can I reconstruct my data using the labels? I mean replacing each pixel of my data with its cluster label.

If you want to do 1-D clustering, reshape your data so that each value becomes a one-feature sample, cluster the points, and then reshape the labels back:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
data = np.random.rand(5, 5)
data_to_cluster = np.reshape(data, (data.size, 1))
km = MiniBatchKMeans(n_clusters=3, n_init=10, max_iter=5)
km.fit(data_to_cluster)
labels = km.labels_
labels = np.reshape(labels, (5, 5))
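If by "reconstruct" you mean filling each pixel with a value from the clustering rather than the raw label, a minimal sketch (reusing the fitted km and the reshaped labels from above) maps every label back to its cluster centre:
reconstructed = km.cluster_centers_[labels.ravel()].reshape(5, 5)  # 5x5 array of cluster-centre values
labels itself, reshaped to (5, 5) as above, already gives you the image where every pixel holds its cluster label.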

Related

How to access pytorch embeddings lookup table as a tensor

I want to show my embeddings with the tensorboard projector. I would like to access the embeddings matrix (lookup table) of one of my layers so I can write it to the logs.
I instantiate my layer as this:
self.embeddings_user = torch.nn.Embedding(30,300)
And I'm looking for the tensor with shape (30, 300), i.e. 30 users each with a 300-dimensional embedding, to replace the vectors variable in this sample code:
import numpy as np
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile
from torch.utils.tensorboard import SummaryWriter
vectors = np.array([[0,0,1], [0,1,0], [1,0,0], [1,1,1]])
metadata = ['001', '010', '100', '111'] # labels
writer = SummaryWriter()
writer.add_embedding(vectors, metadata)
writer.close()
Embedding layers have a weight attribute corresponding to the lookup table. You can access it as follows.
vectors = self.embeddings_user.weight
So now you can visualize it with tensorboard.
import numpy as np
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile
from torch.utils.tensorboard import SummaryWriter
vectors = self.embeddings_user.weight
metadata = ['001', '010', '100', '111', ...] # labels
writer = SummaryWriter()
writer.add_embedding(vectors, metadata)
writer.close()
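If add_embedding complains that the weight tensor still requires grad, logging a detached copy should work. A minimal, self-contained sketch (the 30x300 layer size comes from the question; the user labels are made up):
import torch
from torch.utils.tensorboard import SummaryWriter
embeddings_user = torch.nn.Embedding(30, 300)      # stand-in for self.embeddings_user
vectors = embeddings_user.weight.detach().cpu()    # the (30, 300) lookup table as a plain tensor
metadata = ['user_%d' % i for i in range(30)]      # one label per embedding row (hypothetical labels)
writer = SummaryWriter()
writer.add_embedding(vectors, metadata)
writer.close()
Depending on your tensorboard version you may still need the tf.io.gfile workaround shown in the question.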

Getting TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0] while doing multi class classification

from sklearn.naive_bayes import CategoricalNB
from sklearn.datasets import make_multilabel_classification
X, y = make_multilabel_classification(sparse=True, n_labels=15,
                                      return_indicator='sparse', allow_unlabeled=False)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
I tried using X.todense() but the error is still raised.
X_train = X_train.todense()
X_test = X_test.todense()
Training on the dataset
from skmultilearn.adapt import MLkNN
from sklearn.metrics import accuracy_score
classifier = MLkNN(k=20)
classifier.fit(X_train, y_train)
Predicting the output on the test data:
y_pred = classifier.predict(X_test)
accuracy_score(y_test,y_pred)
np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)
You are trying to get the length of a sparse matrix, which is ambiguous:
len(y_pred)
Your matrix y_pred has the dimension (25,5), as seen with y_pred.shape.
So instead of len(y_pred), you could use y_pred.shape[0], which would return 25.
But then you will encounter a problem when you use y_pred.reshape(y_pred.shape[0], 1):
ValueError: cannot reshape array of size 125 into shape (25, 1)
(previously: y_pred.reshape(len(y_pred),1))
This error makes sense, because you are trying to reshape a matrix with 125 values into a shape that holds only 25 values. You need to rethink your code here.
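For example, since y_pred and y_test are both (25, 5) label-indicator matrices here, one way to put predictions next to the ground truth is to densify them and stack along the columns (a sketch reusing the variables from the code above):
import numpy as np
y_pred_dense = np.asarray(y_pred.todense())   # (25, 5)
y_test_dense = np.asarray(y_test.todense())   # (25, 5)
comparison = np.concatenate((y_pred_dense, y_test_dense), axis=1)  # (25, 10)
Note that accuracy_score(y_test, y_pred) accepts sparse label-indicator matrices directly, so this reshaping is only needed for the side-by-side view.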

ValueError: operands could not be broadcast together with shapes (38563,54) (38563,) while using curve_fit()

Note: this is not a question about multiplication; please ignore some of the unused import statements.
The details are as follows: I am using curve_fit() to fit a periodic pandas dataset.
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import leastsq
#import matplotlib.pyplot as plt
import pylab as plt
from scipy.optimize import curve_fit
df = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")
df['holiday'].replace(to_replace = 'None', value = '0', inplace=True)
df.loc[df['holiday'] != '0', 'holiday'] = 1
print(df.shape)
df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%Y %H:%M')
df['date_time'] = (df['date_time']- dt.datetime(1970,1,1)).dt.total_seconds()
#print(df['date_time'].head())
non_dummy_cols = ['holiday','temp','rain_1h', 'snow_1h', 'clouds_all','date_time', 'traffic_volume']
dummy_cols = list(set(df.columns) - set(non_dummy_cols))
df = pd.get_dummies(df, columns=dummy_cols)
print(df.shape)
x = df[df.columns.values]
x = x.drop(['traffic_volume'], axis=1)
x = x.drop(['clouds_all'], axis = 1)
y = df['traffic_volume']
print(x.shape)
print(y.shape)
#plt.figure(figsize=(6,4))
#plt.scatter(df.date_time[0:100], df.traffic_volume[0:100], color = 'blue')
#plt.xlabel("Date Time")
#plt.ylabel("Traffic volume")
#plt.show()
x = StandardScaler().fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state= 4)
def my_sin(x, freq, amplitude, phase, offset):
    return np.sin(x * freq + phase) * amplitude + offset
#x_train = np.array(x_train)
#y_train = np.array(y_train)
print(x_train)
popt, pcov = curve_fit(my_sin, x_train, y_train)
y_hat = my_sin(x_test, *popt)
Error:
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
Download dataset URL
(A preview of the dataset before any programmatic changes was shown here.)
So how do I overcome this error? Is it not possible to use curve_fit with an m×n x_train?
I have also tried reshaping y_train to m×1, but that does not work either. So please help me solve this issue.
The entire error message tells the story just above the last line:
Traceback (most recent call last):
  File "temp.py", line 50, in <module>
    popt, pcov = curve_fit(my_sin, x_train, y_train)
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 736, in curve_fit
    res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 377, in leastsq
    shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 26, in _check_func
    res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
  File "/usr/lib/python3/dist-packages/scipy/optimize/minpack.py", line 454, in func_wrapped
    return func(xdata, *params) - ydata
ValueError: operands could not be broadcast together with shapes (38563,54) (38563,)
curve_fit() is handing your function my_sin() data with shape (38563, 54) (that is x_train.shape), and my_sin() returns data with that same shape. curve_fit needs the fitted function to return data with the same shape as y_train, so it can subtract the two and compute the error. Since the shapes do not match, the subtraction raises this exception.
I suspect you should be using linear regression in sklearn, not the curve_fit routine.
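If you do want a periodic fit, curve_fit expects a 1-D xdata whose length matches y. A rough sketch, fitting the sine against the timestamp column only (the initial-guess values are hypothetical; a daily period is assumed):
import numpy as np
from scipy.optimize import curve_fit
# curve_fit needs a 1-D xdata matching y in length, so fit against time only
t = df['date_time'].values        # seconds since epoch, computed earlier in the question
v = df['traffic_volume'].values
t_train, t_test, v_train, v_test = train_test_split(t, v, test_size=0.2, random_state=4)
p0 = [2 * np.pi / (24 * 3600), v_train.std(), 0.0, v_train.mean()]  # guess: daily period, rough amplitude/offset
popt, pcov = curve_fit(my_sin, t_train, v_train, p0=p0, maxfev=10000)
y_hat = my_sin(t_test, *popt)
Otherwise, sklearn's LinearRegression (already imported in the question) consumes the full (38563, 54) feature matrix directly.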

Unable to use FeatureUnion to combine processed numeric and categorical features in Python

I am trying to use Age and Gender to predict Med, but I am new to Pipeline and FeatureUnion in Scikit-learn and have encountered some issues. I read through some tutorials and answers, and that's how I wrote the code below, but I don't have a good grasp of how to feed the split data into the pipeline functions.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
# Import data into Pandas data frame
data_directory = 'C:/Users/Asus/'
file_name = 'Example.csv'
df = pd.read_csv(data_directory + file_name)
df_len = len(df)
# Get a list of all variables
print (list(df))
# Class that identifies Column type
class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X):
        return X[self.names]
numeric = [] # list of numeric column names
categorical = [] # list of categorical column names
# Creating random subsample for fast model building
def sample_n(df, n, replace=False, weight=None, seed=None):
    """Sample n rows from a DataFrame at random"""
    rs = np.random.RandomState(seed)
    locs = rs.choice(df.shape[0], size=n, replace=replace, p=weight)
    return df.take(locs, axis=0)
df = sample_n(df, n=300, seed=1123)
# Merge FG-LAI, SG-LAI and Both-LAI together into one group (MED=3)
df.ix[(df['MED']==4)|(df['MED']==5), 'MED']=3
# Remove No-Med (MED=1) and Both-LAI (MED=5) cases
df = df.drop(df[(df['MED']==1)|(df['MED']==5)].index)
# Separate target from training features
y = df['MED']
X = df.drop('MED', axis=1)
# Retain only the needed predictors
X = X.filter(['age', 'gender'])
# Find the numerical columns, exclude categorical columns
X_num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     test_size=0.5,
                                                     random_state=567,
                                                     stratify=y)
# Pipeline
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False)))
    ])),
    ('model', LogisticRegression())
])
# Declare hyperparameters
hyperparameters = {'logisticregression__c': [0.01, 0.1, 1.0, 10.0],
                   'logisticregression__penalty': ['l1', 'l2'],
                   'logisticregression__multi_class': ['ovr'],
                   'logisticregression__class_weight': ['balanced', None],
                   }
# SKlearn cross-validation with pipeline
clf = GridSearchCV(pipe, hyperparameters, cv=10)
# Fit and tune model
clf.fit(X_train, y_train)
Errors:
ValueError: Invalid parameter logisticregression for estimator Pipeline(memory=None,
steps=[('features', FeatureUnion(n_jobs=1,
transformer_list=[('numeric', Pipeline(memory=None,
steps=[('columns', Columns(names=[])), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('categorical', Pipeline(memory=None,
steps=[('columns', Columns(nam...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.
Edits:
print (pipe.get_params().keys())
gives
dict_keys(['memory', 'steps', 'features', 'LR_model', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__numeric', 'features__categorical', 'features__numeric__memory', 'features__numeric__steps', 'features__numeric__columns', 'features__numeric__standardscaler', 'features__numeric__columns__names', 'features__numeric__standardscaler__copy', 'features__numeric__standardscaler__with_mean', 'features__numeric__standardscaler__with_std', 'features__categorical__memory', 'features__categorical__steps', 'features__categorical__columns', 'features__categorical__onehotencoder', 'features__categorical__columns__names', 'features__categorical__onehotencoder__categorical_features', 'features__categorical__onehotencoder__dtype', 'features__categorical__onehotencoder__handle_unknown', 'features__categorical__onehotencoder__n_values', 'features__categorical__onehotencoder__sparse', 'LR_model__C', 'LR_model__class_weight', 'LR_model__dual', 'LR_model__fit_intercept', 'LR_model__intercept_scaling', 'LR_model__max_iter', 'LR_model__multi_class', 'LR_model__n_jobs', 'LR_model__penalty', 'LR_model__random_state', 'LR_model__solver', 'LR_model__tol', 'LR_model__verbose', 'LR_model__warm_start'])
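Those keys show that the grid-search parameter names must be prefixed with the pipeline step name, and that LogisticRegression exposes a capital C, e.g.:
hyperparameters = {'LR_model__C': [0.01, 0.1, 1.0, 10.0],
                   'LR_model__penalty': ['l1', 'l2'],
                   'LR_model__multi_class': ['ovr'],
                   'LR_model__class_weight': ['balanced', None],
                   }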
After changing into 'model__', I am getting the new error:
ValueError: Found array with 0 feature(s) (shape=(109, 0)) while a minimum of 1 is required by StandardScaler.
Edits 2:
# Retain only the needed predictors
#X = X.filter(['age', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num'])
X_selected = X.filter(['age', 'Geo', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num',
                       'DAD_readmit', 'Num_DAD_readmit', 'ED_readmit', 'NUmber_ED_readmit',
                       'Fail_renew', 'FR_num'])
# from the selected X, further choose categorical only
X_selected_cat = X_selected.filter(['Geo', 'ccis']) # hand selected since some cat var has value 0, 1
# Find the numerical columns, exclude categorical columns
X_num_cols = X_selected.columns[X_selected.dtypes.apply(lambda c: np.issubdtype(c, np.number))] # list of numeric column names, automated here
X_cat_cols = X_selected_cat.columns # list of categorical column names, previously hand-slected
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y,
                                                     test_size=0.5,
                                                     random_state=567,
                                                     stratify=y)
# Pipeline
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols), OneHotEncoder(sparse=False)))
    ])),
    ('LR_model', LogisticRegression())
])
Errors:
ValueError: could not convert string to float: 'Urban'
OneHotEncoder expects integer input, but you passed it strings. You could use LabelEncoder or LabelBinarizer to convert the strings to integers; then you will be able to use OneHotEncoder.
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols), LabelEncoder(), OneHotEncoder(sparse=False)))
    ])),
    ('LR_model', LogisticRegression())
])
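Note that LabelEncoder's fit/transform works on a single 1-D array of labels, so it may not slot directly into make_pipeline for a two-column DataFrame; one way to apply the suggestion is to integer-encode the string columns before they reach the pipeline. A minimal sketch, reusing X_selected and X_cat_cols from the question:
from sklearn.preprocessing import LabelEncoder
for col in X_cat_cols:
    # encode each categorical column to integers so OneHotEncoder only ever sees ints
    X_selected[col] = LabelEncoder().fit_transform(X_selected[col].astype(str))
After this, the original pipeline with just Columns and OneHotEncoder on the categorical branch should no longer hit the string-to-float error.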

How can I improve numpy's broadcast

I'm trying to implement k-NN with Mahalanobis distance in Python with numpy. However, the code below runs very slowly when I use broadcasting.
Please tell me how I can improve the numpy speed or implement this better.
from __future__ import division
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
import numpy as np
import matplotlib.pyplot as plt
mnist = fetch_mldata('MNIST original')
mnist_X, mnist_y = shuffle(mnist.data, mnist.target.astype('int32'))
mnist_X = mnist_X/255.0
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y, test_size=0.2)
k = 2
def data_gen(n):
    return train_X[train_y == n]
train_X_num = [data_gen(i) for i in range(10)]
inv_cov = [np.linalg.inv(np.cov(train_X_num[i], rowvar=0)+np.eye(784)*0.00001) for i in range(10)] # Making Inverse covariance matrices
d = [None] * 10
for i in range(10):
    ivec = train_X_num[i]                   # ivec shape is (number of 'i' samples, 784)
    ivec = ivec - test_X[:, np.newaxis, :]  # this line is very slow and uses huge memory
    iinv_cov = inv_cov[i]
    # calculate x.T inverse(sigma) x and keep the k+1 smallest distances
    d[i] = np.sort(np.add.reduce(np.dot(ivec, iinv_cov) * ivec, axis=2), axis=1)[:, :k + 1]
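One way to avoid the huge (n_test, n_i, 784) intermediate is to expand the quadratic form: (t - v).T A (t - v) = t.T A t + v.T A v - 2 t.T A v, which only needs 2-D matrix products, assuming each inv_cov[i] is (numerically) symmetric. A sketch of that reformulation, reusing the arrays from the question:
def mahalanobis_pairwise(test, train, A):
    # squared Mahalanobis distances between every test row and every train row,
    # computed via the expanded quadratic form (no 3-D broadcast is built)
    tA = test.dot(A)
    tt = np.einsum('ij,ij->i', tA, test)             # t.T A t, shape (n_test,)
    vv = np.einsum('ij,ij->i', train.dot(A), train)  # v.T A v, shape (n_train,)
    return tt[:, None] + vv[None, :] - 2 * tA.dot(train.T)

d = [None] * 10
for i in range(10):
    dist = mahalanobis_pairwise(test_X, train_X_num[i], inv_cov[i])
    d[i] = np.sort(dist, axis=1)[:, :k + 1]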