Tensorflow: How to zip together multiple concatenated datasets? - tensorflow

I am attempting to zip together multiple tf.data.Dataset objects for a multi-input keras model. Each tf.data.Dataset object is a concatenation of multiple dataframes, each with the same number of columns, but not necessarily the same number of lines.
I am able to create the full dataset, but then when I try to pass the dataset in a keras model I get an error :
TypeError: Inputs to a layer should be tensors. Got: <tensorflow.python.data.ops.dataset_ops._NestedVariant object
The problem is that I would really like to take advantage of the lazy structure of the tf.data.Dataset since I am using the window function, but I am having difficulty aggregating all the datasets together.
How do I zip together multiple datasets in a way that the can be passed into the model.fit() function?
Any help would be appreciated.
Here is a simple functional code that recreates my problem :
import pandas as pd
import numpy as np
import tensorflow as tf
# Create dataframes with 4 features and target
dataframe1 = pd.DataFrame(np.random.randn(1000, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
dataframe2 = pd.DataFrame(np.random.randn(800, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
# Convert dataframes to datasets
def get_dataset(df: pd.DataFrame, features):
dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].iloc[4:].to_numpy())
return dataset
def get_dataset_windowed(df: pd.DataFrame, features):
dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].to_numpy()).window(5, shift=1, stride=1,
drop_remainder=True)
return dataset
windowed_dataset = [get_dataset_windowed(x, ["feature3", "feature4"]) for x in [dataframe1, dataframe2]]
windowed_dataset = tf.data.Dataset.from_tensor_slices(windowed_dataset)
windowed_dataset = windowed_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
static_dataset = [get_dataset(x, ["feature1", "feature2"]) for x in [dataframe1, dataframe2]]
static_dataset = tf.data.Dataset.from_tensor_slices(static_dataset)
static_dataset = static_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
targets = [get_dataset(x, ["target"]) for x in [dataframe1, dataframe2]]
targets = tf.data.Dataset.from_tensor_slices(targets)
targets = targets.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
# Zip datasets together
full_dataset = tf.data.Dataset.zip(
(
{
"short_term_ts_input": windowed_dataset,
"static_input": static_dataset,
},
{
"output": targets,
}
)
)
full_dataset = full_dataset.shuffle(buffer_size=1024).batch(128)
# Creating, compiling and fitting model
short_term_ts_input = tf.keras.Input(shape=(5, 2), name="short_term_ts_input")
static_input = tf.keras.Input(shape=(2), name="static_input")
targets = tf.keras.Input(shape=(1,), name="output")
short_term_ts_features = tf.keras.layers.LSTM(32, return_sequences=False)(short_term_ts_input)
short_term_ts_features = tf.keras.layers.Dense(8)(short_term_ts_features)
static_features = tf.keras.layers.Dense(16)(static_input)
x_concat = tf.keras.layers.concatenate([short_term_ts_features, static_features])
x_concat = tf.keras.layers.Dense(32)(x_concat)
output = tf.keras.layers.Dense(1)(x_concat)
model = tf.keras.Model(inputs=[short_term_ts_input, static_input], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
tf.keras.utils.plot_model(model, "model_test.png", show_shapes=True)
model.fit(full_dataset)`

Maybe something like this:
import pandas as pd
import numpy as np
import tensorflow as tf
# Create dataframes with 4 features and target
dataframe1 = pd.DataFrame(np.random.randn(1000, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
dataframe2 = pd.DataFrame(np.random.randn(800, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
# Convert dataframes to datasets
def get_dataset(df: pd.DataFrame, features):
dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].iloc[4:].to_numpy())
return dataset
def get_dataset_windowed(df: pd.DataFrame, features):
dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].to_numpy()).window(5, shift=1, stride=1,
drop_remainder=True)
return dataset
windowed_dataset = [get_dataset_windowed(x, ["feature3", "feature4"]) for x in [dataframe1, dataframe2]]
windowed_dataset = tf.data.Dataset.from_tensor_slices(windowed_dataset)
windowed_dataset = windowed_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE).flat_map(lambda z: z.batch(5))
static_dataset = [get_dataset(x, ["feature1", "feature2"]) for x in [dataframe1, dataframe2]]
static_dataset = tf.data.Dataset.from_tensor_slices(static_dataset)
static_dataset = static_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
targets = [get_dataset(x, ["target"]) for x in [dataframe1, dataframe2]]
targets = tf.data.Dataset.from_tensor_slices(targets)
targets = targets.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
# Zip datasets together
full_dataset = tf.data.Dataset.zip(
(
{
"short_term_ts_input": windowed_dataset,
"static_input": static_dataset,
},
)
)
full_dataset = tf.data.Dataset.zip((full_dataset, targets))
full_dataset = full_dataset.shuffle(buffer_size=1024).batch(128)
# Creating, compiling and fitting model
short_term_ts_input = tf.keras.Input(shape=(5, 2), name="short_term_ts_input")
static_input = tf.keras.Input(shape=(2), name="static_input")
short_term_ts_features = tf.keras.layers.LSTM(32, return_sequences=False)(short_term_ts_input)
short_term_ts_features = tf.keras.layers.Dense(8)(short_term_ts_features)
static_features = tf.keras.layers.Dense(16)(static_input)
x_concat = tf.keras.layers.concatenate([short_term_ts_features, static_features])
x_concat = tf.keras.layers.Dense(32)(x_concat)
output = tf.keras.layers.Dense(1)(x_concat)
model = tf.keras.Model(inputs=[short_term_ts_input, static_input], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')
tf.keras.utils.plot_model(model, "model_test.png", show_shapes=True)
model.fit(full_dataset)

Related

How to implement tf.gather with some other tf ops

I want to replace tf.gather with some simple and common tf ops like concat,stack,reshape,slice etc to achieve the same result, because tf.gather is not supported in some deployment framework currently(e.g. the operators that ncnn supports).
Test code:
import numpy as np
import tensorflow as tf
params = np.random.rand(5, 2)
params = tf.constant(params)
print("==>> params: ", params)
indices = np.random.randint(low=0,high=5, size=(3,3))
indices = tf.constant(indices)
print("==>> indices: ", indices)
result = tf.gather(params, indices) # to reimplement it
print("==>> result: ", result)
print("==>> result.shape: ", result.shape)
It is easy, they can create two different images with eye-seeing information or as in the video game we play for safe available bandwidths.
Tf.Gather can implement in many useful ways that include selective information.
[ Sample ]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Working
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
IMAGE_1 = plt.imread("C:\\Users\\Jirayu Kaewprateep\\Pictures\\Cats\\206480490_10158968022281077_7829901115614154740_n_10158968022276077.jpg")
IMAGE_1 = tf.image.resize(IMAGE_1, [log_scales_height, log_scales_width]).numpy()
IMAGE_1 = tf.constant( IMAGE_1, dtype=tf.int32 ).numpy()
params = plt.imread("C:\\Users\\Jirayu Kaewprateep\\Pictures\\Cats\\206480490_10158968022281077_7829901115614154740_n_10158968022276077.jpg")
params = tf.reshape( params, ( params.shape[0] * params.shape[1], 3 ))
limit = tf.math.divide( params.shape[0] * params.shape[1], delta, name='Devide' ).numpy()
limit = 414720
limit = 691200
indices = tf.range(start, limit, delta)
result = tf.gather(params, indices)
result = tf.reshape( result, (240, 288, 3) )
plt.figure(figsize=(1,2))
plt.subplot(1,2,1)
plt.xticks([])
plt.yticks([])
plt.grid(False)
plt.imshow(IMAGE_1)
plt.xlabel(" rescaling ")
plt.subplot(1,2,2)
plt.xticks([])
plt.yticks([])
plt.grid(False)
plt.imshow(result)
plt.xlabel(" logarithms pictires ")
plt.show()
input('...')
[ Output ]:
See the attache picture.

How to use multiple CSV files for machine learning anomaly detection

I have a question about how to get my data in a shape that I can use for my ML model. I have multiple CSV files that I want to fit in an algorithm for anomaly detection. My data consists of many files with each being the recorded data from a sensor with two features (intensity and depth) and one timestamp per data point. Each file is labeled with 0 = faulty data and 1 = good data.
Let's say I have 20 files: y should be the label per file y = [[1], [0], ...] and X should be all the data from the sensor X = [[data_file0], [data_file1], ..., [data_file19]] that I can use to train my models. What can I do to get my data in the right format? I tried appending the data frame of every file to a list and transformed it to a dataset and a np.array and so on. I tried different shapes too.
all_files = glob.glob(path + "/*.txt")
df_list = []
snr_list = []
for filename in all_files:
#Für jede Datei wird ein df angelegt und unwichtige features entfernt
#try with dataset with filename and all_files
dataset = tf.data.Dataset.from_tensor_slices(all_files)
def parse_fn(filename):
return tf.data.Dataset.range(10)
dataset = dataset.interleave(lambda x:
tf.data.TextLineDataset(x).map(parse_fn, num_parallel_calls=1),
cycle_length=4, block_length=16)
#try df_list
df = pd.read_csv(filename, index_col=0, header=0, decimal = '.', delimiter = ';')
df.drop(columns=['ET_original', 'Auslenkung_ET', 'ET_unkorrigiert'], axis = 1, inplace = True)
#Zu jedem Zeitpunkt wird der Startzeitpunkt abgezogen: t0 = 1 ... tn = t_n - t0
starttime = df.Zeit_ET[0]
for row in df.itertuples():
df.at[row.Index, 'Zeit_ET'] = df.Zeit_ET[row.Index] - starttime
df.Zeit_ET[0] = 1
#alle arrays einer List hinzufügen
df_list.append(df.to_numpy().reshape(-1, 1700, 3))
#other testings
#test = tf.constant(pd.DataFrame(dic, columns=['1', '1', ' 1']))
#ps=pd.DataFrame(dic, index=['dsf'])
#df_list, test_df (1 df), und tf_const (1 df) zurückgeben
return df_list, df.to_numpy().reshape(-1, 1700, 3), tf.constant(df.to_numpy().reshape(1, 1700, 3), dtype = tf.float32)
#nur für Testzwecke
df_list, test_df, tf_const = Alle_OCT_txt_Daten()
It sounds like the files are the same, but each has a distinct time stamp, right. Juts load everything into a dataframe and run your AI or ML algo on the dataframe.
# import necessary libraries
import pandas as pd
import os
import glob
# use glob to get all the csv files
# in the folder
path = 'C:\\your_path_here\\'
csv_files = glob.glob(os.path.join(path, "*.csv"))
li = []
for filename in csv_files:
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
print(frame)

Passing pandas NumPy arrays as feature vectors in scikit learn?

I have a vector of 5 different values that I use as my sample value, and the label is a single integer of 0, 1, or 3. The machine learning algorithms work when I pass an array as a sample, but I get this warning. How do I pass feature vectors without getting this warning?
import numpy as np
from numpy import random
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import pandas as pd
filepath = 'test.csv'
# example label values
index = [0,1,3,1,1,1,0,0]
# example sample arrays
data = []
for i in range(len(index)):
d = []
for i in range(6):
d.append(random.randint(50,200))
data.append(d)
feat1 = 'brightness'
feat2, feat3, feat4 = ['h', 's', 'v']
feat5 = 'median hue'
feat6 = 'median value'
features = [feat1, feat2, feat3, feat4, feat5, feat6]
df = pd.DataFrame(data, columns=features, index=index)
df.index.name = 'state'
with open(filepath, 'a') as f:
df.to_csv(f, header=f.tell() == 0)
states = pd.read_csv(filepath, usecols=['state'])
df_partial = pd.read_csv(filepath, usecols=features)
states = states.astype(np.float32)
states = states.values
labels = states
samples = np.array([])
for i, row in df_partial.iterrows():
r = row.values
samples = np.vstack((samples, r)) if samples.size else r
n_neighbors = 5
test_size = .3
labels, test_labels, samples, test_samples = train_test_split(labels, samples, test_size=test_size)
clf1 = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf1 = clf1.fit(samples, labels)
score1 = clf1.score(test_samples, test_labels)
print("Here's how the models performed \nknn: %d %%" %(score1 * 100))
Warning:
"DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). clf1 = clf1.fit(samples, labels)"
sklearn documentation for fit(self, X, Y)
Try replacing
states = states.values by states = states.values.flatten()
OR
clf1 = clf1.fit(samples, labels) by clf1 = clf1.fit(samples, labels.flatten()).
states = states.values holds the correct labels that were stored in your panda dataframe, however they are getting stored on different rows. Using .flatten() put all those labels on the same row. (https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.ndarray.flatten.html)
In Sklearn's KNeighborsClassifier documentation
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), they show in their example that the labels must be stored on the same row: y = [0, 0, 1, 1].
When you retrieve data from dataframe states, it is stored in multiple rows (column vector) whereas it expected values in single row.
You can also try using ravel() function which is used to create a contiguous flattened array.
numpy.ravel(array, order = ‘C’) : returns contiguous flattened array (1D array with all the input-array elements and with the same type as it)
Try:
states = states.values.ravel() in place of states = states.values

why does tf.estimator.DNNRegressor predict negative y value?

It is so weird for the predict() function in tf.estimator.DNNRegressor because it predict negative y value, but the training dataset has no negative y value. I found this when I reduced the value of y by 1000 times, say if y was 12000 before, now I change it to 12. The range of y is [3-400] now, but after I did this, the predict() function output some negative values. I didn't set the active function in tf.estimator.DNNRegressor, so the default active function is relu which range is [0-max], but why it predicts negative value? is some bug in tf.estimator.DNNRegressor? or is there no active function applied for y? Thank you.
The code is:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import pandas as pd
import tensorflow as tf
from sklearn import datasets, metrics
import csv
tf.logging.set_verbosity(tf.logging.INFO)
COLUMNS = ["col1","col2","col3","col4","col5","col6","col7","col8","col9","col10","col11","col12","col13","col14","col15","col16","col17","col18","col19","col20","col21","col22","col23","col24","col25","col26","col27","col28","col29","col30","col31","col32","col33","col34","col35","col36","col37","col38","col39","col40","col41","col42","col43","col44","col45","col46","col47","col48","col49","col50","col51","col52","col53","col54","col55","col56","col57","col58","col59","col60","col61","col62","col63","col64","col65","col66","col67","col68","col69","col70","col71","col72","col73","col74","col75","col76","col77","col78","col79","col80","col81","col82","col83","col84","col85","col86","col87","col88","col89","col90","col91","col92","col93","col94","col95","col96","col97","col98","col99","col100","col101","col102","col103","col104","col105","col106","col107","col108","col109","col110","col111","col112","col113","col114","col115","col116","col117","col118","col119","col120","col121","col122","col123","col124","col125","col126","col127","col128","col129","col130","col131","col132","col133","col134","col135","col136","col137","col138","col139","col140","col141","col142","col143","col144","col145","col146","col147","col148","col149","col150","col151","col152","col153","col154","col155","col156","col157","col158","col159","col160","col161","col162","col163","col164","col165","col166","col167","col168","col169","col170","col171","col172","col173","col174","col175","col176","col177","col178","col179","col180","col181","col182","col183","col184","col185","col186","col187","col188","col189","col190","col191","col192","col193","col194","col195","col196","col197","col198","col199","col200","col201","col202","col203","col204","col205","col206","col207","col208","col209","col210","col211","col212","col213","col214"]
FEATURES = ["col1","col2","col3","col4","col5","col6","col7","col8","col9","col10","col11","col12","col13","col14","col15","col16","col17","col18","col19","col20","col21","col22","col23","col24","col25","col26","col27","col28","col29","col30","col31","col32","col33","col34","col35","col36","col37","col38","col39","col40","col41","col42","col43","col44","col45","col46","col47","col48","col49","col50","col51","col52","col53","col54","col55","col56","col57","col58","col59","col60","col61","col62","col63","col64","col65","col66","col67","col68","col69","col70","col71","col72","col73","col74","col75","col76","col77","col78","col79","col80","col81","col82","col83","col84","col85","col86","col87","col88","col89","col90","col91","col92","col93","col94","col95","col96","col97","col98","col99","col100","col101","col102","col103","col104","col105","col106","col107","col108","col109","col110","col111","col112","col113","col114","col115","col116","col117","col118","col119","col120","col121","col122","col123","col124","col125","col126","col127","col128","col129","col130","col131","col132","col133","col134","col135","col136","col137","col138","col139","col140","col141","col142","col143","col144","col145","col146","col147","col148","col149","col150","col151","col152","col153","col154","col155","col156","col157","col158","col159","col160","col161","col162","col163","col164","col165","col166","col167","col168","col169","col170","col171","col172","col173","col174","col175","col176","col177","col178","col179","col180","col181","col182","col183","col184","col185","col186","col187","col188","col189","col190","col191","col192","col193","col194","col195","col196","col197","col198","col199","col200","col201","col202","col203","col204","col205","col206","col207","col208","col209","col211","col212","col213"]
LABEL = "col214"
def get_input_fn(data_set, num_epochs=None, shuffle=True):
return tf.estimator.inputs.pandas_input_fn(
x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
y=pd.Series(data_set[LABEL].values),
num_epochs=num_epochs,
shuffle=shuffle)
def get_mae(y_pre, y_target):
absError = []
for i in range(len(y_pre)):
absError.append(abs(y_pre[i] - y_target[i]))
return sum(absError) / len(absError)
def get_mse(y_pre, y_target):
squaredError = []
for i in range(len(y_pre)):
val = y_pre[i] - y_target[i]
squaredError.append(val * val)
return sum(squaredError) / len (squaredError)
training_set = pd.read_csv("train.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
test_set = pd.read_csv("test.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
predict_set = pd.read_csv("predict.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols, hidden_units=[250, 200, 100, 50], model_dir="./model")
regressor.train(input_fn=get_input_fn(training_set), steps=8000)
ev = regressor.evaluate(input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
loss_score = ev["loss"]
print("Loss: {0:f}".format(loss_score))
predict = regressor.predict(input_fn=get_input_fn(predict_set, num_epochs=1, shuffle=False))
y_predict = predict_set[LABEL].values.tolist()
print(type(y_predict))
print(y_predict)
list_predict = list(predict)
print(type(list_predict))
y_predicted = []
for i in range(len(list_predict)):
y_predicted.append(list_predict[i]['predictions'][0])
print(y_predicted)
fileObject = open('time_prediction.txt', 'w')
for time in y_predicted:
fileObject.write(str(time))
fileObject.write('\n')
fileObject.close()
mae = get_mae(y_predict, y_predicted)
mse = get_mse(y_predict, y_predicted)
print("Mean Absolute Error:" + str(mae) + " Mean Squared Error:" + str(mse))
#mae = tf.metrics.mean_absolute_error(y_predict, list_predict)
#print(mea)
This is the 3 data records of the dataset:
2399.998,4,100,100,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,2,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,1,4,13,4,0,11,14,15,10,8,0,0,3,1,0,0,0,0,0,0,0,0,0,0,1,364,123428,1397595,16772133,56,103,16772153,22,22,11
1919.9984,2,30,30,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,0,0,12,2,0,9,14,10,9,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,17525535,34347257,1397595,5590711,16698805,103,5913257,896853,1190468,25
479.9996,2,60,60,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,0,0,12,2,0,9,14,10,9,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,17525535,34347257,1397595,5590711,16698805,103,5913257,896853,1190468,168
The last column is y.

OneHotEncoding mapping issue between training data and test data

I've transformed training and test data set by sklearn OneHotEncoding method. However, trnsformed results have different type shape. So It is impossible to apply to other algorithms like logistic regression.
How do I reshape the test data in accordance with the training data set's shape?
Best Regardings, Chris
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
def data_transformation(data, dummy):
le = LabelEncoder()
# Encoding the columns with multiple categorical levels
for col1 in dummy:
le.fit(data[col1])
data[col1] = le.transform(data[col1])
dummy_data = np.array(data[dummy])
enc = OneHotEncoder()
enc.fit(dummy_data)
dummy_data = enc.transform(dummy_data).toarray()
if __name__ == '__main__':
data = pd.read_csv('train.data', delimiter=',')
data_test = pd.read_csv('test.data', delimiter=',')
dummy_columns = ['Column1', 'Column2']
data = data_transformation(data, dummy_columns)
data_test = data_transformation(data_test, dummy_columns)
# result
# data shape : (200000, 71 )
# data_test shape : ( 15000, 32)
Thank you so much, Vivek! I've solved this issue due to your help.
def data_transformation2(data, data_test, dummy):
le = LabelEncoder()
# Encoding the columns with multiple categorical levels
for col in dummy:
le.fit(data[col])
data[col] = le.transform(data[col])
for col in dummy:
le.fit(data_test[col])
data_test[col] = le.transform(data_test[col])
enc = OneHotEncoder()
dummy_data = np.array(data[dummy])
dummy_data_test = np.array(data_test[dummy])
enc.fit(dummy_data)
dummy_data = enc.transform(dummy_data).toarray()
dummy_data_test = enc.transform(dummy_data_test).toarray()
print(dummy_data.shape)
print(dummy_data_test.shape)