OneHotEncoding mapping issue between training data and test data - pandas
I've transformed the training and test data sets with sklearn's OneHotEncoder. However, the transformed results have different shapes, so it is impossible to apply them to other algorithms like logistic regression.
How do I reshape the test data in accordance with the training data set's shape?
Best regards, Chris
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def data_transformation(data, dummy):
    le = LabelEncoder()
    # Encode the columns with multiple categorical levels as integers
    for col1 in dummy:
        le.fit(data[col1])
        data[col1] = le.transform(data[col1])
    dummy_data = np.array(data[dummy])
    # One-hot encode the integer-coded columns
    enc = OneHotEncoder()
    enc.fit(dummy_data)
    dummy_data = enc.transform(dummy_data).toarray()
    return dummy_data

if __name__ == '__main__':
    data = pd.read_csv('train.data', delimiter=',')
    data_test = pd.read_csv('test.data', delimiter=',')
    dummy_columns = ['Column1', 'Column2']
    data = data_transformation(data, dummy_columns)
    data_test = data_transformation(data_test, dummy_columns)
    # result
    # data shape : (200000, 71)
    # data_test shape : (15000, 32)
Thank you so much, Vivek! I've solved this issue thanks to your help.
def data_transformation2(data, data_test, dummy):
    le = LabelEncoder()
    # Encoding the columns with multiple categorical levels
    for col in dummy:
        le.fit(data[col])
        data[col] = le.transform(data[col])
    for col in dummy:
        le.fit(data_test[col])
        data_test[col] = le.transform(data_test[col])
    # Fit the one-hot encoder on the training data only,
    # then use the same fitted encoder to transform both sets
    enc = OneHotEncoder()
    dummy_data = np.array(data[dummy])
    dummy_data_test = np.array(data_test[dummy])
    enc.fit(dummy_data)
    dummy_data = enc.transform(dummy_data).toarray()
    dummy_data_test = enc.transform(dummy_data_test).toarray()
    print(dummy_data.shape)
    print(dummy_data_test.shape)
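For reference, on newer scikit-learn versions (0.20+) OneHotEncoder can consume the string columns directly, and handle_unknown='ignore' protects against categories that appear only in the test set. This is a minimal sketch of that approach under those assumptions, not the code from the thread:

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')        # unseen test categories become all-zero rows
enc.fit(data[dummy_columns])                        # fit on the training data only
train_encoded = enc.transform(data[dummy_columns]).toarray()
test_encoded = enc.transform(data_test[dummy_columns]).toarray()
# train_encoded and test_encoded now have the same number of columns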
Related
Tensorflow: How to zip together multiple concatenated datasets?
I am attempting to zip together multiple tf.data.Dataset objects for a multi-input keras model. Each tf.data.Dataset object is a concatenation of multiple dataframes, each with the same number of columns but not necessarily the same number of rows. I am able to create the full dataset, but when I try to pass the dataset to a keras model I get an error: TypeError: Inputs to a layer should be tensors. Got: <tensorflow.python.data.ops.dataset_ops._NestedVariant object. I would really like to take advantage of the lazy structure of tf.data.Dataset since I am using the window function, but I am having difficulty aggregating all the datasets together. How do I zip together multiple datasets in a way that they can be passed into the model.fit() function? Any help would be appreciated. Here is a simple functional example that recreates my problem:

import pandas as pd
import numpy as np
import tensorflow as tf

# Create dataframes with 4 features and target
dataframe1 = pd.DataFrame(np.random.randn(1000, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
dataframe2 = pd.DataFrame(np.random.randn(800, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])

# Convert dataframes to datasets
def get_dataset(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].iloc[4:].to_numpy())
    return dataset

def get_dataset_windowed(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].to_numpy()).window(5, shift=1, stride=1, drop_remainder=True)
    return dataset

windowed_dataset = [get_dataset_windowed(x, ["feature3", "feature4"]) for x in [dataframe1, dataframe2]]
windowed_dataset = tf.data.Dataset.from_tensor_slices(windowed_dataset)
windowed_dataset = windowed_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

static_dataset = [get_dataset(x, ["feature1", "feature2"]) for x in [dataframe1, dataframe2]]
static_dataset = tf.data.Dataset.from_tensor_slices(static_dataset)
static_dataset = static_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

targets = [get_dataset(x, ["target"]) for x in [dataframe1, dataframe2]]
targets = tf.data.Dataset.from_tensor_slices(targets)
targets = targets.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

# Zip datasets together
full_dataset = tf.data.Dataset.zip(
    (
        {
            "short_term_ts_input": windowed_dataset,
            "static_input": static_dataset,
        },
        {
            "output": targets,
        }
    )
)
full_dataset = full_dataset.shuffle(buffer_size=1024).batch(128)

# Creating, compiling and fitting model
short_term_ts_input = tf.keras.Input(shape=(5, 2), name="short_term_ts_input")
static_input = tf.keras.Input(shape=(2), name="static_input")
targets = tf.keras.Input(shape=(1,), name="output")

short_term_ts_features = tf.keras.layers.LSTM(32, return_sequences=False)(short_term_ts_input)
short_term_ts_features = tf.keras.layers.Dense(8)(short_term_ts_features)

static_features = tf.keras.layers.Dense(16)(static_input)

x_concat = tf.keras.layers.concatenate([short_term_ts_features, static_features])
x_concat = tf.keras.layers.Dense(32)(x_concat)
output = tf.keras.layers.Dense(1)(x_concat)

model = tf.keras.Model(inputs=[short_term_ts_input, static_input], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
tf.keras.utils.plot_model(model, "model_test.png", show_shapes=True)

model.fit(full_dataset)
Maybe something like this:

import pandas as pd
import numpy as np
import tensorflow as tf

# Create dataframes with 4 features and target
dataframe1 = pd.DataFrame(np.random.randn(1000, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
dataframe2 = pd.DataFrame(np.random.randn(800, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])

# Convert dataframes to datasets
def get_dataset(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].iloc[4:].to_numpy())
    return dataset

def get_dataset_windowed(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].to_numpy()).window(5, shift=1, stride=1, drop_remainder=True)
    return dataset

windowed_dataset = [get_dataset_windowed(x, ["feature3", "feature4"]) for x in [dataframe1, dataframe2]]
windowed_dataset = tf.data.Dataset.from_tensor_slices(windowed_dataset)
windowed_dataset = windowed_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE).flat_map(lambda z: z.batch(5))

static_dataset = [get_dataset(x, ["feature1", "feature2"]) for x in [dataframe1, dataframe2]]
static_dataset = tf.data.Dataset.from_tensor_slices(static_dataset)
static_dataset = static_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

targets = [get_dataset(x, ["target"]) for x in [dataframe1, dataframe2]]
targets = tf.data.Dataset.from_tensor_slices(targets)
targets = targets.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

# Zip datasets together
full_dataset = tf.data.Dataset.zip(
    (
        {
            "short_term_ts_input": windowed_dataset,
            "static_input": static_dataset,
        },
    )
)
full_dataset = tf.data.Dataset.zip((full_dataset, targets))
full_dataset = full_dataset.shuffle(buffer_size=1024).batch(128)

# Creating, compiling and fitting model
short_term_ts_input = tf.keras.Input(shape=(5, 2), name="short_term_ts_input")
static_input = tf.keras.Input(shape=(2), name="static_input")

short_term_ts_features = tf.keras.layers.LSTM(32, return_sequences=False)(short_term_ts_input)
short_term_ts_features = tf.keras.layers.Dense(8)(short_term_ts_features)

static_features = tf.keras.layers.Dense(16)(static_input)

x_concat = tf.keras.layers.concatenate([short_term_ts_features, static_features])
x_concat = tf.keras.layers.Dense(32)(x_concat)
output = tf.keras.layers.Dense(1)(x_concat)

model = tf.keras.Model(inputs=[short_term_ts_input, static_input], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')
tf.keras.utils.plot_model(model, "model_test.png", show_shapes=True)

model.fit(full_dataset)
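The key change relative to the question is the .flat_map(lambda z: z.batch(5)) applied to the windowed dataset: Dataset.window() yields a dataset of sub-datasets (which is why a _NestedVariant object reaches the layer), and batching each window by its own size turns it back into a plain tensor. A minimal sketch of just that pattern, assuming a window size of 5 to match the Input shape above:

import tensorflow as tf

ds = tf.data.Dataset.range(10)
windows = ds.window(5, shift=1, drop_remainder=True)   # dataset of sub-datasets
tensors = windows.flat_map(lambda w: w.batch(5))       # dataset of shape-(5,) tensors
for t in tensors.take(2):
    print(t.numpy())                                   # [0 1 2 3 4], then [1 2 3 4 5]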
How to use multiple CSV files for machine learning anomaly detection
I have a question about how to get my data into a shape that I can use for my ML model. I have multiple CSV files that I want to feed into an algorithm for anomaly detection. My data consists of many files, each being the recorded data from a sensor with two features (intensity and depth) and one timestamp per data point. Each file is labeled with 0 = faulty data and 1 = good data. Let's say I have 20 files: y should be the label per file, y = [[1], [0], ...], and X should be all the data from the sensor, X = [[data_file0], [data_file1], ..., [data_file19]], that I can use to train my models. What can I do to get my data into the right format? I tried appending the data frame of every file to a list and transforming it to a dataset and a np.array and so on. I tried different shapes too.

def Alle_OCT_txt_Daten():
    all_files = glob.glob(path + "/*.txt")
    df_list = []
    snr_list = []
    for filename in all_files:
        # For each file a df is created and unimportant features are removed
        # try with dataset with filename and all_files
        dataset = tf.data.Dataset.from_tensor_slices(all_files)
        def parse_fn(filename):
            return tf.data.Dataset.range(10)
        dataset = dataset.interleave(lambda x: tf.data.TextLineDataset(x).map(parse_fn, num_parallel_calls=1), cycle_length=4, block_length=16)
        # try df_list
        df = pd.read_csv(filename, index_col=0, header=0, decimal='.', delimiter=';')
        df.drop(columns=['ET_original', 'Auslenkung_ET', 'ET_unkorrigiert'], axis=1, inplace=True)
        # Subtract the start time from every timestamp: t0 = 1 ... tn = t_n - t0
        starttime = df.Zeit_ET[0]
        for row in df.itertuples():
            df.at[row.Index, 'Zeit_ET'] = df.Zeit_ET[row.Index] - starttime
        df.Zeit_ET[0] = 1
        # add all arrays to one list
        df_list.append(df.to_numpy().reshape(-1, 1700, 3))
    # other testings
    # test = tf.constant(pd.DataFrame(dic, columns=['1', '1', ' 1']))
    # ps = pd.DataFrame(dic, index=['dsf'])
    # return df_list, test_df (1 df), and tf_const (1 df)
    return df_list, df.to_numpy().reshape(-1, 1700, 3), tf.constant(df.to_numpy().reshape(1, 1700, 3), dtype=tf.float32)

# only for testing purposes
df_list, test_df, tf_const = Alle_OCT_txt_Daten()
It sounds like the files are the same, but each has a distinct time stamp, right? Just load everything into a dataframe and run your AI or ML algo on the dataframe.

# import necessary libraries
import pandas as pd
import os
import glob

# use glob to get all the csv files in the folder
path = 'C:\\your_path_here\\'
csv_files = glob.glob(os.path.join(path, "*.csv"))

li = []
for filename in csv_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
print(frame)
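If you do need the per-file structure described in the question (one label per file) rather than one concatenated frame, here is a minimal sketch under the assumptions that every file has the same number of rows and that a label can be determined per file; the get_label_for helper is hypothetical and stands in for however the 0/1 label is actually stored in your data:

import glob
import numpy as np
import pandas as pd

X_list, y_list = [], []
for filename in glob.glob(path + "/*.txt"):
    df = pd.read_csv(filename, index_col=0, header=0, decimal='.', delimiter=';')
    X_list.append(df.to_numpy())             # shape (n_rows, n_features) per file
    y_list.append(get_label_for(filename))   # hypothetical helper: 0 = faulty, 1 = good

X = np.stack(X_list)   # shape (n_files, n_rows, n_features)
y = np.array(y_list)   # shape (n_files,)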
Passing pandas NumPy arrays as feature vectors in scikit learn?
I have a vector of 5 different values that I use as my sample value, and the label is a single integer of 0, 1, or 3. The machine learning algorithms work when I pass an array as a sample, but I get this warning. How do I pass feature vectors without getting this warning?

import numpy as np
from numpy import random
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import pandas as pd

filepath = 'test.csv'

# example label values
index = [0,1,3,1,1,1,0,0]

# example sample arrays
data = []
for i in range(len(index)):
    d = []
    for i in range(6):
        d.append(random.randint(50,200))
    data.append(d)

feat1 = 'brightness'
feat2, feat3, feat4 = ['h', 's', 'v']
feat5 = 'median hue'
feat6 = 'median value'
features = [feat1, feat2, feat3, feat4, feat5, feat6]

df = pd.DataFrame(data, columns=features, index=index)
df.index.name = 'state'
with open(filepath, 'a') as f:
    df.to_csv(f, header=f.tell() == 0)

states = pd.read_csv(filepath, usecols=['state'])
df_partial = pd.read_csv(filepath, usecols=features)

states = states.astype(np.float32)
states = states.values
labels = states

samples = np.array([])
for i, row in df_partial.iterrows():
    r = row.values
    samples = np.vstack((samples, r)) if samples.size else r

n_neighbors = 5
test_size = .3
labels, test_labels, samples, test_samples = train_test_split(labels, samples, test_size=test_size)
clf1 = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf1 = clf1.fit(samples, labels)
score1 = clf1.score(test_samples, test_labels)
print("Here's how the models performed \nknn: %d %%" % (score1 * 100))

Warning: "DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). clf1 = clf1.fit(samples, labels)"

sklearn documentation for fit(self, X, Y)
Try replacing states = states.values with states = states.values.flatten(), OR clf1 = clf1.fit(samples, labels) with clf1 = clf1.fit(samples, labels.flatten()). states = states.values holds the correct labels that were stored in your pandas dataframe, but they end up stored on different rows (one label per row). Using .flatten() puts all those labels on the same row. (https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.ndarray.flatten.html) In sklearn's KNeighborsClassifier documentation (https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), the example shows that the labels must be stored on the same row: y = [0, 0, 1, 1].
When you retrieve data from the dataframe states, it is stored in multiple rows (a column vector), whereas fit() expects the values in a single row. You can also try the ravel() function, which creates a contiguous flattened array: numpy.ravel(array, order='C') returns a contiguous flattened array (a 1-D array with all the input-array elements and the same type as the input). Try states = states.values.ravel() in place of states = states.values.
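For illustration, a minimal sketch of the shape difference both answers describe:

import numpy as np

y_col = np.array([[0], [1], [3]])   # column vector, shape (3, 1) -- triggers the DataConversionWarning
y_1d = y_col.ravel()                # 1-D array, shape (3,) -- what fit() expects for y
print(y_col.shape, y_1d.shape)      # (3, 1) (3,)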
why does tf.estimator.DNNRegressor predict negative y value?
Something weird happens with the predict() function in tf.estimator.DNNRegressor: it predicts negative y values, but the training dataset has no negative y values. I found this when I reduced the value of y by 1000 times, say if y was 12000 before, now I change it to 12. The range of y is [3-400] now, but after I did this, the predict() function output some negative values. I didn't set the activation function in tf.estimator.DNNRegressor, so the default activation function is relu, whose range is [0-max], so why does it predict negative values? Is this a bug in tf.estimator.DNNRegressor, or is there no activation function applied to y? Thank you. The code is:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools
import pandas as pd
import tensorflow as tf
from sklearn import datasets, metrics
import csv

tf.logging.set_verbosity(tf.logging.INFO)

COLUMNS = ["col1","col2","col3","col4","col5","col6","col7","col8","col9","col10","col11","col12","col13","col14","col15","col16","col17","col18","col19","col20","col21","col22","col23","col24","col25","col26","col27","col28","col29","col30","col31","col32","col33","col34","col35","col36","col37","col38","col39","col40","col41","col42","col43","col44","col45","col46","col47","col48","col49","col50","col51","col52","col53","col54","col55","col56","col57","col58","col59","col60","col61","col62","col63","col64","col65","col66","col67","col68","col69","col70","col71","col72","col73","col74","col75","col76","col77","col78","col79","col80","col81","col82","col83","col84","col85","col86","col87","col88","col89","col90","col91","col92","col93","col94","col95","col96","col97","col98","col99","col100","col101","col102","col103","col104","col105","col106","col107","col108","col109","col110","col111","col112","col113","col114","col115","col116","col117","col118","col119","col120","col121","col122","col123","col124","col125","col126","col127","col128","col129","col130","col131","col132","col133","col134","col135","col136","col137","col138","col139","col140","col141","col142","col143","col144","col145","col146","col147","col148","col149","col150","col151","col152","col153","col154","col155","col156","col157","col158","col159","col160","col161","col162","col163","col164","col165","col166","col167","col168","col169","col170","col171","col172","col173","col174","col175","col176","col177","col178","col179","col180","col181","col182","col183","col184","col185","col186","col187","col188","col189","col190","col191","col192","col193","col194","col195","col196","col197","col198","col199","col200","col201","col202","col203","col204","col205","col206","col207","col208","col209","col210","col211","col212","col213","col214"]
["col1","col2","col3","col4","col5","col6","col7","col8","col9","col10","col11","col12","col13","col14","col15","col16","col17","col18","col19","col20","col21","col22","col23","col24","col25","col26","col27","col28","col29","col30","col31","col32","col33","col34","col35","col36","col37","col38","col39","col40","col41","col42","col43","col44","col45","col46","col47","col48","col49","col50","col51","col52","col53","col54","col55","col56","col57","col58","col59","col60","col61","col62","col63","col64","col65","col66","col67","col68","col69","col70","col71","col72","col73","col74","col75","col76","col77","col78","col79","col80","col81","col82","col83","col84","col85","col86","col87","col88","col89","col90","col91","col92","col93","col94","col95","col96","col97","col98","col99","col100","col101","col102","col103","col104","col105","col106","col107","col108","col109","col110","col111","col112","col113","col114","col115","col116","col117","col118","col119","col120","col121","col122","col123","col124","col125","col126","col127","col128","col129","col130","col131","col132","col133","col134","col135","col136","col137","col138","col139","col140","col141","col142","col143","col144","col145","col146","col147","col148","col149","col150","col151","col152","col153","col154","col155","col156","col157","col158","col159","col160","col161","col162","col163","col164","col165","col166","col167","col168","col169","col170","col171","col172","col173","col174","col175","col176","col177","col178","col179","col180","col181","col182","col183","col184","col185","col186","col187","col188","col189","col190","col191","col192","col193","col194","col195","col196","col197","col198","col199","col200","col201","col202","col203","col204","col205","col206","col207","col208","col209","col211","col212","col213"] LABEL = "col214" def get_input_fn(data_set, num_epochs=None, shuffle=True): return tf.estimator.inputs.pandas_input_fn( x=pd.DataFrame({k: data_set[k].values for k in FEATURES}), y=pd.Series(data_set[LABEL].values), num_epochs=num_epochs, shuffle=shuffle) def get_mae(y_pre, y_target): absError = [] for i in range(len(y_pre)): absError.append(abs(y_pre[i] - y_target[i])) return sum(absError) / len(absError) def get_mse(y_pre, y_target): squaredError = [] for i in range(len(y_pre)): val = y_pre[i] - y_target[i] squaredError.append(val * val) return sum(squaredError) / len (squaredError) training_set = pd.read_csv("train.csv", skipinitialspace=True, skiprows=1, names=COLUMNS) test_set = pd.read_csv("test.csv", skipinitialspace=True, skiprows=1, names=COLUMNS) predict_set = pd.read_csv("predict.csv", skipinitialspace=True, skiprows=1, names=COLUMNS) feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES] regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols, hidden_units=[250, 200, 100, 50], model_dir="./model") regressor.train(input_fn=get_input_fn(training_set), steps=8000) ev = regressor.evaluate(input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False)) loss_score = ev["loss"] print("Loss: {0:f}".format(loss_score)) predict = regressor.predict(input_fn=get_input_fn(predict_set, num_epochs=1, shuffle=False)) y_predict = predict_set[LABEL].values.tolist() print(type(y_predict)) print(y_predict) list_predict = list(predict) print(type(list_predict)) y_predicted = [] for i in range(len(list_predict)): y_predicted.append(list_predict[i]['predictions'][0]) print(y_predicted) fileObject = open('time_prediction.txt', 'w') for time in y_predicted: fileObject.write(str(time)) fileObject.write('\n') 
fileObject.close()

mae = get_mae(y_predict, y_predicted)
mse = get_mse(y_predict, y_predicted)
print("Mean Absolute Error:" + str(mae) + " Mean Squared Error:" + str(mse))
# mae = tf.metrics.mean_absolute_error(y_predict, list_predict)
# print(mea)

Here are 3 data records from the dataset; the last column is y:

2399.998,4,100,100,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,2,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,1,4,13,4,0,11,14,15,10,8,0,0,3,1,0,0,0,0,0,0,0,0,0,0,1,364,123428,1397595,16772133,56,103,16772153,22,22,11
1919.9984,2,30,30,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,0,0,12,2,0,9,14,10,9,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,17525535,34347257,1397595,5590711,16698805,103,5913257,896853,1190468,25
479.9996,2,60,60,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,0,0,12,2,0,9,14,10,9,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,17525535,34347257,1397595,5590711,16698805,103,5913257,896853,1190468,168
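As far as I know, the relu default applies only to the hidden layers; DNNRegressor's output layer is a plain linear unit with no activation, so predictions are not constrained to the range of the training targets and can go negative. If non-negative output is needed, one simple option (an assumption about your problem, not a general fix) is to post-process the collected predictions before computing the error metrics:

# Clip predictions at zero, since the target is known to be non-negative
y_predicted = [max(p, 0.0) for p in y_predicted]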
One-hot encoding Tensorflow Strings
I have a list of strings as labels for training a neural network. Now I want to convert them via one-hot encoding so that I can use them for my tensorflow network. My input list looks like this:

labels = ['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']

The requested outcome should be something like:

one_hot = [0, 1, 0, 2, 0]

What is the easiest way to do this? Any help would be much appreciated. Cheers, Andi
The desired outcome looks like LabelEncoder in sklearn, not like OneHotEncoder. In tf you would need CategoryEncoding, BUT it is a preprocessing layer which encodes integer features:

inp = layers.Input(shape=[X.shape[0]])
x0 = layers.CategoryEncoding(num_tokens=3, output_mode="multi_hot")(inp)
model = keras.Model(inputs=[inp], outputs=[x0])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[tf.keras.metrics.CategoricalCrossentropy()])
print(model.summary())

This part gets the encoding of the unique values. You can then make another branch in this model to feed in your initial vector and fit it against the labels from this reference branch (it is like joining a reference table with a fact table in a database); the result is an ensemble of the reference data and the data you need, plus the output. Pay attention to num_tokens=3 and output_mode="multi_hot" being given explicitly, AND the numbers from class_names being known before the model is used, as in feature engineering, like this (in a pd.DataFrame):

import numpy as np
import pandas as pd

d = {'transport_col': ['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']}
dataset_df = pd.DataFrame(data=d)
classes = dataset_df['transport_col'].unique().tolist()
print(f"Label classes: {classes}")
df = dataset_df['transport_col'].map(classes.index).copy()
print(df)

From the manual example (REF): encode the categorical label into an integer. Details: this stage is necessary if your classification label is represented as a string. Note: Keras expects classification labels to be integers.
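For completeness, a minimal sketch of the sklearn LabelEncoder route mentioned above, which produces exactly the requested [0, 1, 0, 2, 0] for this label list (classes are assigned in sorted order, so '"car"' -> 0, '"pedestrian"' -> 1, '"truck"' -> 2):

from sklearn.preprocessing import LabelEncoder

labels = ['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']
le = LabelEncoder()
one_hot = le.fit_transform(labels)   # integer codes, one per label
print(one_hot)                       # [0 1 0 2 0]
print(le.classes_)                   # ['"car"' '"pedestrian"' '"truck"']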
In another architecture, perhaps, you could use StringLookup:

vocab = np.array(np.unique(labels))
inp = tf.keras.Input(shape=labels.shape[0], dtype=tf.string)
x = tf.keras.layers.StringLookup(vocabulary=vocab)(inp)

But labels are usually dependent variables, as opposed to features, and shouldn't be used as Input. Everything in the keras docs is possible. Full code:

import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from tensorflow.keras import layers

X = np.array([['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']])
vocab = np.unique(X)
print(vocab)
y = np.array([[0, 1, 0, 2, 0]])

inp = layers.Input(shape=[X.shape[0]], dtype='string')
x0 = tf.keras.layers.StringLookup(vocabulary=vocab, name='finish')(inp)
model = keras.Model(inputs=[inp], outputs=[x0])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[tf.keras.metrics.categorical_crossentropy])
print(model.summary())

from tensorflow.keras import backend as K
for layerIndex, layer in enumerate(model.layers):
    print(layerIndex)
    func = K.function([model.get_layer(index=0).input], layer.output)
    layerOutput = func([X])  # input_data is a numpy array
    print(layerOutput)
    if layerIndex == 1:  # the last layer here
        scale = lambda x: x - 1
        print(scale(layerOutput))

res: [[0 1 0 2 0]]
Another possible solution for your case is layers.TextVectorization:

import numpy as np
import keras
from tensorflow.keras import layers

input_array = np.atleast_2d(np.array(['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']))
vocab = np.unique(input_array)

input_data = keras.Input(shape=(None,), dtype='string')
layer = layers.TextVectorization(max_tokens=None, standardize=None, split=None, output_mode="int", vocabulary=vocab)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)

output_dataset = model.predict(input_array)
print(output_dataset)
# starts from 2 ... probably [0, 1] somehow concerns binarization?
scale = lambda x: x - 2
print(scale(output_dataset))

result: array([[0, 1, 0, 2, 0]])