How to use multiple CSV files for machine learning anomaly detection - tensorflow

I have a question about how to get my data in a shape that I can use for my ML model. I have multiple CSV files that I want to fit in an algorithm for anomaly detection. My data consists of many files with each being the recorded data from a sensor with two features (intensity and depth) and one timestamp per data point. Each file is labeled with 0 = faulty data and 1 = good data.
Let's say I have 20 files: y should be the label per file, y = [[1], [0], ...], and X should be all the data from the sensors, X = [[data_file0], [data_file1], ..., [data_file19]], that I can use to train my models. What can I do to get my data into the right format? I tried appending the data frame of every file to a list and transforming that into a dataset, an np.array, and so on; I also tried different shapes.
import glob
import pandas as pd
import tensorflow as tf

def Alle_OCT_txt_Daten():
    all_files = glob.glob(path + "/*.txt")
    df_list = []
    snr_list = []
    for filename in all_files:
        # For each file a df is created and the unimportant features are removed
        # attempt with a dataset built from filename and all_files
        dataset = tf.data.Dataset.from_tensor_slices(all_files)
        def parse_fn(filename):
            return tf.data.Dataset.range(10)
        dataset = dataset.interleave(lambda x:
            tf.data.TextLineDataset(x).map(parse_fn, num_parallel_calls=1),
            cycle_length=4, block_length=16)
        # attempt with df_list
        df = pd.read_csv(filename, index_col=0, header=0, decimal='.', delimiter=';')
        df.drop(columns=['ET_original', 'Auslenkung_ET', 'ET_unkorrigiert'], axis=1, inplace=True)
        # Subtract the start time from every timestamp: t0 = 1 ... tn = t_n - t0
        starttime = df.Zeit_ET[0]
        for row in df.itertuples():
            df.at[row.Index, 'Zeit_ET'] = df.Zeit_ET[row.Index] - starttime
        df.Zeit_ET[0] = 1
        # append each file's array to the list
        df_list.append(df.to_numpy().reshape(-1, 1700, 3))
        # other attempts
        #test = tf.constant(pd.DataFrame(dic, columns=['1', '1', ' 1']))
        #ps = pd.DataFrame(dic, index=['dsf'])
    # return df_list, test_df (one df) and tf_const (one df)
    return df_list, df.to_numpy().reshape(-1, 1700, 3), tf.constant(df.to_numpy().reshape(1, 1700, 3), dtype=tf.float32)

# for testing purposes only
df_list, test_df, tf_const = Alle_OCT_txt_Daten()

It sounds like the files have the same structure, but each has its own timestamps, right? Just load everything into one dataframe and run your ML algorithm on that dataframe.
# import necessary libraries
import pandas as pd
import os
import glob
# use glob to get all the csv files
# in the folder
path = 'C:\\your_path_here\\'
csv_files = glob.glob(os.path.join(path, "*.csv"))
li = []
for filename in csv_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
print(frame)
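If you also need one label per file (the y = [[1], [0], ...] from the original question), one option is to keep one array per file instead of concatenating everything into a single frame. A minimal sketch, assuming each CSV is one recording and that the 0/1 label comes from some lookup you already have (the labels_per_file dict below is just a hypothetical stand-in):
import glob
import os
import numpy as np
import pandas as pd

path = 'C:\\your_path_here\\'
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# hypothetical mapping: file name -> 1 (good data) or 0 (faulty data)
labels_per_file = {os.path.basename(f): 1 for f in csv_files}

X_list, y_list = [], []
for filename in csv_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    X_list.append(df.to_numpy())  # one 2D array (timesteps x features) per file
    y_list.append(labels_per_file[os.path.basename(filename)])

y = np.array(y_list)
# If every recording has the same number of rows, the list can be stacked into one
# array of shape (n_files, n_timesteps, n_features); otherwise keep it as a list
# (or pad/truncate the recordings to a common length first).
try:
    X = np.stack(X_list)
except ValueError:
    X = X_list  # recordings have different lengths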

Related

Tensorflow: How to zip together multiple concatenated datasets?

I am attempting to zip together multiple tf.data.Dataset objects for a multi-input keras model. Each tf.data.Dataset object is a concatenation of multiple dataframes, each with the same number of columns but not necessarily the same number of rows.
I am able to create the full dataset, but when I try to pass it to a keras model I get an error:
TypeError: Inputs to a layer should be tensors. Got: <tensorflow.python.data.ops.dataset_ops._NestedVariant object
The problem is that I would really like to take advantage of the lazy structure of the tf.data.Dataset since I am using the window function, but I am having difficulty aggregating all the datasets together.
How do I zip together multiple datasets so that they can be passed into the model.fit() function?
Any help would be appreciated.
Here is a simple functional code that recreates my problem :
import pandas as pd
import numpy as np
import tensorflow as tf
# Create dataframes with 4 features and target
dataframe1 = pd.DataFrame(np.random.randn(1000, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
dataframe2 = pd.DataFrame(np.random.randn(800, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
# Convert dataframes to datasets
def get_dataset(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].iloc[4:].to_numpy())
    return dataset
def get_dataset_windowed(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].to_numpy()).window(5, shift=1, stride=1,
                                                                                         drop_remainder=True)
    return dataset
windowed_dataset = [get_dataset_windowed(x, ["feature3", "feature4"]) for x in [dataframe1, dataframe2]]
windowed_dataset = tf.data.Dataset.from_tensor_slices(windowed_dataset)
windowed_dataset = windowed_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
static_dataset = [get_dataset(x, ["feature1", "feature2"]) for x in [dataframe1, dataframe2]]
static_dataset = tf.data.Dataset.from_tensor_slices(static_dataset)
static_dataset = static_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
targets = [get_dataset(x, ["target"]) for x in [dataframe1, dataframe2]]
targets = tf.data.Dataset.from_tensor_slices(targets)
targets = targets.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
# Zip datasets together
full_dataset = tf.data.Dataset.zip(
    (
        {
            "short_term_ts_input": windowed_dataset,
            "static_input": static_dataset,
        },
        {
            "output": targets,
        }
    )
)
full_dataset = full_dataset.shuffle(buffer_size=1024).batch(128)
# Creating, compiling and fitting model
short_term_ts_input = tf.keras.Input(shape=(5, 2), name="short_term_ts_input")
static_input = tf.keras.Input(shape=(2), name="static_input")
targets = tf.keras.Input(shape=(1,), name="output")
short_term_ts_features = tf.keras.layers.LSTM(32, return_sequences=False)(short_term_ts_input)
short_term_ts_features = tf.keras.layers.Dense(8)(short_term_ts_features)
static_features = tf.keras.layers.Dense(16)(static_input)
x_concat = tf.keras.layers.concatenate([short_term_ts_features, static_features])
x_concat = tf.keras.layers.Dense(32)(x_concat)
output = tf.keras.layers.Dense(1)(x_concat)
model = tf.keras.Model(inputs=[short_term_ts_input, static_input], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
tf.keras.utils.plot_model(model, "model_test.png", show_shapes=True)
model.fit(full_dataset)
Maybe something like this:
import pandas as pd
import numpy as np
import tensorflow as tf
# Create dataframes with 4 features and target
dataframe1 = pd.DataFrame(np.random.randn(1000, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
dataframe2 = pd.DataFrame(np.random.randn(800, 5), columns=["feature1", "feature2", "feature3", "feature4", "target"])
# Convert dataframes to datasets
def get_dataset(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].iloc[4:].to_numpy())
    return dataset
def get_dataset_windowed(df: pd.DataFrame, features):
    dataset = tf.data.Dataset.from_tensor_slices(df.loc[:, features].to_numpy()).window(5, shift=1, stride=1,
                                                                                         drop_remainder=True)
    return dataset
windowed_dataset = [get_dataset_windowed(x, ["feature3", "feature4"]) for x in [dataframe1, dataframe2]]
windowed_dataset = tf.data.Dataset.from_tensor_slices(windowed_dataset)
windowed_dataset = windowed_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE).flat_map(lambda z: z.batch(5))
static_dataset = [get_dataset(x, ["feature1", "feature2"]) for x in [dataframe1, dataframe2]]
static_dataset = tf.data.Dataset.from_tensor_slices(static_dataset)
static_dataset = static_dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
targets = [get_dataset(x, ["target"]) for x in [dataframe1, dataframe2]]
targets = tf.data.Dataset.from_tensor_slices(targets)
targets = targets.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)
# Zip datasets together
full_dataset = tf.data.Dataset.zip(
    (
        {
            "short_term_ts_input": windowed_dataset,
            "static_input": static_dataset,
        },
    )
)
full_dataset = tf.data.Dataset.zip((full_dataset, targets))
full_dataset = full_dataset.shuffle(buffer_size=1024).batch(128)
# Creating, compiling and fitting model
short_term_ts_input = tf.keras.Input(shape=(5, 2), name="short_term_ts_input")
static_input = tf.keras.Input(shape=(2), name="static_input")
short_term_ts_features = tf.keras.layers.LSTM(32, return_sequences=False)(short_term_ts_input)
short_term_ts_features = tf.keras.layers.Dense(8)(short_term_ts_features)
static_features = tf.keras.layers.Dense(16)(static_input)
x_concat = tf.keras.layers.concatenate([short_term_ts_features, static_features])
x_concat = tf.keras.layers.Dense(32)(x_concat)
output = tf.keras.layers.Dense(1)(x_concat)
model = tf.keras.Model(inputs=[short_term_ts_input, static_input], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')
tf.keras.utils.plot_model(model, "model_test.png", show_shapes=True)
model.fit(full_dataset)
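The important difference from the code in the question is the .flat_map(lambda z: z.batch(5)) appended to the windowed dataset: .window() yields a dataset whose elements are themselves sub-datasets, which is why the error mentions a _NestedVariant object, and batching each window by its own size turns it back into plain tensors that model.fit() can consume. A standalone sketch of just that pattern, assuming a window size of 5:
import tensorflow as tf

# .window() produces a dataset whose elements are sub-datasets, not tensors
ds = tf.data.Dataset.range(8).window(5, shift=1, drop_remainder=True)

# flat_map + batch(window_size) converts each window into a single tensor of shape (5,)
ds = ds.flat_map(lambda w: w.batch(5))

for elem in ds:
    print(elem.numpy())  # [0 1 2 3 4], [1 2 3 4 5], [2 3 4 5 6], [3 4 5 6 7]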

Combining CSV of different shapes into one CSV

I have CSVs with different numbers of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b, which starts as an empty array, does not accumulate the data from the previous loop iterations.
from os import walk
import sys
import numpy as np
filenames= []
dirpath = []
filtered = []
original = []
f = []
b = np.empty([2, 2])
for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
    f.extend(dirnames)
print(f)
for names in f:
    print(names)
    df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv', dtype=None, delimiter=',', skip_header=1, names=True)
    b = np.column_stack(df)
    print(b)
Have you tried pd.concat()?
import os
import pandas as pd
# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)
cat_list=[]
for names in file_names:
    df = pd.read_csv(os.path.join(root_dir, names), delimiter=',', header=None)
    cat_list.append(df)
concatted_df = pd.concat(cat_list)
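One thing worth knowing for CSVs with different numbers of columns: pd.concat aligns on column labels and fills the missing positions with NaN, which may or may not be the "aligned by the first column" behavior you need. A small illustration with made-up frames:
import pandas as pd

df_a = pd.DataFrame([[1, 2, 3]], columns=[0, 1, 2])  # 3 columns
df_b = pd.DataFrame([[4, 5]], columns=[0, 1])        # 2 columns

stacked = pd.concat([df_a, df_b], ignore_index=True)
print(stacked)
#    0  1    2
# 0  1  2  3.0
# 1  4  5  NaN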

Passing pandas NumPy arrays as feature vectors in scikit learn?

I have a vector of 5 different values that I use as my sample value, and the label is a single integer of 0, 1, or 3. The machine learning algorithms work when I pass an array as a sample, but I get this warning. How do I pass feature vectors without getting this warning?
import numpy as np
from numpy import random
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import pandas as pd
filepath = 'test.csv'
# example label values
index = [0,1,3,1,1,1,0,0]
# example sample arrays
data = []
for i in range(len(index)):
    d = []
    for i in range(6):
        d.append(random.randint(50, 200))
    data.append(d)
feat1 = 'brightness'
feat2, feat3, feat4 = ['h', 's', 'v']
feat5 = 'median hue'
feat6 = 'median value'
features = [feat1, feat2, feat3, feat4, feat5, feat6]
df = pd.DataFrame(data, columns=features, index=index)
df.index.name = 'state'
with open(filepath, 'a') as f:
    df.to_csv(f, header=f.tell() == 0)
states = pd.read_csv(filepath, usecols=['state'])
df_partial = pd.read_csv(filepath, usecols=features)
states = states.astype(np.float32)
states = states.values
labels = states
samples = np.array([])
for i, row in df_partial.iterrows():
    r = row.values
    samples = np.vstack((samples, r)) if samples.size else r
n_neighbors = 5
test_size = .3
labels, test_labels, samples, test_samples = train_test_split(labels, samples, test_size=test_size)
clf1 = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf1 = clf1.fit(samples, labels)
score1 = clf1.score(test_samples, test_labels)
print("Here's how the models performed \nknn: %d %%" %(score1 * 100))
Warning:
"DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). clf1 = clf1.fit(samples, labels)"
sklearn documentation for fit(self, X, Y)
Try replacing
states = states.values by states = states.values.flatten()
OR
clf1 = clf1.fit(samples, labels) by clf1 = clf1.fit(samples, labels.flatten()).
states = states.values holds the correct labels that were stored in your pandas dataframe, but they are stored as a column vector (one label per row). Using .flatten() puts all those labels into a single 1D array. (https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.ndarray.flatten.html)
In Sklearn's KNeighborsClassifier documentation
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), they show in their example that the labels must be stored on the same row: y = [0, 0, 1, 1].
When you retrieve the data from the dataframe states, it is stored as a column vector (one value per row), whereas the classifier expects the labels in a single 1D array.
You can also use the ravel() function, which creates a contiguous flattened array.
numpy.ravel(array, order='C') returns a contiguous flattened array (a 1D array containing all the input-array elements, with the same type as the input).
Try:
states = states.values.ravel() in place of states = states.values
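Purely as an illustration of the shape difference (the label values below are made up):
import numpy as np

labels = np.array([[0], [1], [3], [1]])  # column vector, shape (4, 1)
print(labels.shape)                      # (4, 1) -> triggers the DataConversionWarning
print(labels.ravel().shape)              # (4,)   -> what fit() expects for y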

OneHotEncoding mapping issue between training data and test data

I've transformed the training and test data sets with sklearn's OneHotEncoder. However, the transformed results have different shapes, so it is impossible to apply other algorithms such as logistic regression.
How do I reshape the test data in accordance with the training data set's shape?
Best regards, Chris
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
def data_transformation(data, dummy):
    le = LabelEncoder()
    # Encoding the columns with multiple categorical levels
    for col1 in dummy:
        le.fit(data[col1])
        data[col1] = le.transform(data[col1])
    dummy_data = np.array(data[dummy])
    enc = OneHotEncoder()
    enc.fit(dummy_data)
    dummy_data = enc.transform(dummy_data).toarray()

if __name__ == '__main__':
    data = pd.read_csv('train.data', delimiter=',')
    data_test = pd.read_csv('test.data', delimiter=',')
    dummy_columns = ['Column1', 'Column2']
    data = data_transformation(data, dummy_columns)
    data_test = data_transformation(data_test, dummy_columns)
    # result
    # data shape : (200000, 71)
    # data_test shape : (15000, 32)
Thank you so much, Vivek! I've solved this issue due to your help.
def data_transformation2(data, data_test, dummy):
    le = LabelEncoder()
    # Encoding the columns with multiple categorical levels
    for col in dummy:
        le.fit(data[col])
        data[col] = le.transform(data[col])
    for col in dummy:
        le.fit(data_test[col])
        data_test[col] = le.transform(data_test[col])
    enc = OneHotEncoder()
    dummy_data = np.array(data[dummy])
    dummy_data_test = np.array(data_test[dummy])
    enc.fit(dummy_data)
    dummy_data = enc.transform(dummy_data).toarray()
    dummy_data_test = enc.transform(dummy_data_test).toarray()
    print(dummy_data.shape)
    print(dummy_data_test.shape)
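A further note, offered as a suggestion rather than part of the accepted fix: fitting the LabelEncoder separately on the test set can still misalign encodings if the test set contains categories that never appear in the training set. Fitting the encoder on the training data only and using handle_unknown='ignore' keeps the output widths identical. A minimal sketch, assuming scikit-learn >= 0.20 (where OneHotEncoder accepts string columns directly, so the LabelEncoder step can be skipped) and made-up column values:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# made-up frames; 'Column1'/'Column2' mirror the dummy columns above
train = pd.DataFrame({'Column1': ['a', 'b', 'a'], 'Column2': ['x', 'y', 'x']})
test = pd.DataFrame({'Column1': ['b', 'c'], 'Column2': ['y', 'x']})  # 'c' unseen in train

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train[['Column1', 'Column2']])  # fit on the training data only

train_enc = enc.transform(train[['Column1', 'Column2']]).toarray()
test_enc = enc.transform(test[['Column1', 'Column2']]).toarray()
print(train_enc.shape, test_enc.shape)  # both have the same number of columns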

Visualizing class labels in self-organizing map plot of iris dataset

I am trying to produce a visualization of the SOM mapping for the Iris dataset ( https://archive.ics.uci.edu/ml/datasets/Iris).
My code so far:
from sklearn.datasets import load_iris
from mvpa2.suite import *
import pandas as pd
import numpy as np
df = pd.read_csv(filepath_or_buffer='data/iris.data', header=None, sep=',')
df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
df.dropna(how="all", inplace=True) # drops the empty line at file-end
# split the data table into feature data x and class labels y
x = df.iloc[:, 0:4].values  # the first 4 columns are the features
y = df.iloc[:, 4].values  # the last column is the class label
t = np.zeros(len(y), dtype=int)
t[y == 'Iris-setosa'] = 0
t[y == 'Iris-versicolor'] = 1
t[y == 'Iris-virginica'] = 2
som = SimpleSOMMapper((240, 320), 100, learning_rate=0.05)
som.train(x)
pl.imshow(som.K, origin='lower')
mapped = som(x)
for i, m in enumerate(mapped):
    pl.text(m[1], m[0], t[i], ha='center', va='center',
            bbox=dict(facecolor='white', alpha=0.5, lw=0))
pl.show()
which produces this mapping:
Is there any way to customize the palette so it looks nicer, like this one (taken from https://github.com/JustGlowing/minisom)?
Basically I am trying to use a nicer palette (perhaps with fewer colors) and mark the class labels in a nicer way.
Thank you.
I will answer my own question: it turns out that I forgot to slice my data:
pl.imshow(som.K[:,:,0], origin='lower')
Everything looks fine now:
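For the palette part of the question (not covered by the self-answer above), one option is simply to pass a matplotlib colormap name to imshow and color the labels per class; a sketch reusing the som, mapped and t variables from above:
pl.imshow(som.K[:, :, 0], origin='lower', cmap='bone')  # any matplotlib colormap name works here
label_colors = ['r', 'g', 'b']  # one color per iris class
for i, m in enumerate(mapped):
    pl.text(m[1], m[0], t[i], ha='center', va='center', color=label_colors[t[i]],
            bbox=dict(facecolor='white', alpha=0.5, lw=0))
pl.show()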