Combining CSV of different shapes into one CSV - pandas

I have CSVs of different number of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b which is an empty array does not hold the data from the previous loops.
from os import walk
import sys
import numpy as np
filenames= []
dirpath = []
filtered = []
original = []
f = []
b = np.empty([2, 2])
for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
f.extend(dirnames)
print(f)
for names in f:
print(names)
df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv', dtype =None, delimiter = ',', skip_header=1, names=True)
b = np.column_stack(df)
print(b)

Have you tried pd.concat()?
import os
import pandas as pd
# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)
cat_list=[]
for names in file_names:
df = pd.read_csv(os.path.join(root_dir, names), delimiter = ',', header=None)
cat_list.append(df)
concatted_df = pd.concat(cat_list)

Related

Exec in function write unwanted variables in workspace

Problem description:
I have written a code to load files from an folder into a function which puts tdms files into one single dataframe. After putting this code into a function problems appeared. I know the root of the problem is around defining the variables in the scope. I would like my function to only output "dataFrame". Instead the global in the exec function leads to the dataFrame_1,2,... in the workspace. How can I avoid this from happening?
My code in a function:
#%% Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, glob, sqlite3
import tkinter as tk
from tkinter import filedialog
from nptdms import TdmsFile
#%% Load data
def get_dataframe():
"""
The function takes a folder path with a path dialogue and put all
tdms-files in one dataframe.
Returns
-------
TYPE
Dataframe.
"""
# select folder to load data from
def select_folder():
root = tk.Tk()
root.attributes('-topmost',True)
root.withdraw()
print("Please select a folder with the tdms-files inside...")
folder_root = filedialog.askdirectory()
return folder_root
folder = select_folder()
os.chdir(folder)
# put data in one dataframe
i = 1
df_list = []
for path, subdirs, files in os.walk(folder):
for file in files:
if file.endswith(".tdms"):
# print(os.path.join(path, file))
os.chdir(path)
exec("global tdms_file%d; tdms_file%d = TdmsFile.read(file)"
% (i,i))
exec("tdms_file%d.close()" % (i))
exec("global dataFrame_%d; global tdms_file%d; \
dataFrame_%d = tdms_file%d.\
as_dataframe(time_index=True)" % (i,i,i,i))
exec("global tdms_file%d; del tdms_file%d" % (i,i))
df_list.append("dataFrame_%d" % (i))
i += 1
dataFrame = pd.concat([eval(element) for element in df_list], axis=1)
Burst_name = ["Burst {0}".format(i) for i in range(dataFrame.shape[1])]
dataFrame.columns = Burst_name
return dataFrame
dataFrame = get_dataframe()
Outside of the function this part works fine:
# put data in one dataframe
i = 1
df_list = []
for path, subdirs, files in os.walk(folder):
for file in files:
if file.endswith(".tdms"):
# print(os.path.join(path, file))
os.chdir(path)
exec("tdms_file%d = TdmsFile.read(file)" % (i))
exec("tdms_file%d.close()" % (i))
exec("dataFrame_%d = tdms_file%d.as_dataframe(time_index=True)" % (i,i))
exec("del tdms_file%d" % (i,i))
df_list.append("dataFrame_%d" % (i))
i += 1

How to use multiple CSV files for machine learning anomaly detection

I have a question about how to get my data in a shape that I can use for my ML model. I have multiple CSV files that I want to fit in an algorithm for anomaly detection. My data consists of many files with each being the recorded data from a sensor with two features (intensity and depth) and one timestamp per data point. Each file is labeled with 0 = faulty data and 1 = good data.
Let's say I have 20 files: y should be the label per file y = [[1], [0], ...] and X should be all the data from the sensor X = [[data_file0], [data_file1], ..., [data_file19]] that I can use to train my models. What can I do to get my data in the right format? I tried appending the data frame of every file to a list and transformed it to a dataset and a np.array and so on. I tried different shapes too.
all_files = glob.glob(path + "/*.txt")
df_list = []
snr_list = []
for filename in all_files:
#Für jede Datei wird ein df angelegt und unwichtige features entfernt
#try with dataset with filename and all_files
dataset = tf.data.Dataset.from_tensor_slices(all_files)
def parse_fn(filename):
return tf.data.Dataset.range(10)
dataset = dataset.interleave(lambda x:
tf.data.TextLineDataset(x).map(parse_fn, num_parallel_calls=1),
cycle_length=4, block_length=16)
#try df_list
df = pd.read_csv(filename, index_col=0, header=0, decimal = '.', delimiter = ';')
df.drop(columns=['ET_original', 'Auslenkung_ET', 'ET_unkorrigiert'], axis = 1, inplace = True)
#Zu jedem Zeitpunkt wird der Startzeitpunkt abgezogen: t0 = 1 ... tn = t_n - t0
starttime = df.Zeit_ET[0]
for row in df.itertuples():
df.at[row.Index, 'Zeit_ET'] = df.Zeit_ET[row.Index] - starttime
df.Zeit_ET[0] = 1
#alle arrays einer List hinzufügen
df_list.append(df.to_numpy().reshape(-1, 1700, 3))
#other testings
#test = tf.constant(pd.DataFrame(dic, columns=['1', '1', ' 1']))
#ps=pd.DataFrame(dic, index=['dsf'])
#df_list, test_df (1 df), und tf_const (1 df) zurückgeben
return df_list, df.to_numpy().reshape(-1, 1700, 3), tf.constant(df.to_numpy().reshape(1, 1700, 3), dtype = tf.float32)
#nur für Testzwecke
df_list, test_df, tf_const = Alle_OCT_txt_Daten()
It sounds like the files are the same, but each has a distinct time stamp, right. Juts load everything into a dataframe and run your AI or ML algo on the dataframe.
# import necessary libraries
import pandas as pd
import os
import glob
# use glob to get all the csv files
# in the folder
path = 'C:\\your_path_here\\'
csv_files = glob.glob(os.path.join(path, "*.csv"))
li = []
for filename in csv_files:
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
print(frame)

Concatenate CSVs into XLSX files based on filename (pandas)

I have a bunch of CSVs with names '<3-letter-string> YYYY.csv'. There are four different versions of <3-letter-string>, and I want to sort the csvs into four xlsxs, each identified by that three letter string.
My code:
import pandas as pd
import os
full_df = pd.DataFrame()
for filename in os.listdir('C:/Users/XXXXXX/ZZZZZZ'):
if filename.endswith(".csv"):
print(filename)
df = pd.read_csv(filename, skiprows=1, names=['ID','Units Sold','Retail Dollars'])
df['Year'] = filename[-8:-4]
full_df = pd.concat([full_df, df])
full_df.to_excel(filename[0:3] + '.xlsx', index=False)
This makes four different xlsxs, which is what I want, but they're all a mixture of the different csvs.
How do I tell pandas to group them into four separate xlsxs according to the filename? My initial thought is to include filename slicing in the penultimate line and create four different concatenated full_df dataframes to write separately, but I'm not sure how.
import pandas as pd
import os
def Get_Yo_Fantasy_Hennnnnyyyyy():
full_df = pd.DataFrame()
for filename in os.listdir("path"):
if filename.endswith(".csv"):
print(filename)
df = pd.read_csv(
filename,
skiprows=1,
names=["ID", "Units Sold", "Retail Dollars"])
df["Year"] = filename[-8:-4]
df["Type"] = filename[0:3]
full_df = pd.concat([full_df, df])
for i in list(full_df.Type.unique()):
full_df[full_df.Type.str.contains(i)].to_excel(
"{}".format(i) + ".xlsx", index=False)
Get_Yo_Fantasy_Hennnnnyyyyy()

How to read multiple files and save a single output with pandas use command line arguments

I have multiple files in TXT format how to get all the values ​​with a single output Merge values ​​into a single file use command line arguments in pandas
like this:
python3 file1.txt file2.txt file3.txt
Code:
import pandas as pd
import socket, struct
import os
import glob
import sys
try:
file = sys.argv[1]
except Exception:
print("Usage: python3 {} [file]".format(sys.argv[0]))
sys.exit()
os.chdir('/Users/roc/Desktop/js/projj')
fileList = glob.glob('*.txt')
appended_data = []
for file in fileList:
pdd = pd.read_csv(file,header=None,sep='|',error_bad_lines=False, warn_bad_lines=False,skiprows=[0],names=['Name','Code','Ipv', 'Ip','Range','Date', 'Category'],low_memory=False)
df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)
pd.options.mode.chained_assignment = None
def ip2int(ip):
packedIP = socket.inet_aton(ip)
return struct.unpack("!L", packedIP)[0]
df['Ip'] = df.Ip.apply(ip2int)
df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df =df2.sort_values('Range', ascending=True)
print(result_df.to_csv("/Users/roc/Desktop/js/projj/delegated2.txt",sep=' ', index=False, header=False))
Use the below to iterate through a folder and append all files to a single dataframe
import os
import glob
os.chdir('C:\\path_to_folder\\')
Filelist = glob.glob('*.txt')
appended_data = []
for file in FileList:
pdd = pd.read_csv(file,header=None,sep='|',error_bad_lines=False, warn_bad_lines=False,skiprows=[0],names=['Name','Code','Ipv', 'Ip','Range','Date', 'Category'],low_memory=False)
df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)
Once you have the df which is combined of all the data from all files, use the next part of the code:
pd.options.mode.chained_assignment = None
def ip2int(ip):
packedIP = socket.inet_aton(ip)
return struct.unpack("!L", packedIP)[0]
df['Ip'] = df.Ip.apply(ip2int) df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df =df2.sort_values('Range', ascending=True)
result_df.to_csv("/Users/roc/Desktop/output.txt",sep=' ', index=False, header=False)

Jupyter notebook display two pandas tables side by side

I have two pandas dataframes and I would like to display them in Jupyter notebook.
Doing something like:
display(df1)
display(df2)
Shows them one below another:
I would like to have a second dataframe on the right of the first one. There is a similar question, but it looks like there a person is satisfied either with merging them in one dataframe of showing the difference between them.
This will not work for me. In my case dataframes can represent completely different (non-comparable elements) and the size of them can be different. Thus my main goal is to save space.
I have ended up writing a function that can do this:
[update: added titles based on suggestions (thnx #Antony_Hatchkins et al.)]
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
html_str=''
for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
html_str+='<th style="text-align:center"><td style="vertical-align:top">'
html_str+=f'<h2 style="text-align: center;">{title}</h2>'
html_str+=df.to_html().replace('table','table style="display:inline"')
html_str+='</td></th>'
display_html(html_str,raw=True)
Example usage:
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['A','B','C','D',])
df2 = pd.DataFrame(np.arange(16).reshape((4,4)),columns=['A','B','C','D',])
display_side_by_side(df1,df2,df1, titles=['Foo','Foo Bar']) #we left 3rd empty...
You could override the CSS of the output code. It uses flex-direction: column by default. Try changing it to row instead. Here's an example:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
CSS = """
.output {
flex-direction: row;
}
"""
HTML('<style>{}</style>'.format(CSS))
You could, of course, customize the CSS further as you wish.
If you wish to target only one cell's output, try using the :nth-child() selector. For example, this code will modify the CSS of the output of only the 5th cell in the notebook:
CSS = """
div.cell:nth-child(5) .output {
flex-direction: row;
}
"""
Starting from pandas 0.17.1 the visualization of DataFrames can be directly modified with pandas styling methods
To display two DataFrames side by side you must use set_table_attributes with the argument "style='display:inline'" as suggested in ntg answer. This will return two Styler objects. To display the aligned dataframes just pass their joined HTML representation through the display_html method from IPython.
With this method is also easier to add other styling options. Here's how to add a caption, as requested here:
import numpy as np
import pandas as pd
from IPython.display import display_html
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['A','B','C','D',])
df2 = pd.DataFrame(np.arange(16).reshape((4,4)),columns=['A','B','C','D',])
df1_styler = df1.style.set_table_attributes("style='display:inline'").set_caption('Caption table 1')
df2_styler = df2.style.set_table_attributes("style='display:inline'").set_caption('Caption table 2')
display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)
Combining approaches of gibbone (to set styles and captions) and stevi (adding space) I made my version of function, which outputs pandas dataframes as tables side-by-side:
from IPython.core.display import display, HTML
def display_side_by_side(dfs:list, captions:list):
"""Display tables side by side to save vertical space
Input:
dfs: list of pandas.DataFrame
captions: list of table captions
"""
output = ""
combined = dict(zip(captions, dfs))
for caption, df in combined.items():
output += df.style.set_table_attributes("style='display:inline'").set_caption(caption)._repr_html_()
output += "\xa0\xa0\xa0"
display(HTML(output))
Usage:
display_side_by_side([df1, df2, df3], ['caption1', 'caption2', 'caption3'])
Output:
My solution just builds a table in HTML without any CSS hacks and outputs it:
import pandas as pd
from IPython.display import display,HTML
def multi_column_df_display(list_dfs, cols=3):
html_table = "<table style='width:100%; border:0px'>{content}</table>"
html_row = "<tr style='border:0px'>{content}</tr>"
html_cell = "<td style='width:{width}%;vertical-align:top;border:0px'>{{content}}</td>"
html_cell = html_cell.format(width=100/cols)
cells = [ html_cell.format(content=df.to_html()) for df in list_dfs ]
cells += (cols - (len(list_dfs)%cols)) * [html_cell.format(content="")] # pad
rows = [ html_row.format(content="".join(cells[i:i+cols])) for i in range(0,len(cells),cols)]
display(HTML(html_table.format(content="".join(rows))))
list_dfs = []
list_dfs.append( pd.DataFrame(2*[{"x":"hello"}]) )
list_dfs.append( pd.DataFrame(2*[{"x":"world"}]) )
multi_column_df_display(2*list_dfs)
Here's another variation of the display_side_by_side() function introduced by #Anton Golubev that combines gibbone (to set styles and captions) and stevi (adding space), I added an extra argument to change spacing between tables at run-time.
from IPython.core.display import display, HTML
def display_side_by_side(dfs:list, captions:list, tablespacing=5):
"""Display tables side by side to save vertical space
Input:
dfs: list of pandas.DataFrame
captions: list of table captions
"""
output = ""
for (caption, df) in zip(captions, dfs):
output += df.style.set_table_attributes("style='display:inline'").set_caption(caption)._repr_html_()
output += tablespacing * "\xa0"
display(HTML(output))
display_side_by_side([df1, df2, df3], ['caption1', 'caption2', 'caption3'])
The tablespacing=5 default argument value (shown = 5 here) determines the vertical spacing between tables.
This adds (optional) headers, index and Series support to #nts's answer:
from IPython.display import display_html
def mydisplay(dfs, names=[], index=False):
def to_df(x):
if isinstance(x, pd.Series):
return pd.DataFrame(x)
else:
return x
html_str = ''
if names:
html_str += ('<tr>' +
''.join(f'<td style="text-align:center">{name}</td>' for name in names) +
'</tr>')
html_str += ('<tr>' +
''.join(f'<td style="vertical-align:top"> {to_df(df).to_html(index=index)}</td>'
for df in dfs) +
'</tr>')
html_str = f'<table>{html_str}</table>'
html_str = html_str.replace('table','table style="display:inline"')
display_html(html_str, raw=True)
Here is Jake Vanderplas' solution I came across just the other day:
import numpy as np
import pandas as pd
class display(object):
"""Display HTML representation of multiple objects"""
template = """<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
</div>"""
def __init__(self, *args):
self.args = args
def _repr_html_(self):
return '\n'.join(self.template.format(a, eval(a)._repr_html_())
for a in self.args)
def __repr__(self):
return '\n\n'.join(a + '\n' + repr(eval(a))
for a in self.args)
Credit: https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.08-Aggregation-and-Grouping.ipynb
#zarak code is pretty small but affects the layout of the whole notebook. Other options are a bit messy for me.
I've added some clear CSS to this answer affecting only current cell output. Also you are able to add anything below or above dataframes.
from ipywidgets import widgets, Layout
from IPython import display
import pandas as pd
import numpy as np
# sample data
df1 = pd.DataFrame(np.random.randn(8, 3))
df2 = pd.DataFrame(np.random.randn(8, 3))
# create output widgets
widget1 = widgets.Output()
widget2 = widgets.Output()
# render in output widgets
with widget1:
display.display(df1.style.set_caption('First dataframe'))
df1.info()
with widget2:
display.display(df2.style.set_caption('Second dataframe'))
df1.info()
# add some CSS styles to distribute free space
box_layout = Layout(display='flex',
flex_flow='row',
justify_content='space-around',
width='auto'
)
# create Horisontal Box container
hbox = widgets.HBox([widget1, widget2], layout=box_layout)
# render hbox
hbox
I ended up using HBOX
import ipywidgets as ipyw
def get_html_table(target_df, title):
df_style = target_df.style.set_table_attributes("style='border:2px solid;font-size:10px;margin:10px'").set_caption(title)
return df_style._repr_html_()
df_2_html_table = get_html_table(df_2, 'Data from Google Sheet')
df_4_html_table = get_html_table(df_4, 'Data from Jira')
ipyw.HBox((ipyw.HTML(df_2_html_table),ipyw.HTML(df_4_html_table)))
Gibbone's answer worked for me! If you want extra space between the tables go to the code he proposed and add this "\xa0\xa0\xa0" to the following code line.
display_html(df1_styler._repr_html_()+"\xa0\xa0\xa0"+df2_styler._repr_html_(), raw=True)
I decided to add some extra functionality to Yasin's elegant answer, where one can choose both the number of cols and rows; any extra dfs are then added to the bottom.
Additionally one can choose in which order to fill the grid (simply change fill keyword to 'cols' or 'rows' as needed)
import pandas as pd
from IPython.display import display,HTML
def grid_df_display(list_dfs, rows = 2, cols=3, fill = 'cols'):
html_table = "<table style='width:100%; border:0px'>{content}</table>"
html_row = "<tr style='border:0px'>{content}</tr>"
html_cell = "<td style='width:{width}%;vertical-align:top;border:0px'>{{content}}</td>"
html_cell = html_cell.format(width=100/cols)
cells = [ html_cell.format(content=df.to_html()) for df in list_dfs[:rows*cols] ]
cells += cols * [html_cell.format(content="")] # pad
if fill == 'rows': #fill in rows first (first row: 0,1,2,... col-1)
grid = [ html_row.format(content="".join(cells[i:i+cols])) for i in range(0,rows*cols,cols)]
if fill == 'cols': #fill columns first (first column: 0,1,2,..., rows-1)
grid = [ html_row.format(content="".join(cells[i:rows*cols:rows])) for i in range(0,rows)]
display(HTML(html_table.format(content="".join(grid))))
#add extra dfs to bottom
[display(list_dfs[i]) for i in range(rows*cols,len(list_dfs))]
list_dfs = []
list_dfs.extend((pd.DataFrame(2*[{"x":"hello"}]),
pd.DataFrame(2*[{"x":"world"}]),
pd.DataFrame(2*[{"x":"gdbye"}])))
grid_df_display(3*list_dfs)
test output
Extension of antony's answer If you want to limit de visualization of tables to some numer of blocks by row, use the maxTables variable.
def mydisplay(dfs, names=[]):
count = 0
maxTables = 6
if not names:
names = [x for x in range(len(dfs))]
html_str = ''
html_th = ''
html_td = ''
for df, name in zip(dfs, names):
if count <= (maxTables):
html_th += (''.join(f'<th style="text-align:center">{name}</th>'))
html_td += (''.join(f'<td style="vertical-align:top"> {df.to_html(index=False)}</td>'))
count += 1
else:
html_str += f'<tr>{html_th}</tr><tr>{html_td}</tr>'
html_th = f'<th style="text-align:center">{name}</th>'
html_td = f'<td style="vertical-align:top"> {df.to_html(index=False)}</td>'
count = 0
if count != 0:
html_str += f'<tr>{html_th}</tr><tr>{html_td}</tr>'
html_str += f'<table>{html_str}</table>'
html_str = html_str.replace('table','table style="display:inline"')
display_html(html_str, raw=True)