Exec in function writes unwanted variables into workspace - dataframe

Problem description:
I have written code that loads all tdms files from a folder and puts them into one single dataframe. After moving this code into a function, problems appeared. I know the root of the problem is how the variables are defined in the function's scope. I would like my function to output only "dataFrame". Instead, the global statements in the exec calls leave dataFrame_1, dataFrame_2, ... in the workspace. How can I avoid this?
My code in a function:
#%% Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, glob, sqlite3
import tkinter as tk
from tkinter import filedialog
from nptdms import TdmsFile
#%% Load data
def get_dataframe():
    """
    The function takes a folder path from a path dialogue and puts all
    tdms-files in one dataframe.

    Returns
    -------
    TYPE
        Dataframe.
    """
    # select folder to load data from
    def select_folder():
        root = tk.Tk()
        root.attributes('-topmost', True)
        root.withdraw()
        print("Please select a folder with the tdms-files inside...")
        folder_root = filedialog.askdirectory()
        return folder_root

    folder = select_folder()
    os.chdir(folder)
    # put data in one dataframe
    i = 1
    df_list = []
    for path, subdirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".tdms"):
                # print(os.path.join(path, file))
                os.chdir(path)
                exec("global tdms_file%d; tdms_file%d = TdmsFile.read(file)"
                     % (i, i))
                exec("tdms_file%d.close()" % (i))
                exec("global dataFrame_%d; global tdms_file%d; "
                     "dataFrame_%d = tdms_file%d.as_dataframe(time_index=True)"
                     % (i, i, i, i))
                exec("global tdms_file%d; del tdms_file%d" % (i, i))
                df_list.append("dataFrame_%d" % (i))
                i += 1
    dataFrame = pd.concat([eval(element) for element in df_list], axis=1)
    Burst_name = ["Burst {0}".format(i) for i in range(dataFrame.shape[1])]
    dataFrame.columns = Burst_name
    return dataFrame
dataFrame = get_dataframe()
Outside of the function this part works fine:
# put data in one dataframe
i = 1
df_list = []
for path, subdirs, files in os.walk(folder):
    for file in files:
        if file.endswith(".tdms"):
            # print(os.path.join(path, file))
            os.chdir(path)
            exec("tdms_file%d = TdmsFile.read(file)" % (i))
            exec("tdms_file%d.close()" % (i))
            exec("dataFrame_%d = tdms_file%d.as_dataframe(time_index=True)" % (i, i))
            exec("del tdms_file%d" % (i))
            df_list.append("dataFrame_%d" % (i))
            i += 1
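The usual way around this is to drop exec/eval entirely and append the DataFrames themselves to a list; then no numbered names are created, inside or outside the function. A minimal sketch along the lines of the original loop (get_dataframe_no_exec is just an illustrative name, reusing the imports at the top of the question):

def get_dataframe_no_exec(folder):
    # Collect the DataFrames in a plain list instead of creating numbered globals.
    df_list = []
    for path, subdirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".tdms"):
                tdms_file = TdmsFile.read(os.path.join(path, file))
                df_list.append(tdms_file.as_dataframe(time_index=True))
    dataFrame = pd.concat(df_list, axis=1)
    dataFrame.columns = ["Burst {0}".format(n) for n in range(dataFrame.shape[1])]
    return dataFrame

Everything stays local to the function, so only the returned dataFrame is visible to the caller.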

Related

Return None or empty dataframe when reading in input that is empty in PYSPARK

So I am trying to read in a folder that may sometimes be empty.
The folder is called ABC.csv and has no csv in it.
df = spark.read.parquet("/Users/test/Downloads/ABC.csv")
How do I return None or an empty dataframe when reading it in, since it may sometimes have contents?
Sample code snippet. Please modify based on your input files.
import glob

list_of_files = glob.glob("D:/data/in/dcad_data/*.csv")
if list_of_files:
    # create dataFrame
    # df = spark.read.
    pass
else:
    df = None
print(df)
You can check whether the folder is empty by using Python like this,
import os
from pyspark.sql.types import StructType  # needed for the empty schema

# path of the directory
path = "/Users/test/Downloads/ABC.csv"
# Getting the list of entries in the directory
dir = os.listdir(path)
# Checking if the list is empty or not
if len(dir) == 0:
    df = spark.createDataFrame([], StructType([]))
else:
    df = spark.read.parquet("/Users/test/Downloads/ABC.csv")
or, if you want to check specifically whether parquet files are present in the folder, do this,
import glob
import os.path
from pyspark.sql.types import StructType  # needed for the empty schema

# path of the directory
path = "/Users/test/Downloads/ABC.csv"
parquet_files = glob.glob(os.path.join(path, '*.parquet'))
# Checking if the list is empty or not
if len(parquet_files) == 0:
    df = spark.createDataFrame([], StructType([]))
else:
    df = spark.read.parquet("/Users/test/Downloads/ABC.csv")
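Another option is to attempt the read and fall back when Spark cannot infer a schema from the empty folder; a sketch, assuming a PySpark version where the failure surfaces as an AnalysisException:

from pyspark.sql.utils import AnalysisException

try:
    df = spark.read.parquet("/Users/test/Downloads/ABC.csv")
except AnalysisException:
    # typically raised as "Unable to infer schema for Parquet" on empty input
    df = None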

How to read each txt, convert it into an image, and save them as different images?

I would like my code to read my txt files one by one, convert each into an image, and save them as different images, i.e. 300s, 600s, 900s, ....
The code below only prints the path and does not proceed to the rest of the code.
Could you give me some advice, or find the missing or mistaken parts of my code?
import numpy as np
import matplotlib.pyplot as plt
import glob
import cv2
import os

path = './Master_thesis/Code/dnn_simulation_result/'
interval = 300

folders = []
# r=root, d=directories, f=files
for r, d, f in os.walk(path):
    if not d:
        folders.append(r)
for f in folders:
    print(r)

def txt2image(folders, skiprows):
    for folder_name in folders:
        IsFile = (glob.glob(folder_name + "/*.*"))
        for file in IsFile:
            myArray = np.loadtxt(path, skiprows=skiprows)
            # Set the nodata values to nan
            myArray[myArray == -9999] = np.nan
            # PRISM data is stored as an integer but scaled by 100
            myArray *= 1
            # Plot PRISM array again
            fig, ax = plt.subplots()
            ax.set_title('Flood area')
            # Get the img object in order to pass it to the colorbar function
            img_plot = ax.imshow(myArray, cmap='jet')
            # Place a colorbar next to the map
            cbar = fig.colorbar(img_plot)
            ax.grid(True)
            plt.show()
            txt2image = cv2.imwrite('D:/Master_thesis/Code/dnn_simulation_result/dnn_simulation_result/{}.jpg', img_plot)
    return txt2image

txt2image(folders, 0)
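No answer is recorded for this one, but for orientation, here is a sketch with the apparent mistakes fixed: np.loadtxt should read file rather than the top-level path, the figure (not the AxesImage returned by imshow) is what gets saved, the function should not reassign its own name, and each output needs a distinct file name. out_dir and the 300s/600s naming are assumptions for illustration.

import glob
import os
import numpy as np
import matplotlib.pyplot as plt

def txt2image_fixed(folders, skiprows, out_dir, interval=300):
    count = 1
    for folder_name in folders:
        for file in sorted(glob.glob(os.path.join(folder_name, "*.*"))):
            myArray = np.loadtxt(file, skiprows=skiprows)  # read the file, not the root path
            # Set the nodata values to nan
            myArray[myArray == -9999] = np.nan
            fig, ax = plt.subplots()
            ax.set_title('Flood area')
            img_plot = ax.imshow(myArray, cmap='jet')
            fig.colorbar(img_plot)
            ax.grid(True)
            # one image per txt file, named 300s.jpg, 600s.jpg, 900s.jpg, ...
            fig.savefig(os.path.join(out_dir, "{}s.jpg".format(count * interval)))
            plt.close(fig)
            count += 1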

Combining CSVs of different shapes into one CSV

I have CSVs with different numbers of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b, which starts as an empty array, does not retain the data from the previous loop iterations.
from os import walk
import sys
import numpy as np

filenames = []
dirpath = []
filtered = []
original = []
f = []
b = np.empty([2, 2])

for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
    f.extend(dirnames)
print(f)
for names in f:
    print(names)
    df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv', dtype=None, delimiter=',', skip_header=1, names=True)
    b = np.column_stack(df)
    print(b)
Have you tried pd.concat()?
import os
import pandas as pd

# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)

cat_list = []
for names in file_names:
    df = pd.read_csv(os.path.join(root_dir, names), delimiter=',', header=None)
    cat_list.append(df)
concatted_df = pd.concat(cat_list)
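Since the question asks for the stacked data to be aligned by the first column, one option (an assumption about the intended layout) is to make that column the index before concatenating, so pandas lines the remaining columns up by name:

# Column 0 exists as a name because the files were read with header=None.
aligned = pd.concat([df.set_index(0) for df in cat_list])
aligned.to_csv(os.path.join(root_dir, "combined.csv"))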

Import a growing list() of csv files only to append after importing [duplicate]

This question already has answers here:
Import multiple CSV files into pandas and concatenate into one DataFrame
(20 answers)
Closed 3 years ago.
So I am building a dataset from a growing set of csv's. Rather than adding a new df# = pd.read_csv(filename, index...) line for each file, I would prefer to create a function that reads the list of csv's and appends them as they are imported. Any recommendations? The code below is what I currently have.
import glob
files = glob.glob('*.csv')
files
alg1_2018_2019 = pd.read_csv('alg1_2018_2019.csv', index_col=False)
alg1_2017_2018 = pd.read_csv('alg1_2017_2018.csv', index_col=False)
geometry_2018_2019 = pd.read_csv('geometry_2018_2019.csv', index_col=False)
geom_8_2017_2018 = pd.read_csv('geom_8_2017_2018.csv', index_col=False)
alg2_2016_2017 = pd.read_csv('alg2_2016_2017.csv', index_col=False)
alg1_2016_2017 = pd.read_csv('alg1_2016_2017.csv', index_col=False)
geom_2016_2017 = pd.read_csv('geom_2016_2017.csv', index_col=False)
geom_2015_2016 = pd.read_csv('geom_2015_2016.csv', index_col=False)
alg2_2015_2016 = pd.read_csv('alg2_2015_2016.csv', index_col=False)
alg1_part2_2015_2016 = pd.read_csv('alg1_part2_2015_2016.csv', index_col=False)
I'm using the following function:
import pandas as pd
from pathlib import Path

def glob_filemask(filemask):
    """
    Allows "globbing" files using file masks with a full path.

    Usage:
        for file in glob_filemask("/path/to/file_*.txt"):
            # process file here

    or:
        files = list(glob_filemask("/path/to/file_*.txt"))

    :param filemask: wildcards can be used only in the last part
        (file name or extension), but NOT in the directory part
    :return: Pathlib glob generator, for all matching files

    Example:
        glob_filemask("/root/subdir/data_*.csv") -
            will return a Pathlib glob generator for all matching files
        glob_filemask("/root/subdir/single_file.csv") -
            will return a Pathlib glob generator for a single file
    """
    p = Path(filemask)
    try:
        if p.is_file():
            return [p]
    except OSError:
        pass
    # fall through to globbing when the mask is not a single existing file
    return p.parent.glob(p.name)
Usage:
df = pd.concat([pd.read_csv(f) for f in glob_filemask("/path/to/file_*.csv")],
               ignore_index=True)
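When wildcards in the directory part are not needed, the approach from the linked duplicate also works directly; a minimal sketch, assuming all the CSVs share the same columns:

import glob
import pandas as pd

# Read every CSV in the current directory and stack them into one DataFrame.
files = sorted(glob.glob('*.csv'))
combined = pd.concat((pd.read_csv(f, index_col=False) for f in files),
                     ignore_index=True)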

IPYTHON Script that takes an arbitrary number of input files, strips + stores the header, and saves data points

I am trying to take any number of files (1-10 for simplicity) specified on the command line, strip and store the headers, and save the data points (then plot them later using matplotlib). The last name specified on the command line will be the name of the output file.
What I have below is a script that takes 2 command-line-specified files and plots them, with the 3rd command-line name specifying the name of the output file.
import sys
sys.argv
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
import numpy as np
# Reads Arguments
ReadFile1 = sys.argv[1]
ReadFile2 = sys.argv[2]
WriteFile = sys.argv[3]
# opens file
RedFile1 = open(ReadFile1)
RedFile2 = open(ReadFile2)
###
### Start collecting data ###
###
# Removes header from file 1 and stores it
header1 = RedFile1.readline().rstrip()
# Creates a list from values in file 1 , reads argument specified file,
# puts data into the list, and converts into an array.
Data1 = []
for line in RedFile1:
    Data1.append(line.rstrip().split(','))
Reference1 = np.array(Data1).astype(np.float)
# Separates array into x and y values in the first document
# based on the column it is in
xvalues1 = Reference1[:, 0]
yvalues1 = Reference1[:, 1]
######
###### moving on to the second part ######
######
# Removes header from file 2, stores it, and puts it in an array.
header2 = RedFile2.readline().rstrip()
# Creates a list from values in file 2 , reads argument specified file,
# puts data into the list, and converts into an array.
Data2 = []
for line in RedFile2:
    Data2.append(line.rstrip().split(','))
Reference2 = np.array(Data2).astype(np.float)
# Separates array into x and y values in the second document
# based on the column it is in
xvalues2 = Reference2[:,0]
yvalues2 = Reference2[:,1]
######
###### Graphing ######
######
# Spits out a graph of file 1 and 2 together
line1 = plt.plot(xvalues1, yvalues1, 'o-', label = header1)
line2 = plt.plot(xvalues2, yvalues2, 'o-', label = header2)
plt.ylabel('Cell Count')
plt.xlabel('Time (min)')
plt.legend(numpoints=1, loc=0)
# Saves the graph into the argument specified name
plt.savefig(WriteFile)
My pathetic attempt looks like:
import sys
sys.argv
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
# Takes arguments, places them in list rawr, then the for loop opens each file, strips
# the header, and places the headers into allheaders
rawr = sys.argv[1:-1]
#Write file name
WriteFile = sys.argv[-1]
#Opens a file, removes header and stores it, stores data in an array
#The array is accessed to create a list of x and y values
#plot is constructed for one file
#loop continues until no more files
for n in rawr:
    open(n)
    header = n.readline().rstrip()
    Data = []
    for line in n:
        Data.append(line.rstrip().split(','))
    Reference1 = np.array(Data1).astype(np.float)
    xvalues = Reference[:, 0]
    yvalues = Reference[:, 1]
    plt.plot(xvalues, yvalues, label=header)
# Spits out a graph of file(s) together
plt.ylabel('Cell count')
plt.xlabel('Time min')
plt.legend(numpoints=1, loc=0)
# Saves the graph into the argument specified name
plt.savefig(WriteFile + '.png')
EDIT: I figured it out; here is the answer. The for loop should look like:
for n in rawr:
    eachfile = open(n)
    header = eachfile.readline().rstrip()
    Data = []
    for line in eachfile:
        Data.append(line.rstrip().split(','))
    Reference = np.array(Data).astype(np.float)
    xvalues = Reference[:, 0]
    yvalues = Reference[:, 1]
    line = plt.plot(xvalues, yvalues, 'o-', label=header)
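For completeness, the fixed loop drops into the same graphing and saving code as the two-file script; the end of the script would presumably stay as before:

# Same labelling/saving code as in the two-file version above.
plt.ylabel('Cell Count')
plt.xlabel('Time (min)')
plt.legend(numpoints=1, loc=0)
plt.savefig(WriteFile + '.png')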