Return None or an empty DataFrame when reading empty input in PySpark

So I am trying to read in a folder that may sometimes be empty.
The folder is called ABC.csv and contains no files.
df = spark.read.parquet("/Users/test/Downloads/ABC.csv")
How do I return None or an empty DataFrame when the folder is empty, given that it may sometimes have contents?

Sample code snippet. Please modify it based on your input files.
import glob

list_of_files = glob.glob("D:/data/in/dcad_data/*.csv")
if list_of_files:
    # create the DataFrame from the matched files
    df = spark.read.csv(list_of_files)
else:
    df = None
print(df)

You can check whether the folder is empty with plain Python, like this:
import os
from pyspark.sql.types import StructType

# path of the directory
path = "/Users/test/Downloads/ABC.csv"
# get the directory contents
contents = os.listdir(path)
# check whether the directory is empty
if len(contents) == 0:
    df = spark.createDataFrame([], StructType([]))
else:
    df = spark.read.parquet(path)
Or, if you want to check specifically whether any Parquet files are present in the folder, do this:
import glob
import os.path
from pyspark.sql.types import StructType

# path of the directory
path = "/Users/test/Downloads/ABC.csv"
parquet_files = glob.glob(os.path.join(path, '*.parquet'))
# check whether any Parquet files were found
if len(parquet_files) == 0:
    df = spark.createDataFrame([], StructType([]))
else:
    df = spark.read.parquet(path)
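If you prefer not to inspect the filesystem first (for example, when the data lives on HDFS or S3 rather than a local disk), another option is to attempt the read and fall back when it fails. A minimal sketch, assuming a SparkSession named spark and that an empty or missing directory raises AnalysisException:
from pyspark.sql.types import StructType
from pyspark.sql.utils import AnalysisException

def read_parquet_or_empty(spark, path):
    """Return a DataFrame for `path`, or an empty DataFrame if the read fails."""
    try:
        return spark.read.parquet(path)
    except AnalysisException:
        # raised e.g. when the path does not exist or no schema can be inferred
        return spark.createDataFrame([], StructType([]))

df = read_parquet_or_empty(spark, "/Users/test/Downloads/ABC.csv")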

Related

Returning all the column names as lists from multiple Parquet Files in Python

I have more than 100 Parquet files in a folder. I am not sure whether all the files have the same feature names (column names). I want to write some Python code, using pandas, that reads every file in the directory and returns the column names with the file name as a prefix.
I tried a for loop, but I am not sure how to structure it. Being a beginner, I could not write the looped script.
import glob
import pandas as pd

path = r'C:\Users\NewFOlder1\NewFOlder\Folder'
all_files = glob.glob(path + r'\*.gzip')
col = []
for paths in all_files:
    df = pd.read_parquet(paths)
    col.append(df.columns)
print(col)
IIUC, use pandas.concat with pandas.DataFrame.columns:
import glob
import pandas as pd

path = r'C:\Users\NewFOlder1\NewFOlder\Folder'
all_files = glob.glob(path + r'\*.gzip')
list_dfs = []
for paths in all_files:
    df = pd.read_parquet(paths)
    list_dfs.append(df)
col_names = pd.concat(list_dfs).columns.tolist()
Can you try this:
import glob
import os.path
import pandas as pd

path = r'C:\Users\NewFOlder1\NewFOlder\Folder'
all_files = glob.glob(path + r'\*.gzip')
col = []
for paths in all_files:
    df = pd.read_parquet(paths)
    # prefix each column name with the file name
    col.append([os.path.basename(paths) + '_' + c for c in df.columns])
print(col)
If the filenames look like "abcd.parquet" (if not, please provide a sample filename), you can try something like this to find columns that do not appear in every file:
# flatten the per-file lists and strip the "filename_" prefix again
all_cols = [c for per_file in col for c in per_file]
names_only = [c.split("_", 1)[1] for c in all_cols]
differences = []
for prefixed, name in zip(all_cols, names_only):
    # keep entries whose bare column name does not occur in every file
    if names_only.count(name) < len(col):
        differences.append(prefixed)
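If the files are large, reading every file fully just to collect column names can be slow. A lighter-weight sketch, assuming the files are valid Parquet and that pyarrow is installed (the dictionary name cols_per_file is illustrative):
import glob
import os.path
import pyarrow.parquet as pq

path = r'C:\Users\NewFOlder1\NewFOlder\Folder'
cols_per_file = {}
for f in glob.glob(os.path.join(path, '*.gzip')):
    # read only the Parquet footer/schema, not the data
    schema = pq.read_schema(f)
    cols_per_file[os.path.basename(f)] = list(schema.names)
print(cols_per_file)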

exec in a function writes unwanted variables into the workspace

Problem description:
I have written code that loads files from a folder and puts the TDMS files into one single DataFrame. After moving this code into a function, problems appeared. I know the root of the problem is how the variables are defined in the scope. I would like my function to output only "dataFrame". Instead, the global statements inside the exec calls leave dataFrame_1, dataFrame_2, ... in the workspace. How can I avoid this?
My code in a function:
#%% Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, glob, sqlite3
import tkinter as tk
from tkinter import filedialog
from nptdms import TdmsFile

#%% Load data
def get_dataframe():
    """
    The function takes a folder path via a path dialogue and puts all
    tdms-files into one dataframe.

    Returns
    -------
    TYPE
        Dataframe.
    """
    # select folder to load data from
    def select_folder():
        root = tk.Tk()
        root.attributes('-topmost', True)
        root.withdraw()
        print("Please select a folder with the tdms-files inside...")
        folder_root = filedialog.askdirectory()
        return folder_root

    folder = select_folder()
    os.chdir(folder)
    # put data in one dataframe
    i = 1
    df_list = []
    for path, subdirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".tdms"):
                # print(os.path.join(path, file))
                os.chdir(path)
                exec("global tdms_file%d; tdms_file%d = TdmsFile.read(file)"
                     % (i, i))
                exec("tdms_file%d.close()" % (i))
                exec("global dataFrame_%d; global tdms_file%d; "
                     "dataFrame_%d = tdms_file%d."
                     "as_dataframe(time_index=True)" % (i, i, i, i))
                exec("global tdms_file%d; del tdms_file%d" % (i, i))
                df_list.append("dataFrame_%d" % (i))
                i += 1
    dataFrame = pd.concat([eval(element) for element in df_list], axis=1)
    Burst_name = ["Burst {0}".format(i) for i in range(dataFrame.shape[1])]
    dataFrame.columns = Burst_name
    return dataFrame
dataFrame = get_dataframe()
Outside of the function this part works fine:
# put data in one dataframe
i = 1
df_list = []
for path, subdirs, files in os.walk(folder):
    for file in files:
        if file.endswith(".tdms"):
            # print(os.path.join(path, file))
            os.chdir(path)
            exec("tdms_file%d = TdmsFile.read(file)" % (i))
            exec("tdms_file%d.close()" % (i))
            exec("dataFrame_%d = tdms_file%d.as_dataframe(time_index=True)" % (i, i))
            exec("del tdms_file%d" % (i))
            df_list.append("dataFrame_%d" % (i))
            i += 1
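For reference, the usual way to avoid exec and eval here is to append the DataFrames themselves to a list instead of generating numbered variable names; then nothing extra appears in the workspace because no dynamic names are created. A minimal sketch of the same loop, assuming the nptdms API used above:
import os
import pandas as pd
from nptdms import TdmsFile

def get_dataframe(folder):
    """Concatenate all .tdms files under `folder` into one DataFrame."""
    frames = []
    for path, subdirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".tdms"):
                # same calls as above, but the result goes into a plain list
                tdms_file = TdmsFile.read(os.path.join(path, file))
                frames.append(tdms_file.as_dataframe(time_index=True))
    dataFrame = pd.concat(frames, axis=1)
    dataFrame.columns = ["Burst {0}".format(i) for i in range(dataFrame.shape[1])]
    return dataFrame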

How to determine the names of Excel files that raise errors while being read by pandas?

I have many Excel files (about 500). The files have different sheets. The purpose of the code is to extract data from specific columns and rows of the second sheet. The desired output is the extracted data collected into an Excel file named 'FluidsVolumeReport.xlsx'.
import pandas as pd
import glob
import numpy as np

filenames = glob.glob('*.xlsx')
dlist = []
for file in filenames:
    Dict = {}
    xl = pd.ExcelFile(file)
    Sheet_Names_list = xl.sheet_names
    df = pd.DataFrame()
    df = pd.read_excel(file, sheet_name=Sheet_Names_list[1])
    for row in range(3, 8):
        Dict.update({"Date": df.iat[2, 2]})
        Dict.update({df.iat[9, 25]: df.iat[9, 35]})
        Dict.update({df.iat[10, 25]: df.iat[10, 35]})
        Dict.update({df.iat[row, 40]: df.iat[row, 47]})
    dlist.append(Dict)
dflist = []
for i in range(0, len(dlist)):
    Dict = dlist[i]
    df = pd.DataFrame(data=Dict, index=[0])
    dflist.append(df)
df = pd.concat(dflist, axis=0, sort=False, ignore_index=True)
df.sort_values(by='Date', inplace=True)
df.replace(0.0, np.nan, inplace=True)
df.to_excel('FluidsVolumeReport.xlsx')
The code errors while reading some Excel files, either because the file will not open or because it does not match the specified range, and as a result the code stops with an error:
IndexError: index 25 is out of bounds for axis 0 with size 14
My aim is to write code that ignores any Excel files that have errors and reports the names of those files. Please help me complete my code.
As suggested by @rahlf23, you could do something like this (keeping it simple). Put the try/except inside the loop so that a failing file is reported and skipped while the remaining files are still processed:
...
for file in filenames:
    try:
        ...
        dlist.append(Dict)
    except IndexError:
        print(file)
        continue
dflist = []
...
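If you also want to catch files that fail to open at all, not just the IndexError from out-of-range .iat lookups, a slightly broader sketch might collect the offending names instead of only printing them. The bad_files name and the set of caught exceptions are assumptions, not part of the original code:
import glob
import pandas as pd

filenames = glob.glob('*.xlsx')
dlist = []
bad_files = []
for file in filenames:
    try:
        xl = pd.ExcelFile(file)
        df = pd.read_excel(file, sheet_name=xl.sheet_names[1])
        Dict = {"Date": df.iat[2, 2]}
        Dict.update({df.iat[9, 25]: df.iat[9, 35]})
        Dict.update({df.iat[10, 25]: df.iat[10, 35]})
        for row in range(3, 8):
            Dict.update({df.iat[row, 40]: df.iat[row, 47]})
        dlist.append(Dict)
    except (IndexError, ValueError, OSError) as err:
        # remember which file failed and why, then move on
        bad_files.append((file, str(err)))
print(bad_files)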

Loading/analyzing a bunch of text files in Pandas/SQL

I have a few thousand text files and would like to analyze them for trends, word patterns, etc. I am familiar with both pandas and SQL, but I am not sure how to "load" all these files into a table/system so that I can run code on them. Any advice?
If all the text files have the same columns, you can use something like this:
import glob
import pandas as pd

path = r'C:/location_rawdata_files'  # the path where you stored all the txt files
all_files = glob.glob(path + "/*.txt")
lst = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None)
    lst.append(df)
df = pd.concat(lst, axis=0, ignore_index=True)
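Since the question also mentions SQL, one common follow-up is to push the combined frame into a local database so it can be queried directly. A small sketch, assuming a SQLite file named text_data.db and a table name docs (both names are illustrative):
import sqlite3
import pandas as pd

# df is the concatenated DataFrame built in the snippet above
conn = sqlite3.connect("text_data.db")
df.to_sql("docs", conn, if_exists="replace", index=False)

# the table can now be queried with ordinary SQL, e.g.
print(pd.read_sql_query("SELECT COUNT(*) AS n_rows FROM docs", conn))
conn.close()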

Import a growing list of csv files only to append after importing [duplicate]

This question already has answers here:
Import multiple CSV files into pandas and concatenate into one DataFrame
(20 answers)
Closed 3 years ago.
So I am building a dataset from a growing set of CSVs. Rather than adding a new df# = pd.read_csv(filename, index...) line each time, I would prefer to create a function that reads the list of CSVs and appends them on import. Any recommendations? The code below is what I currently have.
import glob
import pandas as pd
files = glob.glob('*.csv')
files
alg1_2018_2019 = pd.read_csv('alg1_2018_2019.csv', index_col=False)
alg1_2017_2018 = pd.read_csv('alg1_2017_2018.csv', index_col=False)
geometry_2018_2019 = pd.read_csv('geometry_2018_2019.csv', index_col=False)
geom_8_2017_2018 = pd.read_csv('geom_8_2017_2018.csv', index_col=False)
alg2_2016_2017 = pd.read_csv('alg2_2016_2017.csv', index_col=False)
alg1_2016_2017 = pd.read_csv('alg1_2016_2017.csv', index_col=False)
geom_2016_2017 = pd.read_csv('geom_2016_2017.csv', index_col=False)
geom_2015_2016 = pd.read_csv('geom_2015_2016.csv', index_col=False)
alg2_2015_2016 = pd.read_csv('alg2_2015_2016.csv', index_col=False)
alg1_part2_2015_2016 = pd.read_csv('alg1_part2_2015_2016.csv', index_col=False)
I'm using the following function:
import pandas as pd
from pathlib import Path

def glob_filemask(filemask):
    """
    Allows "globbing" files using file masks with a full path.

    Usage:
        for file in glob_filemask("/path/to/file_*.txt"):
            # process file here
    or:
        files = list(glob_filemask("/path/to/file_*.txt"))

    :param filemask: wildcards can be used only in the last part
                     (file name or extension), but NOT in the directory part
    :return: Pathlib glob generator, for all matching files

    Example:
        glob_filemask("/root/subdir/data_*.csv") -
            will return a Pathlib glob generator for all matching files
        glob_filemask("/root/subdir/single_file.csv") -
            will return a Pathlib glob generator for a single file
    """
    p = Path(filemask)
    try:
        if p.is_file():
            return [p]
    except OSError:
        pass
    # either the mask contains wildcards or the path is not an existing file,
    # so glob in the parent directory
    return p.parent.glob(p.name)
Usage:
df = pd.concat([pd.read_csv(f) for f in glob_filemask("/path/to/file_*.csv")],
               ignore_index=True)
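For the common case in the question, where all the files sit in the working directory, a plain glob-based helper is enough. A minimal sketch (the function name load_all_csvs is illustrative):
import glob
import pandas as pd

def load_all_csvs(pattern="*.csv"):
    """Read every CSV matching `pattern` and append them into one DataFrame."""
    frames = [pd.read_csv(f, index_col=False) for f in glob.glob(pattern)]
    return pd.concat(frames, ignore_index=True)

df = load_all_csvs()  # picks up newly added CSV files automatically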