Python date comparison not working in .exe app - pandas

I have created a python script that works fine when running it in Spyder. I then freeze it with pyinstaller. When I run the .exe app, I get the following error.
Here is the relevant code:
import pandas as pd
import os
from datetime import datetime, time
import teradata as td
import numpy as np
import smtplib
import xlrd #needed for .exe
### Import Fleet Plan file ###
# Pieces of the UNC path to the workbook.
# NOTE(review): the backslash escaping is inconsistent ('\P' and '\A'
# survive as literal backslashes only by accident, '\\' collapses to one);
# raw strings r'\\server\share' would be safer — verify the joined path.
path = '\\\PHX43XCIFSC0001\Planning'
folder = '\Aircraft Availability'
file = '\\NP Fleet Plan.xlsx'
sheet = 'Mainline'
colnames = [0,2]  # header rows 0 and 2 -> MultiIndex columns
link = path + folder + file
# Date (not datetime) the workbook was last modified on disk.
update = pd.Timestamp.date(pd.Timestamp(datetime.fromtimestamp(
os.path.getmtime(link)), unit='s'))
mydata = pd.read_excel(link, sheet_name = sheet, header=colnames, index=None)
df = mydata
# Flatten multiindex to single columns
# Iterating df yields the (row0, row2) column-label tuples; join as "a:b".
df.columns = (['{}:{}'.format(i[0], i[1]) for i in df])
df = df.reset_index()
df = df.rename(columns={'index':'mDate', df.columns[1]:'DOW'})
# Remove blank columns and Fleet level columns
xcolunassigned = [col for col in df.columns if 'Unnamed' in col]
df = df.drop(xcolunassigned, axis=1)
xcolfleet = [col for col in df.columns if 'FLEET' in col]
df = df.drop(xcolfleet, axis=1)
# Transpose data in to vectors
dft = pd.melt(df, id_vars=['mDate', 'DOW'], var_name='Status', value_name='mCount')
# Split Subfleets, join Legacy, remove 0 and NaN
dft[['Status', 'SubFleet']] = dft.Status.str.split(':',expand=True)
sDate = min(dft.mDate)
dft = dft.dropna()
dft = dft.reset_index(drop=True)
dft = dft[dft['mCount'] != 0]
dft = dft.reset_index(drop=True)
# Delete all data prior to today
# datetime.combine(today, time.min) is midnight today, so rows from today
# onwards are kept; per the question, this comparison is what fails in
# the frozen .exe.
dft = dft[dft['mDate'] >= datetime.combine(datetime.today(), time.min) ]
dft = dft.reset_index(drop=True)
I am wondering if there is a dependency that I need to explicitly import like I had to for the xlrd library.
Thanks for the assistance.

There ended up being an issue with the .exe not loading some of the dependencies for the libraries I needed. After explicitly calling the dependencies in my code, the .exe application worked perfectly.

Related

Problem with merging multiply excel files from python

My dtype changes after I uncomment the foo and groupby lines, and I get the error "we require a list, but not a 'str'".
In my final project, I want that if the value in the first column (in my case, Date) is the same, then the text from the third column is appended there after a ',' sign.
import os
import pandas as pd
import dateutil
from pandas import DataFrame
from datetime import datetime, timedelta
# Folder holding the .xlsx files to merge.
# NOTE(review): '\D' happens to survive as a literal backslash here, but a
# raw string r'.\Data' would be safer for Windows paths.
data_file_folder = '.\Data'
df = []  # list of per-file DataFrames, concatenated below
for file in os.listdir(data_file_folder):
    if file.endswith('.xlsx'):
        print('Loading File {0}...'.format(file))
        df.append(pd.read_excel(os.path.join(data_file_folder,file),sheet_name='Sheet1'))
df_master = pd.concat(df,axis=0)  # stack all sheets vertically
df_master['Date'] = df_master['Date'].dt.date  # keep only the date part
# Commented-out aggregation under discussion: join all 'Tweet' values that
# share a Date into one comma-separated string, then write to Excel.
#foo = lambda a: ", ".join(a)
#df_master = df_master.groupby(by='Date').agg({'Tweet': foo}).reset_index()
#df_master.to_excel('.\NewFolder\example.xlsx',index=False)
#df_master

How to add avwap to pandas_ta?

I am trying to get anchored vwap from specific date using pandas_ta. How to set anchor to specific date?
import pandas as pd
import yfinance as yf
import pandas_ta as ta
from datetime import datetime, timedelta, date
import warnings
import plac
# Daily AAPL bars for the anchoring experiment.
data = yf.download("aapl", start="2021-07-01", end="2022-08-01")
df = pd.DataFrame(data)
# pandas_ta VWAP anchored per calendar day ("D" offset alias); there is no
# parameter to anchor at one *specific* date, which is the question.
df1 = df.ta.vwap(anchor = "D")
df14 = pd.concat([df, df1],axis=1)
print(df14)
pandas_ta.vwap anchor depending on the index values, as pandas-ta said(reference)
anchor (str): How to anchor VWAP. Depending on the index values, it will
implement various Timeseries Offset Aliases as listed here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
Default: "D".
In other words, you can't specify a specific date the way TradingView does.
To anchor a date ourself,
import pandas as pd
import numpy as np
import yfinance as yf
import pandas_ta as ta
# set anchor date
anchored_date = pd.to_datetime('2022-01-30')
data = yf.download("aapl", start="2022-01-01", end="2022-08-01")
df = pd.DataFrame(data)
df1 = df.ta.vwap(anchor = "D")
df14 = pd.concat([df, df1],axis=1)
# I create a column 'typical_price', it should be identical with 'VWAP_D'
df14['typical_price'] = (df14['High'] + df14['Low'] + df14['Close'])/3
# Typical price times volume — the numerator of VWAP.
tpp_d = ((df14['High'] + df14['Low'] + df14['Close'])*df14['Volume'])/3
# Anchored VWAP: .where(mask) blanks rows before the anchor date, and
# grouping by the same boolean mask confines cumsum to the on/after-anchor
# rows, giving cumulative (price*volume) / cumulative volume from the
# anchor onwards; rows before the anchor stay NaN.
df14['anchored_VWAP'] = tpp_d.where(df14.index >= anchored_date).groupby(df14.index >= anchored_date).cumsum()/df14['Volume'].where(df14.index >= anchored_date).groupby(df14.index >= anchored_date).cumsum()
df14
Plot

Exec in function write unwanted variables in workspace

Problem description:
I have written code to load files from a folder into a function which puts tdms files into one single dataframe. After putting this code into a function, problems appeared. I know the root of the problem is around defining the variables in the scope. I would like my function to only output "dataFrame". Instead, the global in the exec function leads to dataFrame_1, dataFrame_2, ... in the workspace. How can I avoid this from happening?
My code in a function:
#%% Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, glob, sqlite3
import tkinter as tk
from tkinter import filedialog
from nptdms import TdmsFile
#%% Load data
def get_dataframe():
    """
    Ask the user for a folder and merge every .tdms file found in it
    (searched recursively) into one DataFrame.

    Returns
    -------
    pandas.DataFrame
        All channels of all files side by side (axis=1), columns renamed
        to "Burst 0", "Burst 1", ...
    """
    def select_folder():
        # Hidden, top-most Tk root so only the directory dialog shows.
        root = tk.Tk()
        root.attributes('-topmost', True)
        root.withdraw()
        print("Please select a folder with the tdms-files inside...")
        folder_root = filedialog.askdirectory()
        root.destroy()  # release the Tk root (the original leaked it)
        return folder_root

    folder = select_folder()
    # No exec/eval and no numbered globals: a plain local list of frames.
    # Each loop iteration reuses the same local names, so nothing escapes
    # the function scope and the workspace stays clean.
    df_list = []
    for path, subdirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".tdms"):
                # Join the full path instead of os.chdir(), so the process
                # working directory is not mutated as a side effect.
                tdms_file = TdmsFile.read(os.path.join(path, file))
                tdms_file.close()  # data already in memory after read()
                df_list.append(tdms_file.as_dataframe(time_index=True))
    dataFrame = pd.concat(df_list, axis=1)
    dataFrame.columns = ["Burst {0}".format(i)
                         for i in range(dataFrame.shape[1])]
    return dataFrame
dataFrame = get_dataframe()
Outside of the function this part works fine:
# put data in one dataframe
i = 1
df_list = []  # names "dataFrame_1", "dataFrame_2", ... for later eval()
for path, subdirs, files in os.walk(folder):
    for file in files:
        if file.endswith(".tdms"):
            # print(os.path.join(path, file))
            os.chdir(path)
            exec("tdms_file%d = TdmsFile.read(file)" % (i))
            exec("tdms_file%d.close()" % (i))
            exec("dataFrame_%d = tdms_file%d.as_dataframe(time_index=True)" % (i,i))
            # BUG FIX: the format string below has one %d placeholder but
            # was given a two-element tuple (i, i), which raises
            # "not all arguments converted during string formatting".
            exec("del tdms_file%d" % (i))
            df_list.append("dataFrame_%d" % (i))
            i += 1

Combining CSV of different shapes into one CSV

I have CSVs with different numbers of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b, which starts as an empty array, does not hold the data from the previous loops.
from os import walk
import sys
import numpy as np

filenames = []
dirpath = []
filtered = []
original = []
f = []
# BUG FIX: the original assigned b = np.column_stack(df) inside the loop,
# overwriting b on every iteration, so earlier files were lost. Collect
# each loaded array in a list instead and combine once after the loop.
b = []
for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
    f.extend(dirnames)
print(f)
for names in f:
    print(names)
    df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv', dtype =None, delimiter = ',', skip_header=1, names=True)
    b.append(df)
    print(b)
# b now holds one structured array per CSV; since the files have different
# shapes, stack/align them by the first column with pandas (pd.concat)
# rather than np.column_stack, which requires matching lengths.
Have you tried pd.concat()?
import os
import pandas as pd

# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)
# Load every file in the directory, one DataFrame each, then stack them
# vertically into a single frame.
cat_list = [
    pd.read_csv(os.path.join(root_dir, fname), delimiter=',', header=None)
    for fname in file_names
]
concatted_df = pd.concat(cat_list)

Dask Dataframe: Defining meta for date diff in groupby

I'm trying to find inter-purchase times (i.e., days between orders) for customers. Although my code is working correctly without defining meta, I would like to get it working properly and no longer see the warning asking me to provide meta.
Also, I would appreciate any suggestions on how to use map or map_partitions instead of apply.
So far I've tried:
meta={'days_since_last_order': 'datetime64[ns]'}
meta={'days_since_last_order': 'f8'}
meta={'ORDER_DATE_DT':'datetime64[ns]','days_since_last_order': 'datetime64[ns]'}
meta={'ORDER_DATE_DT':'f8','days_since_last_order': 'f8'}
meta=('days_since_last_order', 'f8')
meta=('days_since_last_order', 'datetime64[ns]')
Here is my code:
import numpy as np
import pandas as pd
import datetime as dt
import dask.dataframe as dd
from dask.distributed import wait, Client
client = Client(processes=True)
# Synthetic orders: 10 random (customer, date) rows spread over 2015-2017.
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2018-01-01')
d = (end - start).days + 1
np.random.seed(0)
df = pd.DataFrame()
df['CUSTOMER_ID'] = np.random.randint(1, 4, 10)
df['ORDER_DATE_DT'] = start + pd.to_timedelta(np.random.randint(1, d, 10), unit='d')
print(df.sort_values(['CUSTOMER_ID','ORDER_DATE_DT']))
print(df)
ddf = dd.from_pandas(df, npartitions=2)
# setting ORDER_DATE_DT as index to sort by date
ddf = ddf.set_index('ORDER_DATE_DT')
ddf = client.persist(ddf)
wait(ddf)
ddf = ddf.reset_index()
# Per-customer inter-purchase gap: diff() of the (date-sorted) order dates
# within each CUSTOMER_ID group. The commented-out meta= is the slot the
# warning asks to fill with the output schema of the lambda.
grp = ddf.groupby('CUSTOMER_ID')[['ORDER_DATE_DT']].apply(
    lambda df: df.assign(days_since_last_order=df.ORDER_DATE_DT.diff(1))
    # meta=????
)
# for some reason, I'm unable to print grp unless I reset_index()
grp = grp.reset_index()
print(grp.compute())
Here is the printout of df.sort_values(['CUSTOMER_ID','ORDER_DATE_DT'])
Here is the printout of grp.compute()