How to read multiple files and save a single output with pandas using command-line arguments - pandas

I have multiple files in TXT format. How can I read all of them and merge the values into a single output file, passing the file names as command-line arguments, with pandas?
like this:
python3 file1.txt file2.txt file3.txt
Code:
import pandas as pd
import socket, struct
import os
import glob
import sys
try:
    file = sys.argv[1]
except Exception:
    print("Usage: python3 {} [file]".format(sys.argv[0]))
    sys.exit()
os.chdir('/Users/roc/Desktop/js/projj')
fileList = glob.glob('*.txt')
appended_data = []
for file in fileList:
    pdd = pd.read_csv(file, header=None, sep='|', error_bad_lines=False, warn_bad_lines=False,
                      skiprows=[0], names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'],
                      low_memory=False)
    df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
    appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)
pd.options.mode.chained_assignment = None

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

df['Ip'] = df.Ip.apply(ip2int)
df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df = df2.sort_values('Range', ascending=True)
print(result_df.to_csv("/Users/roc/Desktop/js/projj/delegated2.txt", sep=' ', index=False, header=False))

Use the code below to iterate through a folder and append all the files to a single dataframe:
import os
import glob
import pandas as pd

os.chdir('C:\\path_to_folder\\')
fileList = glob.glob('*.txt')
appended_data = []
for file in fileList:
    pdd = pd.read_csv(file, header=None, sep='|', error_bad_lines=False, warn_bad_lines=False,
                      skiprows=[0], names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'],
                      low_memory=False)
    df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
    appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)
Once you have the df combined from the data of all the files, use the next part of the code:
pd.options.mode.chained_assignment = None

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

df['Ip'] = df.Ip.apply(ip2int)
df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df = df2.sort_values('Range', ascending=True)
result_df.to_csv("/Users/roc/Desktop/output.txt", sep=' ', index=False, header=False)
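The answer above globs a hard-coded folder, but the question asked for command-line arguments, so here is a minimal sketch (an illustration, not part of the original answer) that merges whichever pipe-delimited .txt files are passed on the command line. The script name and output path are only examples.
import sys
import pandas as pd

# minimal sketch: merge every pipe-delimited .txt file passed on the command line,
# e.g. python3 merge.py file1.txt file2.txt file3.txt ("merge.py" is an illustrative name)
if len(sys.argv) < 2:
    sys.exit("Usage: python3 {} file1.txt [file2.txt ...]".format(sys.argv[0]))
frames = []
for file in sys.argv[1:]:
    frames.append(pd.read_csv(file, header=None, sep='|', skiprows=[0],
                              names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category']))
df = pd.concat(frames, ignore_index=True)
df.to_csv("merged.txt", sep=' ', index=False, header=False)  # illustrative output path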

Related

Read web content into a dataframe without writing to a file

I am trying to read data from the following link into a data frame without saving it locally (this is important). I figured out a way (below), but is there a more efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1, len(str1)):
    if len(str1[ii]) > 0:
        # Read each line into its own dataframe
        df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None)
        if not df1.empty:
            df2 = df1.iloc[:, 0:3]  # Keep the first three columns
            if df2.iloc[0, -1] != 'M':  # Don't append the rows with missing data
                dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station', 'Date', 'Temp']
ax1 = df.plot(x=1, y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
"station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
"month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
"elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
with requests.Session() as session:
    response = session.get(url, timeout=30)
    response.raise_for_status()  # raises an HTTPError if the request failed
    df = pd.read_csv(StringIO(response.text), sep=",")
print(df)
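As a side note (not part of the original answer), pd.read_csv can also fetch an http(s) URL directly, so for a plain comma-separated response the requests/StringIO step may be optional:
import pandas as pd

# `url` is the same query string built in the snippet above;
# read_csv accepts URLs directly, so no intermediate download is needed
df = pd.read_csv(url, sep=",")
print(df.head())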

Exec in a function writes unwanted variables to the workspace

Problem description:
I have written code to load files from a folder and put it into a function that combines tdms files into one single dataframe. After putting this code into a function, problems appeared. I know the root of the problem is how the variables are defined in the scope. I would like my function to only output "dataFrame". Instead, the global in the exec calls leaves dataFrame_1, dataFrame_2, ... in the workspace. How can I avoid this from happening?
My code in a function:
#%% Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, glob, sqlite3
import tkinter as tk
from tkinter import filedialog
from nptdms import TdmsFile
#%% Load data
def get_dataframe():
    """
    The function takes a folder path with a path dialogue and puts all
    tdms-files in one dataframe.

    Returns
    -------
    TYPE
        Dataframe.
    """
    # select folder to load data from
    def select_folder():
        root = tk.Tk()
        root.attributes('-topmost', True)
        root.withdraw()
        print("Please select a folder with the tdms-files inside...")
        folder_root = filedialog.askdirectory()
        return folder_root

    folder = select_folder()
    os.chdir(folder)
    # put data in one dataframe
    i = 1
    df_list = []
    for path, subdirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".tdms"):
                # print(os.path.join(path, file))
                os.chdir(path)
                exec("global tdms_file%d; tdms_file%d = TdmsFile.read(file)"
                     % (i, i))
                exec("tdms_file%d.close()" % (i))
                exec("global dataFrame_%d; global tdms_file%d; "
                     "dataFrame_%d = tdms_file%d.as_dataframe(time_index=True)"
                     % (i, i, i, i))
                exec("global tdms_file%d; del tdms_file%d" % (i, i))
                df_list.append("dataFrame_%d" % (i))
                i += 1
    dataFrame = pd.concat([eval(element) for element in df_list], axis=1)
    Burst_name = ["Burst {0}".format(i) for i in range(dataFrame.shape[1])]
    dataFrame.columns = Burst_name
    return dataFrame
dataFrame = get_dataframe()
Outside of the function this part works fine:
# put data in one dataframe
i = 1
df_list = []
for path, subdirs, files in os.walk(folder):
    for file in files:
        if file.endswith(".tdms"):
            # print(os.path.join(path, file))
            os.chdir(path)
            exec("tdms_file%d = TdmsFile.read(file)" % (i))
            exec("tdms_file%d.close()" % (i))
            exec("dataFrame_%d = tdms_file%d.as_dataframe(time_index=True)" % (i, i))
            exec("del tdms_file%d" % (i))
            df_list.append("dataFrame_%d" % (i))
            i += 1
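One common way to avoid the exec/eval pattern, and with it the leaked dataFrame_1, dataFrame_2, ... names, is to append the DataFrame objects themselves to the list and concatenate once at the end. The following is only a sketch of that idea, not code from the original thread:
import os
import pandas as pd
from nptdms import TdmsFile

def get_dataframe(folder):
    """Sketch: collect every .tdms file under `folder` into one dataframe
    without exec/global, so no numbered variables leak into the workspace."""
    df_list = []
    for path, subdirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".tdms"):
                tdms_file = TdmsFile.read(os.path.join(path, file))
                df_list.append(tdms_file.as_dataframe(time_index=True))
    dataFrame = pd.concat(df_list, axis=1)
    dataFrame.columns = ["Burst {0}".format(i) for i in range(dataFrame.shape[1])]
    return dataFrame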

How can I add each new dataframe to the csv that is created?

My problem is that only the most recent url request is saved. How can I save all the responses? I tried using df.to_csv('complete.csv', 'a') but that creates a jumbled file.
# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
# main code
with open('list.txt', 'r') as f_in:
    for line in map(str.strip, f_in):
        if not line:
            continue
        response = requests.get(line)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        linecodes = []
        partnos = []
        for tbody in soup.select('tbody[id^="listingcontainer"]'):
            tmp = tbody.find('span', class_='listing-final-manufacturer')
            linecodes.append(tmp.text if tmp else '-')
            tmp = tbody.find('span', class_='listing-final-partnumber as-link-if-js buyers-guide-color')
            partnos.append(tmp.text if tmp else '-')
        # create dataframe
        df = pd.DataFrame(zip(linecodes, partnos), columns=['linecode', 'partno'])
        # save to csv
        df.to_csv('complete.csv')
        print(df)
list.txt
https://www.rockauto.com/en/catalog/ford,2010,f-150,6.2l+v8,1447337,brake+&+wheel+hub,brake+pad,1684
https://www.rockauto.com/en/catalog/ford,2015,f-150,5.0l+v8,3308775,brake+&+wheel+hub,brake+pad,1684
You are saving the dataframe after each iteration, which just overwrites the previous save. Instead, append the dataframes as you iterate; after the loop completes, save that final dataframe. So something like:
# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
# main code
with open('list.txt', 'r') as f_in:
    final_df = pd.DataFrame()
    for line in map(str.strip, f_in):
        if not line:
            continue
        response = requests.get(line)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        linecodes = []
        partnos = []
        for tbody in soup.select('tbody[id^="listingcontainer"]'):
            tmp = tbody.find('span', class_='listing-final-manufacturer')
            linecodes.append(tmp.text if tmp else '-')
            tmp = tbody.find('span', class_='listing-final-partnumber as-link-if-js buyers-guide-color')
            partnos.append(tmp.text if tmp else '-')
        # create dataframe for this URL
        df = pd.DataFrame(zip(linecodes, partnos), columns=['linecode', 'partno'])
        print(df)
        # append to the running dataframe
        final_df = final_df.append(df, sort=False).reset_index(drop=True)

# save to csv once the loop has finished
final_df.to_csv('complete.csv')
print(final_df)
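One caveat: DataFrame.append was deprecated and then removed in pandas 2.0, so on current pandas the usual replacement is to collect the per-URL frames in a list and call pd.concat once after the loop. A rough sketch of that substitution (not from the original answer):
import pandas as pd

# pandas >= 2.0 removed DataFrame.append; collect the per-URL frames in a
# list inside the loop, then concatenate once after the loop finishes
frames = []
# inside the loop shown above, replace final_df.append(df, ...) with:
#     frames.append(df)
final_df = pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()
final_df.to_csv('complete.csv', index=False)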

Combining CSVs of different shapes into one CSV

I have CSVs with different numbers of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b, which starts as an empty array, does not hold the data from the previous loop iterations.
from os import walk
import sys
import numpy as np
filenames= []
dirpath = []
filtered = []
original = []
f = []
b = np.empty([2, 2])
for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
    f.extend(dirnames)
print(f)

for names in f:
    print(names)
    df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv',
                       dtype=None, delimiter=',', skip_header=1, names=True)
    b = np.column_stack(df)
    print(b)
Have you tried pd.concat()?
import os
import pandas as pd
# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)
cat_list=[]
for names in file_names:
    df = pd.read_csv(os.path.join(root_dir, names), delimiter=',', header=None)
    cat_list.append(df)
concatted_df = pd.concat(cat_list)
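To get the single stacked CSV the question asked for, the combined frame from the snippet above can then be written out; the output file name below is just an example:
# continues the snippet above: write the stacked data to one CSV file
concatted_df.to_csv(os.path.join(root_dir, "combined.csv"), index=False, header=False)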

Python date comparison not working in .exe app

I have created a Python script that works fine when running it in Spyder. I then freeze it with PyInstaller. When I run the .exe app, I get an error on the date comparison.
Here is the relevant code:
import pandas as pd
import os
from datetime import datetime, time
import teradata as td
import numpy as np
import smtplib
import xlrd #needed for .exe
### Import Fleet Plan file ###
path = '\\\PHX43XCIFSC0001\Planning'
folder = '\Aircraft Availability'
file = '\\NP Fleet Plan.xlsx'
sheet = 'Mainline'
colnames = [0,2]
link = path + folder + file
update = pd.Timestamp.date(pd.Timestamp(datetime.fromtimestamp(
os.path.getmtime(link)), unit='s'))
mydata = pd.read_excel(link, sheet_name = sheet, header=colnames, index=None)
df = mydata
# Flatten multiindex to single columns
df.columns = (['{}:{}'.format(i[0], i[1]) for i in df])
df = df.reset_index()
df = df.rename(columns={'index':'mDate', df.columns[1]:'DOW'})
# Remove blank columns and Fleet level columns
xcolunassigned = [col for col in df.columns if 'Unnamed' in col]
df = df.drop(xcolunassigned, axis=1)
xcolfleet = [col for col in df.columns if 'FLEET' in col]
df = df.drop(xcolfleet, axis=1)
# Transpose data in to vectors
dft = pd.melt(df, id_vars=['mDate', 'DOW'], var_name='Status', value_name='mCount')
# Split Subfleets, join Legacy, remove 0 and NaN
dft[['Status', 'SubFleet']] = dft.Status.str.split(':',expand=True)
sDate = min(dft.mDate)
dft = dft.dropna()
dft = dft.reset_index(drop=True)
dft = dft[dft['mCount'] != 0]
dft = dft.reset_index(drop=True)
# Delete all data prior to today
dft = dft[dft['mDate'] >= datetime.combine(datetime.today(), time.min) ]
dft = dft.reset_index(drop=True)
I am wondering if there is a dependency that I need to explicitly import like I had to for the xlrd library.
Thanks for the assistance.
There ended up being an issue with the .exe not loading some of the dependencies for the libraries I needed. After explicitly calling the dependencies in my code, the .exe application worked perfectly.
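For reference, "explicitly calling the dependencies" usually means importing the optional backends that PyInstaller's analysis misses so they get bundled into the frozen .exe; which modules are needed depends on the pandas/Excel setup, so the lines below are only an illustration. PyInstaller's --hidden-import option (for example: pyinstaller --hidden-import xlrd script.py) achieves the same thing without touching the code.
# illustration only: import the Excel backends explicitly so PyInstaller bundles them
import xlrd        # the backend the poster already had to import for pandas.read_excel
import openpyxl    # assumption: may be the relevant .xlsx backend on newer pandas versions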