I want to write my newly computed output columns to a new sheet named "Analyzed Data" for each of multiple files. Each file has a different number of columns with varying names.
import os
import pandas as pd
# Root folder that is searched recursively for tab-delimited .txt inputs.
path = r'C:\Users\Me\1Test'

# Full path of every .txt file found anywhere below `path`.
filelist = [
    os.path.join(root, f)
    for root, dirs, files in os.walk(path)
    for f in files
    if f.endswith('.txt')
]

for f in filelist:
    df = pd.read_table(f)
    # Row-wise mean over every column except the last three.
    col = df.iloc[:, :-3]
    df['Average'] = col.mean(axis=1)
    # With 'Average' appended, columns[-3] is the column that is subtracted
    # from all others. That column and 'Average' itself are left out of the
    # subtraction; the differences are appended with a " - Background" suffix.
    out = df.join(
        df.drop(df.columns[[-3, -1]], axis=1)
        .sub(df[df.columns[-3]], axis=0)
        .add_suffix(' - Background')
    )
    # Swap only the extension: str.replace('txt', 'xlsx') would also rewrite
    # any other "txt" occurring in a folder or file name.
    out.to_excel(os.path.splitext(f)[0] + '.xlsx', sheet_name='Sheet1')
Related
I am reading search values from a txt file.
I use each value to look up matching rows in a dataframe.
# Question code, kept exactly as posted (the loop/if indentation was lost in
# the paste, so the original nesting cannot be recovered with certainty).
# For each search value it selects the rows whose 'SERIAL' contains the value
# and appends them to try.csv.
for lines in lines_list:
sn = lines
# NOTE(review): `in` applied to a DataFrame tests its column labels, not its
# rows, so this condition does not check whether any row matched.
if sn in df[df['SERIAL'].str.contains(sn)]:
condition = df[df['SERIAL'].str.contains(sn)]
df_new = pd.DataFrame(condition)
# mode='a' appends on every call; presumably the header row is written each
# time as well, since header=False is not passed — TODO confirm.
df_new.to_csv('try.csv',mode='a', sep=',', index=False)
When I check the try.csv file, it has many more lines than the txt file has.
The df has a lot of lines, more than the txt file.
I want to save the whole row of each search result into a dataframe or file.
I tried to append the search results to a new dataframe or csv.
First, create the list of lines:
# Read the search values, one per line, with surrounding whitespace removed.
# The with-block closes the file as soon as the list has been built.
with open("text.txt", "r") as f:
    l = [line.strip() for line in f]
Then write this apply function, which compares each value against the list and filters out non-matches:
def apply_func(x):
    """Return *x* if its string form is one of the search values, else NaN.

    The search values are read from the module-level list ``l``. Values that
    are not found are replaced by NaN so that they can be dropped afterwards.
    """
    if str(x) in l:
        return x
    # float("nan") is used instead of np.nan, so no numpy import is needed;
    # pandas treats it as a missing value in the same way.
    return float("nan")
And get the output:
# Replace every Serial that is not in the search list by NaN ...
df["Serial"] = df["Serial"].apply(apply_func)
# ... and drop exactly those rows. Restricting dropna to "Serial" keeps
# matching rows that merely have missing values in other columns.
df.dropna(subset=["Serial"], inplace=True)
# NOTE(review): mode="a" appends to an existing new_df.csv, so re-running the
# script adds the rows (and the header line) again — confirm this is intended.
df.to_csv("new_df.csv", mode="a", index=False)
Or try the filter method:
# Read the search values, one per line; the with-block closes the file.
with open("text.txt", "r") as f:
    l = [line.strip() for line in f]

# Index the frame by Serial, keep only the rows whose label is in the list,
# then turn Serial back into a regular column.
df = df.set_index("Serial").filter(items=l, axis=0).reset_index()
df.to_csv("new_df.csv", mode="a", index=False)
I'm trying to save dataframes into tabs/sheets in an existing xlsm file. I was able to save multiple sheets into an xlsx file with engine=xlsxwriter but couldn't find a way to modify it to write to an xlsm file. I found numerous examples using openpyxl as an engine but haven't found a way to predefine the cell formats. Here's the xlsxwriter code:
# Write df to one sheet, then rewrite the 'min' and 'max' cells as real
# numbers whose display format matches the number of decimals in the source
# text.
with pd.ExcelWriter(filename + '.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name=tabname)
    workbook = writer.book
    worksheet = writer.sheets[tabname]
    fmt0 = workbook.add_format({'num_format': '0'})
    fmt1 = workbook.add_format({'num_format': '0.0'})
    fmt2 = workbook.add_format({'num_format': '0.00'})
    # Worksheet column index -> DataFrame column written there. Built once,
    # since it does not change from row to row.
    cols = {3: 'min', 4: 'max'}
    for r, label in enumerate(df.index):
        s = r + 1  # worksheet row; offset by one for the header row
        for c, name in cols.items():
            v = df.at[label, name]
            # NOTE(review): the '.' checks below assume v is a string —
            # confirm the dtype of the 'min'/'max' columns.
            if is_number(v):
                if '.' not in v:
                    worksheet.write_number(s, c, float(v), fmt0)
                else:
                    pl = len(v) - v.index('.') - 1  # number of digits after '.'
                    if pl == 1:
                        worksheet.write_number(s, c, float(v), fmt1)
                    elif pl == 2:
                        worksheet.write_number(s, c, float(v), fmt2)
                    # Any other digit count keeps the cell as written by
                    # to_excel.
Does openpyxl have a way to do the same thing?
I think I can define a function that will do this but if there's a way to modify the existing code, I'd rather do that.
I wrote a query against two data frames and want to save the result to a CSV file.
I tried this code and it didn't work:
# Question code, kept as posted: it tries to run a SQL join over the two
# DataFrames df1 and df2 and save the result to data.csv.
q1 = "SELECT * FROM df1 join df2 on df1.Date = df2.Date"
# NOTE(review): no database connection is supplied (the second argument is
# None), which is presumably why this call fails — pd.read_sql queries a
# database, not in-memory DataFrames.
df = pd.read_sql(q1,None)
df.to_csv('data.csv',index=False)
You can try the following code:
import pandas as pd
# Load the two tables to be joined (replace the placeholders with real paths).
df1 = pd.read_csv("Insert file path")
df2 = pd.read_csv("Insert file path")

# Parse the join key as dates; values that do not match YYYY-MM-DD become NaT.
df1['Date'] = pd.to_datetime(df1['Date'], errors='coerce', format='%Y-%m-%d')
df2['Date'] = pd.to_datetime(df2['Date'], errors='coerce', format='%Y-%m-%d')

# merge() matches null keys with each other, so rows whose date could not be
# parsed are excluded first. Otherwise every NaT row of df1 would be paired
# with every NaT row of df2, which a SQL join on Date would not do.
df = df1.dropna(subset=['Date']).merge(
    df2.dropna(subset=['Date']), how='inner', on='Date'
)
df.to_csv('data.csv', index=False)
This should solve your problem.
I am able to save multiple dataframes to multiple Excel sheets.
# Sheet name -> DataFrame written to that sheet of Cloud.xlsx.
frames = {'Image': df1, 'Objects': df2, 'Text': df3, 'Labels': df4}

# The context manager saves and closes the workbook on exit, replacing the
# explicit writer.save() call, which newer pandas versions no longer provide.
with pd.ExcelWriter('Cloud.xlsx', engine='xlsxwriter') as writer:
    for sheet, frame in frames.items():
        frame.to_excel(writer, sheet_name=sheet)
Now I want to create multiple files based on a column of the dataframes. For example, I want to create 4 Excel files:
df1
Category URL Obj
A example.com Chair
A example2.com table
B example3.com glass
B example4.com tv
All of my dataframes have 7 categories, and I want to create 7 files based on the category.
I think you need:
# Sheet name -> DataFrame; every frame is split by its 'Category' column.
frames = {'Image': df1, 'Objects': df2, 'Text': df3, 'Labels': df4}

for sheet, frame in frames.items():
    for cat, g in frame.groupby('Category'):
        target = f'{cat}.xlsx'
        if os.path.isfile(target):
            # The workbook already exists: append a sheet to it.
            writer_kwargs = {'mode': 'a', 'engine': 'openpyxl'}
        else:
            # First sheet for this category: create the workbook.
            writer_kwargs = {}
        # The context manager saves and closes the file even if to_excel
        # raises, and replaces the writer.save() call that newer pandas
        # versions no longer provide.
        with pd.ExcelWriter(target, **writer_kwargs) as writer:
            g.to_excel(writer, sheet_name=sheet)
I am extracting selected pages from a PDF file and want to assign each dataframe a name based on the page extracted:
# Question code, kept exactly as posted; the two lines after the `for` are
# its loop body (indentation was lost in the paste).
file = "abc"
selected_pages = ['10','11'] #can be any combination eg ['6','14','20]
# NOTE(review): selected_pages is a list, so calling it with () fails.
for i in selected_pages():
# NOTE(review): `df{str(i)}` is not a valid assignment target; this is the
# line behind the SyntaxError described in the text below.
df{str(i)} = read_pdf(path + file + ".pdf",encoding = 'ISO-8859-1', stream = True,area = [100,10,740,950],pages= (i), index = False)
print (df{str(i)} )
The idea, ultimately, as in the above example, is to have the dataframes df10 and df11. I have tried "df" + str(i), "df" & str(i), and df{str(i)}; however, all of them give the error message: SyntaxError: invalid syntax
Any better way of doing it is most welcome. Thanks.
This is where a dictionary would be a much better option.
Also note the error you have at the start of the loop. selected_pages is a list, so you can't do selected_pages().
file = "abc"
selected_pages = ['10', '11']  # can be any combination, e.g. ['6', '14', '20']

# Result of read_pdf for each selected page, keyed by the page number string.
df = {}
for i in selected_pages:
    df[i] = read_pdf(path + file + ".pdf", encoding='ISO-8859-1', stream=True,
                     area=[100, 10, 740, 950], pages=(i), index=False)

# Pick the entry numbered one below the last selected page ('10' for the list
# above). Taken from the list explicitly rather than from the leftover loop
# variable.
# NOTE(review): for other combinations that page may not have been read, and
# the lookup then raises KeyError — confirm which page is wanted.
i = int(selected_pages[-1]) - 1
dfB = df[str(i)]
# Drop the first four rows (positions 0-3), then name the five columns.
dfB.drop(dfB.index[0:4], axis=0, inplace=True)
dfB.columns = ['col1', 'col2', 'col3', 'col4', 'col5']