Pandas writing to excel gives warning when using openpyxl - pandas

I am using the following code:
import os
import numpy as np
import pandas as pd
from openpyxl import load_workbook
def dump2ExcelTest(df, fname, sheetNameIn='Sheet1'):
    """Write ``df`` to sheet ``sheetNameIn`` of the Excel file ``fname``.

    If ``fname`` already exists the writer is opened in append mode and the
    workbook loaded from disk is assigned to it; otherwise a new file is
    written.

    Parameters:
      df          : DataFrame to write.
      fname       : path of the ``.xlsx`` file.
      sheetNameIn : name of the target sheet (default ``'Sheet1'``).
    """
    if os.path.exists(fname):
        # Existing file: append mode, then hand the writer the loaded workbook.
        writer = pd.ExcelWriter(fname, engine='openpyxl', mode='a')
        book = load_workbook(fname)
        writer.book = book
    else:
        # No file yet: plain write mode creates it.
        writer = pd.ExcelWriter(fname, engine='openpyxl', mode='w')
    df.to_excel(writer, sheet_name = sheetNameIn)
    # NOTE(review): save() is followed directly by close(); close() may save
    # a second time -- confirm against the installed pandas version.
    writer.save()
    writer.close()
x1 = np.random.randn(100, 2)
df1 = pd.DataFrame(x1)
dump2ExcelTest(df1, r'Y:\summary\test3.xlsx')
On trying to open test3.xlsx I get the following warning window:
However, if I just do df1.to_excel(r'Y:\summary\test3.xlsx') then test3.xlsx opens fine.
I am not sure what to do about this as there is nothing in the log file.

I believe the way the ExcelWriter opens the file and tracks existing workbook contents is the problem. I'm not sure exactly what is going on under the hood but you have to both
specify the proper startrow for append
copy sheet information to the writer
I've used a contextmanager in Python for a little cleaner syntax.
This is your example but properly writing and appending as you desire.
import os
import numpy as np
import pandas as pd
from openpyxl import load_workbook
def dump2ExcelTest(df, fname, sheetNameIn='Sheet1'):
    """Write ``df`` to ``fname``, appending below existing rows on later calls.

    If ``fname`` does not exist it is created and ``df`` is written (with its
    header) to sheet ``sheetNameIn``.  Otherwise the rows of ``df`` are
    written, without a header, starting at the sheet's current ``max_row``.

    Parameters:
      df          : DataFrame to write.
      fname       : path of the ``.xlsx`` file.
      sheetNameIn : name of the target sheet (default ``'Sheet1'``).

    Raises:
      ValueError : ``fname`` exists but has no sheet named ``sheetNameIn``.

    NOTE(review): assigning ``writer.book`` / ``writer.sheets`` is kept from
    the original answer; confirm the installed pandas still allows it.
    """
    if not os.path.exists(fname):
        # First call: create the file with the requested sheet name and stop.
        # Falling through to the append branch would write ``df`` a second
        # time, and would fail for any sheet name other than the default.
        df.to_excel(fname, sheet_name=sheetNameIn, engine='openpyxl')
        return

    with pd.ExcelWriter(fname, engine='openpyxl', mode='a') as writer:
        writer.book = load_workbook(fname)
        if sheetNameIn not in writer.book.sheetnames:
            raise ValueError(f"sheet {sheetNameIn} not in workbook")
        # grab the proper start row and copy existing sheets to new writer
        start_row = writer.book[sheetNameIn].max_row
        writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
        df.to_excel(writer, sheet_name=sheetNameIn, startrow=start_row,
                    header=False)
x1 = np.random.randn(100, 2)
df1 = pd.DataFrame(x1)
dump2ExcelTest(df1, "test3.xlsx")
More details and similar question here

Related

Export pandas dataframe to xlsx: dealing with the openpyxl issue on python 3.9

Using latest packages version: openpyxl: 3.0.6 | pandas: 1.2.3 |python: 3.9
The function below was working fine before updating the packages above to the latest version reported.
Now it raises the error: "zipfile.BadZipFile: File is not a zip file".
Such a function is really useful, and it would be great to know whether it can be fixed so that it works again.
The function below can be run as it is, just replace "pathExport" to your export directory for testing.
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False,
                       **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Parameters:
      filename : File path or existing ExcelWriter
                 (Example: '/path/to/file.xlsx')
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) calculate the last row
                 in the existing DF and write to the next row...
      truncate_sheet : truncate (remove and recreate) [sheet_name]
                       before writing DataFrame to Excel file
      to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                        [can be dictionary]

    Returns: None

    (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
    """
    from openpyxl import load_workbook
    # ignore [engine] parameter if it was passed
    if 'engine' in to_excel_kwargs:
        to_excel_kwargs.pop('engine')
    # The writer is created (without a mode argument) before the existing
    # file is read by load_workbook() below.
    writer = pd.ExcelWriter(filename, engine='openpyxl')
    # Python 2.x: define [FileNotFoundError] exception if it doesn't exist
    # NOTE(review): the assignment below makes the name local to this
    # function -- confirm which exception class the handler ends up catching.
    try:
        FileNotFoundError
    except NameError:
        FileNotFoundError = IOError
    try:
        # try to open an existing workbook
        writer.book = load_workbook(filename)
        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_name in writer.book.sheetnames:
            startrow = writer.book[sheet_name].max_row
        # truncate sheet
        if truncate_sheet and sheet_name in writer.book.sheetnames:
            # index of [sheet_name] sheet
            idx = writer.book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            writer.book.remove(writer.book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            writer.book.create_sheet(sheet_name, idx)
        # copy existing sheets
        writer.sheets = {ws.title:ws for ws in writer.book.worksheets}
    except FileNotFoundError:
        # file does not exist yet, we will create it
        pass
    if startrow is None:
        startrow = 0
    # write out the new sheet
    df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)
    # save the workbook
    writer.save()
pathExport = r"F:\PYTHON\NB-Suite_python39\MNE\outputData\df.xlsx"
df1 = pd.DataFrame({'numbers': [1, 2, 3],
'colors': ['red', 'white', 'blue'],
'colorsTwo': ['yellow', 'white', 'blue']
})
append_df_to_excel(pathExport, df1, sheet_name="DF1", index=False, startcol=0, startrow=0)
OK, I was able to replicate the problem. It is pandas related. Everything works just fine up to pandas 1.1.5
In pandas 1.2.0 they did some changes
At the time when you instantiate pd.ExcelWriter with
writer = pd.ExcelWriter(filename, engine='openpyxl')
it creates an empty (0-byte) file that overwrites the existing one, so you get an error when you then try to load it. It is not openpyxl related, because the latest version of openpyxl works fine with pandas 1.1.5.
The solution - specify mode='a', change the above line to
writer = pd.ExcelWriter(filename, engine='openpyxl', mode='a')
Alternatively - look at this or this solution where it loads the file before instantiating the pd.ExcelWriter.
EDIT: I've been advised in the comments that with mode='a' it raises FileNotFoundError if the file does not exist. Although it is unexpected that it does not create the file in this case, the solution is to move the creation of the writer inside the existing try block and to create a writer with mode 'w' in the except part:
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False,
                       **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Parameters:
      filename : File path (Example: '/path/to/file.xlsx')
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) the data is written below the
                 last used row of an existing [sheet_name]; a new or
                 truncated sheet is written from row 0.
      truncate_sheet : truncate (remove and recreate) [sheet_name]
                       before writing DataFrame to Excel file
      to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                        ([engine] is ignored)

    Returns: None

    NOTE(review): assigning ``writer.book`` / ``writer.sheets`` is kept from
    the original; confirm the installed pandas still allows it.

    (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
    """
    from openpyxl import load_workbook

    # The engine is always openpyxl here, so drop any caller-supplied value.
    to_excel_kwargs.pop('engine', None)

    try:
        # Append mode: the existing file must not be overwritten before it
        # has been read.
        writer = pd.ExcelWriter(filename, engine='openpyxl', mode='a')
    except FileNotFoundError:
        # file does not exist yet, we will create it
        writer = pd.ExcelWriter(filename, engine='openpyxl')
        appending = False
    else:
        appending = True

    # Leaving the ``with`` block closes the writer, which saves the workbook
    # and releases the file even if writing fails.
    with writer:
        if appending:
            writer.book = load_workbook(filename)
            sheet_exists = sheet_name in writer.book.sheetnames
            if truncate_sheet and sheet_exists:
                # Replace the sheet by an empty one at the same position.
                idx = writer.book.sheetnames.index(sheet_name)
                writer.book.remove(writer.book.worksheets[idx])
                writer.book.create_sheet(sheet_name, idx)
            elif startrow is None and sheet_exists:
                # Continue below the existing data; a truncated sheet is
                # empty, so it must not inherit the old row count.
                startrow = writer.book[sheet_name].max_row
            # copy existing sheets
            writer.sheets = {ws.title: ws for ws in writer.book.worksheets}

        if startrow is None:
            startrow = 0

        # write out the new sheet
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow,
                    **to_excel_kwargs)
The solution is the following:
import pandas as pd
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None, startcol=None,
                       truncate_sheet=False, resizeColumns=True, na_rep = 'NA', **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Parameters:
      filename : File path (Example: '/path/to/file.xlsx')
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) the data is written below the
                 last used row of an existing [sheet_name]; a sheet that was
                 just created or truncated is written from row 0.
      startcol : upper left cell column to dump data frame (default: 0)
      truncate_sheet : truncate (remove and recreate) [sheet_name]
                       before writing DataFrame to Excel file
      resizeColumns: default = True. Resize every column of [sheet_name]
                     to the width of its longest cell content.
      na_rep: default = 'NA'. If, instead of NaN, you want blank cells,
              pass na_rep=''
      to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                        ([engine] is ignored)

    Returns: None

    NOTE(review): assigning ``writer.book`` / ``writer.sheets`` is kept from
    the original; confirm the installed pandas still allows it.

    *******************
    CONTRIBUTION:
    Current helper function generated by [Baggio]: https://stackoverflow.com/users/14302009/baggio?tab=profile
    Contributions to the current helper function: https://stackoverflow.com/users/4046632/buran?tab=profile
    Original helper function: (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)

    Features of the new helper function:
    1) Works with python 3.9 and the pandas / openpyxl versions named in the question
       ---> Fixed the error: "zipfile.BadZipFile: File is not a zip file".
    2) Resizes all columns based on cell content width (SEE "resizeColumns")
    3) NaN can be shown as a marker or as empty cells (SEE "na_rep")
    4) Added "startcol": start writing from a specific column, otherwise from col = 0
    *******************
    """
    from openpyxl import Workbook
    from openpyxl import load_workbook
    from openpyxl.utils import get_column_letter

    # The engine is always openpyxl here, so drop any caller-supplied value.
    to_excel_kwargs.pop('engine', None)

    # True once we know [sheet_name] holds no data (new file or truncation).
    sheet_is_empty = False

    try:
        # Probe for the file; the context manager closes the handle again.
        with open(filename, 'rb'):
            pass
    except FileNotFoundError:
        # Append mode needs an existing workbook, so create one whose only
        # sheet is the target sheet.
        wb = Workbook()
        wb.active.title = sheet_name
        wb.save(filename)
        sheet_is_empty = True

    # Leaving the ``with`` block closes the writer, which saves the workbook
    # and releases the file even if writing fails.
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
        writer.book = load_workbook(filename)
        sheet_exists = sheet_name in writer.book.sheetnames

        if truncate_sheet and sheet_exists:
            # Replace the sheet by an empty one at the same position.
            idx = writer.book.sheetnames.index(sheet_name)
            writer.book.remove(writer.book.worksheets[idx])
            writer.book.create_sheet(sheet_name, idx)
            sheet_is_empty = True

        if startrow is None:
            if sheet_exists and not sheet_is_empty:
                # Continue below the existing data.
                startrow = writer.book[sheet_name].max_row
            else:
                # Nothing to skip over: start at the top instead of leaving
                # a blank first row.
                startrow = 0
        if startcol is None:
            startcol = 0

        # copy existing sheets
        writer.sheets = {ws.title: ws for ws in writer.book.worksheets}

        # write out the new sheet
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow,
                    startcol=startcol, na_rep=na_rep, **to_excel_kwargs)

        if resizeColumns:
            ws = writer.book[sheet_name]
            # Column indices are 1-based and max_column is inclusive.
            for col_idx in range(1, ws.max_column + 1):
                letter = get_column_letter(col_idx)
                # Empty cells must not count as the 4 characters of "None".
                longest = max(
                    (len(str(cell.value)) for cell in ws[letter]
                     if cell.value is not None),
                    default=0,
                )
                ws.column_dimensions[letter].width = longest + 2
Example Usage:
# Create a sample dataframe
df = pd.DataFrame({'numbers': [1, 2, 3],
'colors': ['red', 'white', 'blue'],
'colorsTwo': ['yellow', 'white', 'blue'],
'NaNcheck': [float('NaN'), 1, float('NaN')],
})
# EDIT YOUR PATH FOR THE EXPORT
filename = r"C:\DataScience\df.xlsx"
# RUN ONE BY ONE IN ROW THE FOLLOWING LINES, TO SEE THE DIFFERENT UPDATES TO THE EXCEL FILE
append_df_to_excel(filename, df, index=False, startrow=0) # Basic Export of df in default sheet (Sheet1)
append_df_to_excel(filename, df, sheet_name="Cool", index=False, startrow=0) # Append the sheet "Cool" where "df" is written
append_df_to_excel(filename, df, sheet_name="Cool", index=False) # Append another "df" to the sheet "Cool", just below the other "df" instance
append_df_to_excel(filename, df, sheet_name="Cool", index=False, startrow=0, startcol=5) # Append another "df" to the sheet "Cool" starting from col 5
append_df_to_excel(filename, df, index=False, truncate_sheet=True, startrow=10, na_rep = '') # Override (truncate) the "Sheet1", writing the df from row 10, and showing blank cells instead of NaN

How to add data to an existing excel file in python without overwriting the data

I have a master excel sheet where the data looks like this [1]: https://i.stack.imgur.com/IS4cw.png
I have a script which imports the csv files and combines them and save it to the master excel sheet.
import pandas as pd
from openpyxl import load_workbook
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
root.call('wm', 'attributes', '.', '-topmost', True)
files = filedialog.askopenfilename(multiple=True)
%gui tk
var = root.tk.splitlist(files)
filePaths = []
for f in var:
df = pd.read_csv(f,skiprows=8, index_col=None, header='infer',parse_dates=True, squeeze=True, encoding='ISO-8859–1',names=['Date', 'Time', 'Temperature', 'Humidty'])
filePaths.append(df)
df = pd.concat(filePaths, axis=0, join='outer', ignore_index=True, sort=True)
book = load_workbook(r'C:\Users\Administrator\Documents\Hebin\Scripts\Temperature Distribution chart/july/12.xlsx')
writer = pd.ExcelWriter(r'C:\Users\Administrator\Documents\Hebin\Scripts\Temperature Distribution chart/july/12.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
df.to_excel(writer, "Sheet1", columns=['Date', 'Time','Temperature', 'Humidty'],index=False)
writer.save()
The problem is that the newly imported data is saved from row 1 instead of starting at the ending row of the previously saved data. How can I save the data in an orderly manner everytime without entering the row number?
The ExcelWriter can have its mode set to either write ('w') or append ('a'). The default is write.
writer = pd.ExcelWriter(r'C:\Users\Administrator\Documents\Hebin\Scripts\Temperature Distribution chart/july/12.xlsx', engine='openpyxl', mode='a')

How to import an Excel workbook into a Jupyter notebook

How do I import an excel workbook into jupyter notebook. I am using tensorflow.
xl.file =pd.excelfile('c:\users\owner\downloads\book1.xlsx')
book1 = pd.excelfile('book1.xlsx')
It looks like you are confusing the filename with the pandas method to read a file.
import pandas as pd
# Raw string keeps the Windows backslashes literal: in a normal string "\u"
# starts a unicode escape (a syntax error here) and "\b" is a backspace.
filename = r'c:\users\owner\downloads\book1.xlsx'
dataframe = pd.read_excel(filename)

Python refresh data in excel with external connection through add in doesnt work

I am using the following code to refresh data in excel file which uses external add in for receiving data.
import sys, os, pandas as pd, numpy as np, time, win32com.client
import win32com.client as w3c
if __name__ == '__main__':
your_file_path = r'C:\Book11.xlsx'
for ii in np.arange(1, 10):
xlapp = w3c.gencache.EnsureDispatch('Excel.Application')
xlapp.Visible = 0
xlwb = xlapp.Workbooks.Open(your_file_path, False, True, None)
books = w3c.Dispatch(xlwb)
xlwb.RefreshAll() # Runs with no errors, but doesn't refresh
xlapp.DisplayAlerts = False
xlwb.Save()
xlapp.Quit()
df = pd.read_excel(your_file_path) # updates should be applied
print(df)
time.sleep(20)
# Another version of code that I tried is following:
# xlapp = win32com.client.DispatchEx("Excel.Application")
# xlapp.Visible = True
# wb = xlapp.Workbooks.Open(your_file_path)
# wb.RefreshAll()
# xlapp.CalculateUntilAsyncQueriesDone()
# xlapp.DisplayAlerts = False
# wb.Save()
# xlapp.Quit()
However, the file doesn't refresh. In fact it looks like the following:
On the other hand if I just open the file on desktop using mouse clicks, I see the data as expected.
Are you running this as a macro?
Is the background-refresh property set to false for all connections?
Things to try:
a)
Calculate
ActiveWorkbook.RefreshAll instead of wbRefresh.RefreshAll
b)
Unchecking "enable background refresh" (uncheck to disable the background refresh)

How to read the text in textbox by using openpyxl

all
I can read the text in cells, but I can't read the text inside a textbox...
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re,os,sys,time
import openpyxl
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.drawing import *
reload(sys)
sys.setdefaultencoding('utf8')
wb = load_workbook(u'2.xlsx')
sheetnames = wb.get_sheet_names()
for i in range(0,len(sheetnames)):
sheet = wb.get_sheet_by_name(sheetnames[i])
for row in sheet.rows:
for cell in row:
if cell.value:
print cell.value
I try to unzip the xlsx file and find the content of textbox in xl\drawings\drawing[0-9].xml files..
Can openpyxl.drawing.text read the textbox? I have no idea...
How can i do this..? thx...
I have to unzip the xlsx file......
zipFile = zipfile.ZipFile(os.path.join(os.getcwd(), u''+str(flist)+''))
for file in zipFile.namelist():
zipFile.extract(file, r'tmp')
zipFile.close()
num = 0
if os.path.exists(r'tmp/xl/drawings'):
xmldir = os.listdir(r'tmp/xl/drawings')
for xmlfile in xmldir:
xml = os.path.basename(xmlfile)
if os.path.splitext(xml)[1] == '.xml':
a = open(u'tmp/xl/drawings/'+str(xml)+'').read()
b = a.replace('\n','').replace(' ','')
c = re.findall(r'<a:p>(.*?)</a:p>',b)
for i in c:
text = "".join(re.findall(r'(?<=<a:t>).*?(?=</a:t>)',u''+str(i)+'',re.S)).replace(' ','').replace(' ','').replace('\\u6d3b\\u52a8','').replace('<','<').replace('>','>').replace('&','&')