For days I have been struggling with this problem: I have a large script in which a function exports tables to an Excel workbook, each table into a different worksheet. Additionally, I want to format the worksheets using the xlsxwriter engine. I use the with pd.ExcelWriter() as writer context manager.
This works fine for an Excel workbook with a single sheet: the table is exported with to_excel and immediately afterwards I format it with an ad hoc function I created.
Code structure:
Global Excel_formatting function that applies the formatting (input: table, sheet name, text strings)
A function_tables function in the script (input: dataframe, file path) that creates subtables from the input dataframe and uses a pd.ExcelWriter instance to:
-- export the Excel worksheets
-- call the Excel_formatting function to format the worksheets
At a high level, the script calls function_tables
The complete code is below:
# Global function to format the output tables
def Excel_formatting(table_input, sheet_name_input, title_in, remark_in, start_row_input):
# Assign WorkBook and worksheet
workbook = writer.book
worksheet = writer.sheets[sheet_name_input]
start_column = 0
# Title and remark
worksheet.write(0, start_column, title_in,
workbook.add_format({'bold': True,
'color': '#8B0000',
'size': 18,
'align':'left'}))
worksheet.write(1, start_column+1, remark_in,
workbook.add_format({'italic': True,
'size': 11,
'align':'left'}))
# Format header (on top of existing header)
header_format = workbook.add_format({'bold': True,
'text_wrap': False,
'fg_color': '#FF8B8B',
'border': 1,
'align':'center'})
for col_num, value in enumerate(table_input.columns.values):
worksheet.write(start_row_input, col_num, value, header_format)
# Freeze panes / Can also be done with to_excel
worksheet.freeze_panes(start_row_input+1, 0)
# Set column width
end_column = len(table_input.columns)
worksheet.autofit()
# Add autofilter to header
worksheet.autofilter(start_row_input, 0, start_row_input, end_column-1)
# Add logo (if present, to avoid script error)
figure_path = 'Logo.JPG'
if (os.path.exists(figure_path) == True):
worksheet.insert_image(0, start_column+5, figure_path, {'x_scale': 0.1, 'y_scale': 0.08, 'decorative': True})
# End of function
return workbook.close()
def function_tables(x, filename):
# Here the function creates subtables from input dataframe
df = x
Table_1 = df.groupby(['Feature 1'])['Deviation'].sum().reset_index()
Table_2 = df.groupby(['Feature 2'])['Deviation'].sum().reset_index()
# ...
Table_N = df.groupby(['Feature N'])['Deviation'].sum().reset_index()
# Export tables adding new sheets to the same Excel workbook
with pd.ExcelWriter(filename, engine='xlsxwriter', mode='w') as writer:
start_row = 2
Table_1.to_excel(writer, index=True, header=True, sheet_name='Overview Feat.1', startrow=start_row)
Table_2.to_excel(writer, index=True, header=True, sheet_name='Overview Feat.2', startrow=start_row)
# ...
Table_N.to_excel(writer, index=True, header=True, sheet_name='Overview Feat.N', startrow=start_row)
# Formatting the worksheets calling the global function
title_input_1 = 'Title for overview table 1'
remark_input_1 = 'Remark Table 1'
Excel_formatting(Table_2, 'Overview Feat.2', title_input_1, remark_input_1, start_row)
title_input_2 = 'Title for overview table 2'
remark_input_2 = 'Remark Table 2'
# ...
Excel_formatting(Table_2, 'Overview Feat.N', title_input_2, remark_input_2, start_row)
title_input_N = 'Title for overview table N'
remark_input_N = 'Remark Table N'
Excel_formatting(Table_1, 'Overview Feat.N', title_input_N, remark_input_N, start_row)
# Call section of script
function_tables(df_input, Path_filename)
I also tried openpyxl, a loop through the tables using a dictionary for the input, and moving the formatting function inside the writer block instead of keeping it global, but everything failed, always giving me the same error:
worksheet = writer.sheets[sheet_name_input]
KeyError: 'Overview Feat.1'
It looks like it cannot find the sheet name. Any help? A poor man's alternative would be to create N Excel workbooks and then merge them all, but I would prefer not to; there must be a more Pythonic way to do this, right?
A million thanks!
There are a few issues in the code: the writer object needs to be passed to the Excel_formatting() function, the writer shouldn't be closed in that function, and there are some typos in the titles, captions and variable names.
Here is a working example with those issues fixed. I've added sample dataframes; you can replace those with your groupby() code.
import pandas as pd
import os
# Global function to format the output tables
def Excel_formatting(table_input, writer, sheet_name_input, title_in, remark_in, start_row_input):
# Assign WorkBook and worksheet
workbook = writer.book
worksheet = writer.sheets[sheet_name_input]
start_column = 0
# Title and remark
worksheet.write(0, start_column, title_in,
workbook.add_format({'bold': True,
'color': '#8B0000',
'size': 18,
'align': 'left'}))
worksheet.write(1, start_column + 1, remark_in,
workbook.add_format({'italic': True,
'size': 11,
'align': 'left'}))
# Format header (on top of existing header)
header_format = workbook.add_format({'bold': True,
'text_wrap': False,
'fg_color': '#FF8B8B',
'border': 1,
'align': 'center'})
for col_num, value in enumerate(table_input.columns.values):
worksheet.write(start_row_input, col_num, value, header_format)
# Freeze panes / Can also be done with to_excel
worksheet.freeze_panes(start_row_input + 1, 0)
# Set column width
end_column = len(table_input.columns)
worksheet.autofit()
# Add autofilter to header
worksheet.autofilter(start_row_input, 0, start_row_input, end_column - 1)
# Add logo (if present, to avoid script error)
figure_path = 'Logo.JPG'
if os.path.exists(figure_path):
worksheet.insert_image(0, start_column + 5, figure_path, {'x_scale': 0.1, 'y_scale': 0.08, 'decorative': True})
def function_tables(x, filename):
Table_1 = pd.DataFrame({'Data': [11, 12, 13, 14]})
Table_2 = pd.DataFrame({'Data': [11, 12, 13, 14]})
# ...
Table_N = pd.DataFrame({'Data': [11, 12, 13, 14]})
# Export tables adding new sheets to the same Excel workbook
with pd.ExcelWriter(filename, engine='xlsxwriter', mode='w') as writer:
start_row = 2
Table_1.to_excel(writer, index=True, header=True, sheet_name='Overview Feat.1', startrow=start_row)
Table_2.to_excel(writer, index=True, header=True, sheet_name='Overview Feat.2', startrow=start_row)
# ...
Table_N.to_excel(writer, index=True, header=True, sheet_name='Overview Feat.N', startrow=start_row)
# Formatting the worksheets calling the global function
title_input_1 = 'Title for overview table 1'
remark_input_1 = 'Remark Table 1'
Excel_formatting(Table_1, writer, 'Overview Feat.1', title_input_1, remark_input_1, start_row)
title_input_2 = 'Title for overview table 2'
remark_input_2 = 'Remark Table 2'
Excel_formatting(Table_2, writer, 'Overview Feat.2', title_input_2, remark_input_2, start_row)
title_input_N = 'Title for overview table N'
remark_input_N = 'Remark Table N'
Excel_formatting(Table_N, writer, 'Overview Feat.N', title_input_N, remark_input_N, start_row)
# Call section of script
function_tables(None, 'test.xlsx')
However, to make it more generic, it would be best to handle the main function with a loop, like this:
def function_tables(x, filename):
writer = pd.ExcelWriter(filename, engine='xlsxwriter')
Table_1 = pd.DataFrame({'Data': [11, 12, 13, 14]})
Table_2 = pd.DataFrame({'Data': [11, 12, 13, 14]})
# ...
Table_N = pd.DataFrame({'Data': [11, 12, 13, 14]})
# In a real case you would probably append() these in a loop.
dfs = [Table_1, Table_2, Table_N]
for i, df in enumerate(dfs, 1):
start_row = 2
df.to_excel(writer, index=True, header=True, sheet_name=f'Overview Feat.{i}', startrow=start_row)
# Formatting the worksheets calling the global function
title_input = f'Title for overview table {i}'
remark_input = f'Remark Table {i}'
Excel_formatting(df, writer, f'Overview Feat.{i}', title_input, remark_input, start_row)
writer.close()
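If you prefer the dictionary-based input you mentioned trying, the same pattern works; a minimal sketch (function_tables_dict and the dict layout are placeholders I made up):
def function_tables_dict(tables, filename):
    # tables: dict mapping sheet name -> (dataframe, title, remark)
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        start_row = 2
        for sheet_name, (df, title, remark) in tables.items():
            df.to_excel(writer, index=True, header=True, sheet_name=sheet_name, startrow=start_row)
            # Pass the writer down so Excel_formatting() can find the sheet.
            Excel_formatting(df, writer, sheet_name, title, remark, start_row)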
Related
I am currently working on longitudinal data and trying to reshape it from the wide format to the long format. The naming pattern of the time-varying variables is r*variable (for example, height data collected in wave 1 is r1height). The identifiers are hhid (household id) and pn (person id). The data itself is unbalanced: some variables are observed from the first wave to the last wave, but others are only observed from the middle of the study (i.e., wave 3 to 5).
I have already reshaped the data using merged.stack from the splitstackshape package (see the code below).
df <- data.frame(hhid = c("10001", "10002", "10003", "10004"),
pn = c("001", "001", "001", "002"),
r1weight = c(56, 76, 87, 64),
r2weight = c(57, 75, 88, 66),
r3weight = c(56, 76, 87, 65),
r4weight = c(78,99,23,32),
r5weight = c(55, 77, 84, 65),
r1height = c(151, 163, 173, 153),
r2height = c(154, 164, NA, 154),
r3height = c(NA, 165, NA, 152),
r4height = c(153, 162, 172, 154),
r5height = c(152,161,171,154),
r3bmi = c(22,23,24,25),
r4bmi = c(23,24,20,19),
r5bmi = c(21,14,22,19))
library(splitstackshape)
# Merge stack (this is what I want)
long1 <- merged.stack(df, id.vars = c("hhid", "pn"),
var.stubs = c("weight", "height", "bmi"),
sep = "var.stubs", atStart = F, keep.all = FALSE)
Now I want to know if I can use the reshape function to get the same results. I have tried the reshape approach but failed; for example, the reshape call shown in the code below returns bizarre longitudinal data. I suspect the "sep" argument is causing the problem, but I don't know how to specify a pattern for my time-varying variables.
# Reshape (Wrong results)
library(reshape)
namelist <- names(df)
namelist <- namelist[namelist %in% c("hhid", "pn") == FALSE]
long2 <- reshape(data=df,
varying = namelist,
sep = "",
direction = "long",
idvar = c("hhid", "pn"))
Could anyone let me know how to address this problem?
Thanks
Situation
I'm trying to upload a pandas dataframe of Twitter API data to a table in BigQuery.
Here's my dataframe prep code from my Google Colab notebook:
!pip install --upgrade google-cloud-language
!pip install pandas-gbq -U
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))
import os
# Imports Credential File:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "pp-004a-d61bf3451d85.json"
print("Service Account Key: {}".format(os.environ["GOOGLE_APPLICATION_CREDENTIALS"]))
!pip install --upgrade tweepy
# VARIABLES
interval = "15"
start = '2022-04-07'
end = '2022-04-12'
# Tweepy
searchQ = '(max muncy) -is:retweet lang:en'
intval_tw = "{}T".format(interval)
start_tw = '{}T00:00:00Z'.format(start)
end_tw = '{}T23:59:59Z'.format(end)
# index = pd.date_range('1/1/2000', periods=9, freq='T')
# D = calendar day frequency, H = hourly frequency, T, min = minutely frequency
# Library installs
import tweepy
# from twitter_authentication import bearer_token
import time
import pandas as pd
import requests
import json
import numpy as np
bearer_token = "BEARER_TOKEN"
client = tweepy.Client(bearer_token, wait_on_rate_limit=True)
# NEED TO ENSURE HAVE ALL PARAMETERS
gathered_tweets = []
for response in tweepy.Paginator(client.search_recent_tweets,
query = searchQ,
user_fields = ['name', 'description', 'username', 'profile_image_url', 'url', 'pinned_tweet_id', 'verified', 'created_at', 'location', 'public_metrics', 'entities'],
tweet_fields = ['public_metrics', 'created_at','lang', 'attachments', 'context_annotations', 'conversation_id', 'entities', 'geo', 'in_reply_to_user_id', 'possibly_sensitive', 'referenced_tweets', 'reply_settings', 'source'],
media_fields = ['duration_ms', 'media_key', 'preview_image_url', 'type', 'url', 'height', 'width', 'public_metrics'],
expansions = ['author_id', 'attachments.media_keys', 'entities.mentions.username', 'geo.place_id', 'in_reply_to_user_id', 'referenced_tweets.id', 'referenced_tweets.id.author_id'],
start_time = start_tw,
end_time = end_tw,
max_results=100):
time.sleep(1)
gathered_tweets.append(response)
result = []
user_dict = {}
# Loop through each response object
for response in gathered_tweets:
# Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
for user in response.includes['users']:
user_dict[user.id] = {'username': user.username,
'created_at': user.created_at,
'location': user.location,
'verified': user.verified,
'name': user.name,
'description': user.description,
'url': user.url,
'profile_image_url': user.profile_image_url,
'pinned_tweet': user.pinned_tweet_id,
'entities': user.entities,
'followers': user.public_metrics['followers_count'],
'total_tweets': user.public_metrics['tweet_count'],
'following': user.public_metrics['following_count'],
'listed': user.public_metrics['listed_count'],
'tweets': user.public_metrics['tweet_count']
}
for tweet in response.data:
# For each tweet, find the author's information
author_info = user_dict[tweet.author_id]
# Put all of the information we want to keep in a single dictionary for each tweet
result.append({'author_id': tweet.author_id,
'username': author_info['username'],
'name': author_info['name'],
'author_followers': author_info['followers'],
'author_following': author_info['following'],
'author_tweets': author_info['tweets'],
'author_description': author_info['description'],
'author_url': author_info['url'],
'profile_image_url': author_info['profile_image_url'],
#'pinned_tweet': author_info['pinned_tweet_id'], https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/api-reference/get-tweets
#'total_tweets': author_info['tweet_count'],
#'listed_count': author_info['listed_count'],
'entities': author_info['entities'],
'verified': author_info['verified'],
'account_created_at': author_info['created_at'],
'text': tweet.text,
'created_at': tweet.created_at,
'lang': tweet.lang,
'tweet_id': tweet.id,
'retweets': tweet.public_metrics['retweet_count'],
'replies': tweet.public_metrics['reply_count'],
'likes': tweet.public_metrics['like_count'],
'quotes': tweet.public_metrics['quote_count'],
'replied': tweet.in_reply_to_user_id,
'sensitive': tweet.possibly_sensitive,
'referenced_tweets': tweet.referenced_tweets,
'reply_settings': tweet.reply_settings,
'source': tweet.source
#'video_views': tweet.public_metrics['view_count']
})
dfTW00 = pd.DataFrame(result)
dfTW01 = dfTW00
# Create 'engagement' metric
dfTW01['engagement'] = dfTW01['retweets'] + dfTW01['replies'] + dfTW01['likes'] + dfTW01['quotes']
# Add 'tweets' column with value of 1
dfTW01['tweets'] = 1
# Engagement Rate calc
dfTW01['eng_rate'] = (dfTW01['tweets'] / dfTW01['engagement'])
# Add twitter link
dfTW01['base_url'] = 'https://twitter.com/twitter/status/'
# base_url = 'https://twitter.com/twitter/status/'
dfTW01['tweet_link'] = dfTW01['base_url'] + dfTW01['tweet_id'].astype(str)
# Imports the Google Cloud client library
from google.cloud import language_v1
# Instantiates a client
client = language_v1.LanguageServiceClient()
def get_sentiment(text):
# The text to analyze
document = language_v1.Document(
content=text,
type_=language_v1.types.Document.Type.PLAIN_TEXT
)
# Detects the sentiment of the text
sentiment = client.analyze_sentiment(
request={"document": document}
).document_sentiment
return sentiment
dfTW01["sentiment"] = dfTW01["text"].apply(get_sentiment)
dfTW02 = dfTW01['sentiment'].astype(str).str.split(expand=True)
dfTW02
dfTW03 = pd.merge(dfTW01, dfTW02, left_index=True, right_index=True)
dfTW03.rename(columns = {1:'magnitude', 3:'score'}, inplace=True)
cols = ['magnitude', 'score']
dfTW03[cols] = dfTW03[cols].apply(pd.to_numeric, errors='coerce', axis=1)
def return_status(x):
if x >= .5:
return 'Positive'
elif x <= -.5:
return 'Negative'
return 'Neutral'
dfTW03['sentiment2'] = dfTW03['score'].apply(return_status)
What I've tried
This is what I've used for the upload (I've confirmed the project, dataset and table info are correct):
df.to_gbq('004a01.004a-TW-01',
'pp-004a',
chunksize=None,
if_exists='append'
)
Results
However, that method is returning this error message:
TypeError: '<' not supported between instances of 'int' and 'str'
Assessment
I've found several posts on SO addressing this, but I'm unable to relate them to my situation. (I thought various datatypes could be uploaded to a BigQuery table.)
Primarily, I'm not clear what the error message means by '<' not supported between instances of 'int' and 'str'.
Any input on what that means would be greatly appreciated.
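For reference, this seems to be the error Python raises when it tries to order an int against a str (e.g. while sorting a mixed-type column), though I don't see where such a comparison would happen with my data:
sorted(['a', 3])
# TypeError: '<' not supported between instances of 'int' and 'str'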
Below are the pandas dtypes in my dataframe if helpful.
Dataframe dtypes
Pandas dataframe dtypes:
author_id int64
username object
name object
author_followers int64
author_following int64
author_tweets int64
author_description object
author_url object
profile_image_url object
entities object
verified bool
account_created_at datetime64[ns, UTC]
text object
created_at datetime64[ns, UTC]
lang object
tweet_id int64
retweets int64
replies int64
likes int64
quotes int64
replied float64
sensitive bool
referenced_tweets object
reply_settings object
source object
engagement int64
tweets int64
eng_rate float64
base_url object
tweet_link object
sentiment object
0 object
magnitude float64
2 object
score float64
sentiment_rating float64
sentiment2 object
dtype: object
Instead of the to_gbq() function from pandas, you may try the load_table_from_dataframe() function from the BigQuery library to load your dataframe to BigQuery.
Please see the sample Python code below using load_table_from_dataframe():
import datetime
from google.cloud import bigquery
import pandas
import pytz
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set table_id to the ID of the table to create.
table_id = "my-project.my-dataset.my-table"
records = [
{
"title": "The Meaning of Life",
"release_year": 1983,
"length_minutes": 112.5,
"release_date": pytz.timezone("Europe/Paris")
.localize(datetime.datetime(1983, 5, 9, 13, 0, 0))
.astimezone(pytz.utc),
# Assume UTC timezone when a datetime object contains no timezone.
"dvd_release": datetime.datetime(2002, 1, 22, 7, 0, 0),
},
{
"title": "Monty Python and the Holy Grail",
"release_year": 1975,
"length_minutes": 91.5,
"release_date": pytz.timezone("Europe/London")
.localize(datetime.datetime(1975, 4, 9, 23, 59, 2))
.astimezone(pytz.utc),
"dvd_release": datetime.datetime(2002, 7, 16, 9, 0, 0),
},
{
"title": "Life of Brian",
"release_year": 1979,
"length_minutes": 94.25,
"release_date": pytz.timezone("America/New_York")
.localize(datetime.datetime(1979, 8, 17, 23, 59, 5))
.astimezone(pytz.utc),
"dvd_release": datetime.datetime(2008, 1, 14, 8, 0, 0),
},
{
"title": "And Now for Something Completely Different",
"release_year": 1971,
"length_minutes": 88.0,
"release_date": pytz.timezone("Europe/London")
.localize(datetime.datetime(1971, 9, 28, 23, 59, 7))
.astimezone(pytz.utc),
"dvd_release": datetime.datetime(2003, 10, 22, 10, 0, 0),
},
]
dataframe = pandas.DataFrame(
records,
# In the loaded table, the column order reflects the order of the
# columns in the DataFrame.
columns=[
"title",
"release_year",
"length_minutes",
"release_date",
"dvd_release",
],
# Optionally, set a named index, which can also be written to the
# BigQuery table.
index=pandas.Index(
["Q24980", "Q25043", "Q24953", "Q16403"], name="wikidata_id"
),
)
job_config = bigquery.LoadJobConfig(
# Specify a (partial) schema. All columns are always written to the
# table. The schema is used to assist in data type definitions.
schema=[
# Specify the type of columns whose type cannot be auto-detected. For
# example the "title" column uses pandas dtype "object", so its
# data type is ambiguous.
bigquery.SchemaField("title", bigquery.enums.SqlTypeNames.STRING),
# Indexes are written if included in the schema by name.
bigquery.SchemaField("wikidata_id", bigquery.enums.SqlTypeNames.STRING),
],
# Optionally, set the write disposition. BigQuery appends loaded rows
# to an existing table by default, but with WRITE_TRUNCATE write
# disposition it replaces the table with the loaded data.
write_disposition="WRITE_TRUNCATE"
)
job = client.load_table_from_dataframe(
dataframe, table_id, job_config=job_config
) # Make an API request.
job.result() # Wait for the job to complete.
table = client.get_table(table_id) # Make an API request.
print(
"Loaded {} rows and {} columns to {}".format(
table.num_rows, len(table.schema), table_id
)
)
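Applied to your dataframe, a rough sketch might look like the following (the table ID just mirrors the one from your to_gbq() call, and the list of columns to stringify is only a guess at the ones holding nested Twitter objects):
from google.cloud import bigquery

client = bigquery.Client()
table_id = "pp-004a.004a01.004a-TW-01"

# Columns holding nested Twitter objects (dicts/lists) have no obvious
# BigQuery type, so serialize them to strings before loading.
for col in ["entities", "referenced_tweets", "sentiment"]:
    dfTW03[col] = dfTW03[col].astype(str)

job = client.load_table_from_dataframe(dfTW03, table_id)  # Make an API request.
job.result()  # Wait for the load job to finish.
print(client.get_table(table_id).num_rows, "rows loaded")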
I'm trying to loop through all the files in a directory and add "indicator" data to them. I had the code working where I could select one file and do this, but now I am trying to make it work on all the files. The problem is that when I make the loop it says:
ValueError: Invalid file path or buffer object type: <class 'list'>
The goal is for each iteration of the loop to read another file from the list, make the changes, and save the file back to the folder with the changes.
Here is the complete code without imports. I copied one of the file paths from the list and put it in a comment at the bottom.
### open dialog to select file
#file_path = filedialog.askopenfilename()
###create list from dir
listdrs = os.listdir('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/')
###append full path to list
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/'
listdrs_path = [ string + x for x in listdrs]
print (listdrs_path)
###start loop, for each "file" in listdrs run the 2 functions below and overwrite saved csv.
for file in listdrs_path:
file_path = listdrs_path
data = pd.read_csv(file_path, index_col=0)
########################################
####function 1
def get_price_hist(ticker):
# Put stock price data in dataframe
data = pd.read_csv(file_path)
#listdr = os.listdir('Users\17409\AppData\Local\Programs\Python\Python38\Indicators\Sentdex Tutorial\stock_dfs')
print(listdr)
# Convert date to timestamp and make index
data.index = data["Date"].apply(lambda x: pd.Timestamp(x))
data.drop("Date", axis=1, inplace=True)
return data
df = data
##print(data)
######Indicator data#####################
def get_indicators(data):
# Get MACD
data["macd"], data["macd_signal"], data["macd_hist"] = talib.MACD(data['Close'])
# Get MA10 and MA30
data["ma10"] = talib.MA(data["Close"], timeperiod=10)
data["ma30"] = talib.MA(data["Close"], timeperiod=30)
# Get RSI
data["rsi"] = talib.RSI(data["Close"])
return data
#####end functions#######
data2 = get_indicators(data)
print(data2)
data2.to_csv(file_path)
###################################################
#here is an example of what path from list looks like
#'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/A.csv'
The problem is in lines 13 and 14. Your filename is in the variable file, but you are using file_path, which you assigned the file list to. Because of this you are getting the ValueError. Try this:
### open dialog to select file
#file_path = filedialog.askopenfilename()
###create list from dir
listdrs = os.listdir('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/')
###append full path to list
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/'
listdrs_path = [ string + x for x in listdrs]
print (listdrs_path)
###start loop, for each "file" in listdrs run the 2 functions below and overwrite saved csv.
for file_path in listdrs_path:
data = pd.read_csv(file_path, index_col=0)
########################################
####function 1
def get_price_hist(ticker):
# Put stock price data in dataframe
data = pd.read_csv(file_path)
#listdr = os.listdir('Users\17409\AppData\Local\Programs\Python\Python38\Indicators\Sentdex Tutorial\stock_dfs')
print(listdr)
# Convert date to timestamp and make index
data.index = data["Date"].apply(lambda x: pd.Timestamp(x))
data.drop("Date", axis=1, inplace=True)
return data
df = data
##print(data)
######Indicator data#####################
def get_indicators(data):
# Get MACD
data["macd"], data["macd_signal"], data["macd_hist"] = talib.MACD(data['Close'])
# Get MA10 and MA30
data["ma10"] = talib.MA(data["Close"], timeperiod=10)
data["ma30"] = talib.MA(data["Close"], timeperiod=30)
# Get RSI
data["rsi"] = talib.RSI(data["Close"])
return data
#####end functions#######
data2 = get_indicators(data)
print(data2)
data2.to_csv(file_path)
Let me know if it helps.
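As a side note, building the paths with os.path.join is a bit more robust than concatenating strings; a minimal sketch using the same directory:
import os

stock_dir = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/'
listdrs_path = [os.path.join(stock_dir, name) for name in os.listdir(stock_dir)]
print(listdrs_path)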
I need to write specific data to two Excel sheets. The first sheet will be filled with the first and last date, while the second sheet will contain the time difference between two specific rows (the time difference is only calculated when df.iloc[i, 1] == '[1]->[0]' and df.iloc[i + 1, 1] == '[0]->[1]').
This is my code:
import xlsxwriter
import pandas as pd
df= pd.DataFrame({'Time':['2019/01/03 15:02:07', '2019/01/03 15:16:55', '2019/01/03 15:17:20', '2019/01/03 15:28:58','2019/01/03 15:32:28','2019/01/03 15:38:54'],
'Payload':['[0]->[1]', '[1]->[0]','[0]->[1]','[0]->[1]','[1]->[0]','[0]->[1]']})
workbook = xlsxwriter.Workbook('Results.xlsx')
ws = workbook.add_worksheet("Rapport détaillé")
# wsNavco = workbook.add_worksheet("Délai reconnexion NAVCO")
ws.set_column(0, 1, 30)
ws.set_column(1, 2, 25)
# Add a format. Light yellow fill with dark red text.
format1 = workbook.add_format({'bg_color': '#fffcad',
'font_color': '#0a0a0a'})
# Add a format. Green fill with dark green text.
format2 = workbook.add_format({'bg_color': '#e7fabe',
'font_color': '#0a0a0a'})
# Write a conditional format over a range.
ws.conditional_format('A1:A24', {'type': 'cell',
'criteria': '>=',
'value': 50,
'format': format1})
ws.conditional_format('B1:B24', {'type': 'cell',
'criteria': '>=',
'value': 50,
'format': format2})
parametres = (
['Parametres', 'Valeurs'],
['1ere date ', str(df['Time'].iloc[0])],
['Derniere date ', str(df['Time'].iloc[len(df)-1])],
)
# Start from the first cell. Rows and
# columns are zero indexed.
row = 0
col = 0
# Iterate over the data and write it out row by row.
for name, parametres in (parametres):
ws.write(row, col, name)
ws.write(row, col + 1, parametres)
row += 1
workbook.close()
df= df.sort_values(by='Time')
df.Time = pd.to_datetime(df.Time, format='%Y/%m/%d %H:%M:%S')
print('df\n',df)
diff = []
for i in range(len(df) - 1):
if df.iloc[i, 1] == '[1]->[0]' and df.iloc[i + 1, 1] == '[0]->[1]':
time_diff = df.iloc[i + 1, 0] - df.iloc[i, 0]
else:
time_diff = 0
diff.append(time_diff)
diff.append(0) # to fill the last value
df['Difference'] = diff
print(df['Difference'])
print('df1\n',df)
workbook = xlsxwriter.Workbook('Results.xlsx')
wsNavco = workbook.add_worksheet('Délai reconnexion NAVCO')
# wsNavco = wb.worksheets[1]
wsNavco.set_column(0, 1, 25)
wsNavco.set_column(1, 2, 55)
# Add a format. Light yellow fill with dark red text.
format1 = workbook.add_format({'bg_color': '#fffcad',
'font_color': '#0a0a0a'})
# Add a format. Green fill with dark green text.
format2 = workbook.add_format({'bg_color': '#e7fabe',
'font_color': '#0a0a0a'})
# Write a conditional format over a range.
wsNavco.conditional_format('A1:A24', {'type': 'cell',
'criteria': '>=',
'value': 50,
'format': format1})
wsNavco.conditional_format('B1:B24', {'type': 'cell',
'criteria': '>=',
'value': 50,
'format': format2})
for i in range (len(df)-1):
wsNavco.set_column(1, 1, 15)
wsNavco.write('A'+str(3),'Payload')
wsNavco.write('A'+str(i+4), str((df.iloc[i,1])))
wsNavco.write('B'+str(3),'Délai reconnexion NAVCO')
wsNavco.write('B'+str(i+4), str((df.iloc[i,2])))
workbook.close()
The problem is that it creates the first sheet, names it, and fills it, but then it overwrites the first sheet with the second sheet.
My question is: how can I save both sheets?
You cannot append to an existing workbook with XlsxWriter; you need to perform all the writes before closing the workbook. In your case this should be fine: just remove the lines that close and reopen the workbook between 'Rapport détaillé' and 'Délai reconnexion NAVCO'.
If you prepare your data into DataFrames beforehand, it becomes very simple.
import pandas as pd
# Create some Pandas dataframes from some data.
df1 = pd.DataFrame({'Data': [11, 12, 13, 14]})
df2 = pd.DataFrame({'Data': [21, 22, 23, 24]})
df3 = pd.DataFrame({'Data': [31, 32, 33, 34]})
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('pandas_multiple.xlsx', engine='xlsxwriter')
workbook = writer.book
# Write each dataframe to a different worksheet.
df1.to_excel(writer, sheet_name='Sheet1')
df2.to_excel(writer, sheet_name='Sheet2')
df3.to_excel(writer, sheet_name='Sheet3')
# Define formats.
format1 = workbook.add_format({'bg_color': '#fffcad',
'font_color': '#0a0a0a'})
format2 = workbook.add_format({'bg_color': '#e7fabe',
'font_color': '#0a0a0a'})
# Format worksheets.
worksheet = writer.sheets['Sheet1']
worksheet.conditional_format('A1:A24', {'type': 'cell',
'criteria': '>=',
'value': 50,
'format': format1})
worksheet = writer.sheets['Sheet2']
worksheet.conditional_format('B1:B24', {'type': 'cell',
'criteria': '>=',
'value': 50,
'format': format2})
# Close the Pandas Excel writer and output the Excel file.
writer.close()
There are alternative engines like Openpyxl that support appending. See this answer for details.
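For example, appending one more sheet to the finished Results.xlsx with openpyxl could look roughly like this (a minimal sketch; df_new and the sheet name are placeholders, and the if_sheet_exists argument needs pandas 1.3+):
import pandas as pd

df_new = pd.DataFrame({'Data': [41, 42, 43, 44]})

# mode='a' requires the openpyxl engine; xlsxwriter can only create new files.
with pd.ExcelWriter('Results.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    df_new.to_excel(writer, sheet_name='Extra sheet')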
I normally use h5py for HDF5 work in Python, and if I want to create a dataset that I want to extend later, I do:
f = h5py.File('foo.h5', 'w')
d = f.create_dataset('whatever', (5, 5), maxshape=(None, 5), dtype='i8', chunks=True)
...
d.resize((23, 5))
...
The maxshape=(None, ...) sets the first dimension to "infinity", so it's extensible.
Now I have a project where I need to stick with PyTables and wanted to build up large arrays step by step. Is there a way to extend arrays in PyTables?
This is roughly the idea:
import tables as tb
import numpy as np
filename = "foo.h5"
h5file = tb.File(filename, "a")
gbar = h5file.create_group(h5file.root, "bar", "Pressure")
h5file.create_array(gbar, 'left', np.array((1, 2, 3, 4)), "...")
# now extend the shape of (4,) and append more arrays iteratively???
h5file.close()
I found the solution in the docs: tables.EArray
http://www.pytables.org/usersguide/libref/homogenous_storage.html#earrayclassdescr
Here is a descriptive example which adds two "columns" using two different ways of defining the dtype. The with block can be run multiple times and it will keep extending the columns:
import tables as tb
import numpy as np
filename = 'foo.h5'
with tb.File(filename, "a") as f:
if "/foo" not in f:
group = f.create_group("/", 'foo', 'Foo Information')
else:
group = f.root.foo
if "col1" not in group:
a = tb.Atom.from_dtype(np.dtype('<f8'), dflt=0.0)
arr = f.create_earray(group, 'col1', a, (0,), "Bar")
else:
arr = getattr(group, "col1")
arr.append(np.arange(10))
arr.append(np.arange(40, 45))
if "col2" not in group:
b = tb.Int64Atom()
arr = f.create_earray(group, 'col2', b, (0,), "Baz")
else:
arr = getattr(group, "col2")
arr.append(np.arange(7))
arr.append(np.arange(30, 38))
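To check the result, the file can be reopened read-only and the columns read back; a small usage sketch:
import tables as tb

with tb.open_file('foo.h5', 'r') as f:
    col1 = f.root.foo.col1.read()
    col2 = f.root.foo.col2.read()
    # Both lengths grow every time the block above is run again.
    print(len(col1), len(col2))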