Unable to Group dataframe by Month number - pandas

I have the following code but it seems the line
cs.groupby(cs['Disbursal_Date'].dt.strftime('%B'))['Revenue'].sum()
just returns the entire dataframe without the data grouping by Month number.
Any help is much appreciated
import pandas as pd
import os
import glob
import numpy as np
os.chdir("C:/csv/")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
cs = pd.concat([pd.read_csv(f) for f in all_filenames])
cs.drop(cs.columns[[0]], axis=1, inplace=True)
cs = cs[cs["Booked"] == 1]
cs['Disbursal_Date'] = pd.to_datetime(cs['Disbursal_Date'])
cs.drop_duplicates(inplace=True)
cs['Revenue'] = np.where(cs['Loan_Amount'] < 1000, 28,
np.where((cs['Loan_Amount'] > 1000) & (cs['APR'] < 0.3), 0.0525 * cs['Loan_Amount'],
np.where((cs['Loan_Amount'] > 1000) & (cs['APR'] > 0.3), 0.0275 * cs['Loan_Amount'], 0)))
cs.loc[cs.Revenue >= 175, "Revenue"] = 175
cs.loc[cs.Revenue <= 52.50, "Revenue"] = 52.50
cs.groupby(cs['Disbursal_Date'].dt.strftime('%B'))['Revenue'].sum()
print(cs)

You're not assigning the result from your cs.groupby. Something like:
cs = cs.groupby(cs['Disbursal_Date'].dt.strftime('%B'))['Revenue'].sum()
print(cs)
Should do the trick.

Related

I am having a problem with a foor loop that includes dataframes

I have a dataframe with 8 columnds. If two of those columns satisfy a condition, I have to fill two columns with the product of other two. And after running the algorithm it is not working.
I have tryed to use series, I have tryed to use import warnings
warnings.filterwarnings("ignore") but it is not working
for i in seq:
if dataframefinal['trade'][i] == 1 and dataframefinal['z'][i] > 0:
dataframefinal['CloseAdj2'][i]= dataframefinal['Close2'][i] *
dataframefinal['trancosshort'][i]
dataframefinal['CloseAdj1'][i]= dataframefinal['Close1'][i] *
dataframefinal['trancostlong'][i]
elif dataframefinal['trade'][i] == 1 and dataframefinal['z'][i] < 0:
dataframefinal['CloseAdj2'][i]= dataframefinal['Close1'][i] *
dataframefinal['trancosshort'][i]
dataframefinal['CloseAdj1'][i]= dataframefinal['Close2'][i] *
dataframefinal['trancostlong'][i]
else:
dataframefinal['CloseAdj1'][i]= dataframefinal['Close1'][i]
dataframefinal['CloseAdj2'][i]= dataframefinal['Close2'][i]
You can use vectorized condition function numpy.select() to do this quickly:
import pandas as pd
from numpy.random import randn, randint
n = 10
df_data = pd.DataFrame(dict(trade=randint(0, 2, n),
z=randn(n),
Close1=randn(n),
Close2=randn(n),
trancosshort=randn(n),
trancostlong=randn(n)))
df_data["CloseAdj1"] = 0
df_data["CloseAdj2"] = 0
seq = [1, 3, 5, 7, 9]
df = df_data.loc[seq]
cond1 = df.eval("trade==1 and z > 0")
cond2 = df.eval("trade==2 and z < 0")
df["CloseAdj2"] = np.select([cond1, cond2],
[df.eval("Close2 * trancosshort"),
df.eval("Close1 * trancosshort")], df.Close2)
df["CloseAdj1"] = np.select([cond1, cond2],
[df.eval("Close1 * trancostlong"),
df.eval("Close2 * trancostlong")], df.Close1)
df_data.loc[seq, ["CloseAdj1", "CloseAdj2"]] = df[["CloseAdj1", "CloseAdj2"]]

Bokeh: Bad date format?

would anyone advise me how to adjust the X axis to better display the date on this graph?
from math import pi
import pandas as pd
from bokeh.io import show
from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure
#cesta k souboru
path = "C://Users//Zemi4//Desktop//zpr3//all2.csv"
#nacteni dataframu
data = pd.read_csv(path, delimiter = ",")
data['Cas'] = data['Cas'].astype(str)
data = data.set_index('Cas')
data.columns.name = 'Mistnost'
times = list(data.index)
rooms = list(data.columns)
df = pd.DataFrame(data.stack(), columns=['float']).reset_index()
colors = ['#440154', '#404387', '#29788E', '#22A784', '#79D151', '#FDE724', '#FCFEA4', '#FBA40A', '#DC5039']
mapper = LinearColorMapper(palette=colors, low=df.float.min(), high=df.float.max())
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(title="Heatmap ({0} - {1})".format(times[0], times[-1]),
x_range=times, y_range=list(reversed(rooms)),
x_axis_location="above", plot_width=1500, plot_height=900,
tools=TOOLS, toolbar_location='below',
tooltips=[('Time: ', '#Cas'), ('Temperature: ', '#float'), ('Room: ', '#Mistnost')],
x_axis_type='datetime')
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3
p.rect(x="Cas", y="Mistnost", width=1, height=1,
source=df,
fill_color={'field': 'float', 'transform': mapper},
line_color=None)
color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="5pt",
ticker=BasicTicker(desired_num_ticks=len(colors)),
formatter=PrintfTickFormatter(format="%f"),
label_standoff=6, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')
show(p) # show the pl
Try: p.xaxis[0].ticker.desired_num_ticks = <number_ticks_you_want_to_display>.
Or apply a specific ticker (see Bokeh docs) like you did for the ColorBar.

Time Difference between Time Period and Instant

I have some time periods (df_A) and some time instants (df_B):
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta
# Data
df_A = pd.DataFrame({'A1': [dt.datetime(2017,1,5,9,8), dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,2,7,9,19), dt.datetime(2017,2,7,9,19)],
'A2': [dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,5,9,12), dt.datetime(2017,1,7,9,26), dt.datetime(2017,1,7,9,20), dt.datetime(2017,1,7,9,21), dt.datetime(2017,2,7,9,23), dt.datetime(2017,2,7,9,25)]})
df_B = pd.DataFrame({ 'B': [dt.datetime(2017,1,6,14,45), dt.datetime(2017,1,4,3,31), dt.datetime(2017,1,7,3,31), dt.datetime(2017,1,7,14,57), dt.datetime(2017,1,9,14,57)]})
I can match these together:
# Define an Extra Margin
M = dt.timedelta(days = 10)
df_A["A1X"] = df_A["A1"] + M
df_A["A2X"] = df_A["A2"] - M
# Match
Bv = df_B .B .values
A1 = df_A .A1X.values
A2 = df_A .A2X.values
i, j = np.where((Bv[:, None] >= A1) & (Bv[:, None] <= A2))
df_C = pd.DataFrame(np.column_stack([df_B .values[i], df_A .values[j]]),
columns = df_B .columns .append (df_A.columns))
I would like to find the time difference between each time period and the time instant matched to it. I mean that
if B is between A1 and A2
then dT = 0
I've tried doing it like this:
# Calculate dt
def time(A1,A2,B):
if df_C["B"] < df_C["A1"]:
return df_C["A1"].subtract(df_C["B"])
elif df_C["B"] > df_C["A2"]:
return df_C["B"].subtract(df_C["A2"])
else:
return 0
df_C['dt'] = df_C.apply(time)
I'm getting "ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series"
So, I found two fixes:
You are adding M to the lower value and subtracting from the higher one. Change it to:
df_A['A1X'] = df_A['A1'] - M
df_A['A2X'] = df_A['A2'] + M
You are only passing one row of your dataframe at a time to your time function, so it should be something like:
def time(row):
if row['B'] < row['A1']:
return row['A1'] - row['B']
elif row['B'] > row['A2']:
return row['B'] - row['A2']
else:
return 0
And then you can call it like this:
df_C['dt'] = df_C.apply(time, axis=1) :)

Represent negative timedelta in most basic form

If I create a negative Timedelta for e.g. 0.5 hours, the internal representation looks as follow:
In [2]: pd.Timedelta('-0.5h')
Out[2]: Timedelta('-1 days +23:30:00')
How can I get back a (str) representation of this Timedelta in the form -00:30?
I want to display these deltas and requiring the user to calculate the expression -1 day + something is a bit award.
I can't add comment to you so adding it here. Don't know if this helps but I think you can use python humanize.
import humanize as hm
hm.naturaltime((pd.Timedelta('-0.5h')))
Out:
'30 minutes from now'
Ok, I will live with a hack going trough a date:
sign = ''
date = pd.to_datetime('today')
if delta.total_seconds() < 0:
sign = '-'
date = date - delta
else:
date = date + delta
print '{}{:%H:%M}'.format(sign, date.to_pydatetime())
You can use the components of a Pandas timedelta
import pandas as pd
t = pd.Timedelta('-0.5h')
print t.components
>> Components(days=-1L, hours=23L, minutes=30L, seconds=0L, milliseconds=0L, microseconds=0L, nanoseconds=0L)
You can access each component with
print t.components.days
>> -1
print t.components.hours
>> 23
print t.components.minutes
>> 30
The rest is then formatting.
source
This is a total hack that won't work for Series data, but....
import pandas as pd
import numpy as np
t = pd.Timedelta('-0.5h').components
mins = t.days*24*60 + t.hours*60 + t.minutes
print str(np.sign(mins))[0]+str(divmod(abs(mins), 60)[0]).zfill(2)+':'+str(divmod(abs(mins), 60)[1]).zfill(2)
>> -00:30
I was looking for something similar (see https://github.com/pandas-dev/pandas/issues/17232 )
I'm not sure if it will be implemented in Pandas, so here is a workaround
import pandas as pd
def timedelta2str(td, display_plus=False, format=None):
"""
Parameters
----------
format : None|all|even_day|sub_day|long
Returns
-------
converted : string of a Timedelta
>>> td = pd.Timedelta('00:00:00.000')
>>> timedelta2str(td)
'0 days'
>>> td = pd.Timedelta('00:01:29.123')
>>> timedelta2str(td, display_plus=True, format='sub_day')
'+ 00:01:29.123000'
>>> td = pd.Timedelta('-00:01:29.123')
>>> timedelta2str(td, display_plus=True, format='sub_day')
'- 00:01:29.123000'
"""
td_zero = pd.Timedelta(0)
sign_sep = ' '
if td >= td_zero:
s = td._repr_base(format=format)
if display_plus:
s = "+" + sign_sep + s
return s
else:
s = timedelta2str(-td, display_plus=False, format=format)
s = "-" + sign_sep + s
return s
if __name__ == "__main__":
import doctest
doctest.testmod()

zipline error KeyError: <type 'zipline.assets._assets.Equity'>

When I try to execute a simple crossover strategy algorithm outside quantopian framework using zipline, I get the following error.
KeyError: <type 'zipline.assets._assets.Equity'>
This is a simple crossover strategy where 50-100 day moving averages are calculated to derive trading strategy. I am unable to run this strategy out of Quantopian framework using zipline.
Code is as follows
import pandas as pd
import zipline
from zipline import TradingAlgorithm
from zipline.api import order, sid
from zipline.utils.factory import load_from_yahoo
import matplotlib.pyplot as plt
from zipline.api import order, symbol, record, order_target
import pytz
%matplotlib inline
# creating time interval
start = pd.Timestamp('2013-01-25', tz='UTC')
end = pd.Timestamp('2017-02-01', tz='UTC')
#input_date = get_pricing(['AAPL'],start,end,frequency='daily')
# loading the data
#input_data = load_bars_from_yahoo(stocks=['AAPL'], start=start,end=end,)
data = load_from_yahoo(stocks=['AAPL'], indexes={}, start=start, end=end)
data = data.dropna()
def initialize(context):
context.security= symbol('AAPL')
context.i =0
def handle_data(context, data):
context.i += 1
if context.i<100:
return
MA1 = data[context.security].mavg(50)
MA2 = data[context.security].mavg(100)
date = str(data[context.security].datetime)[:10]
current_price = data[context.security].price
current_positions = context.portfolio.positions[symbol('AAPL')].amount
cash = context.portfolio.cash
value = context.portfolio.portfolio_value
current_pnl = context.portfolio.pnl
if (MA1 > MA2) and current_positions == 0:
number_of_shares = 100
order(context.security, number_of_shares)
record(AAPL=inputdata[symbol('AAPL')].price,date=date,MA1 = MA1, MA2 = MA2, Price=
current_price,status="buy",shares=number_of_shares,PnL=current_pnl,cash=cash,value=value)
elif (MA1 < MA2) and current_positions != 0:
order_target(context.security, 0)
record(AAPL=inputdata[symbol('AAPL')].price,date=date,MA1 = MA1, MA2 = MA2, Price= current_price,status="sell",shares="--",PnL=current_pnl,cash=cash,value=value)
else:
record(AAPL=inputdata[symbol('AAPL')].price,date=date,MA1 = MA1, MA2 = MA2, Price= current_price,status="--",shares="--",PnL=current_pnl,cash=cash,value=value)
algo = TradingAlgorithm(initialize=initialize, handle_data=handle_data)
results = algo.run(input_data)
use codes as below to calculate MA1 and MA2, then it works!
because some of the function is out of date in zipline 1.1.0
from talib import MA
trailing_window = data.history(assets=context.security, fields='price', bar_count=100, frequency='1d')
MA1 = MA(trailing_window.values, 50)[-1]
MA2 = MA(trailing_window.values, 100)[-1]
or use the codes as below without using talib:
trailing_window1 = data.history(assets=context.security, fields='price', bar_count=50, frequency='1d')
trailing_window2 = data.history(assets=context.security, fields='price', bar_count=100, frequency='1d')
MA1 = trailing_window1.mean()
MA2 = trailing_window2.mean()