df['Column1'] > df['Column2'] KeyError: 0 - pandas

I am trying to write a function that loops through a pandas dataframe and checks whether column1 > column2 for each row; if so, it appends 1 to a list, which is then returned.
Importing finance data from Yahoo Finance, calculating the 20-period moving average and rolling standard deviation, and assigning the 2-standard-deviation bands to the Upper and Lower columns.
import pandas as pd
import pandas_datareader as web
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
import numpy as np
end = datetime(2021, 1, 16)
start = datetime(2020, 1, 16)
symbols = ['ETH-USD']
stock_df = web.get_data_yahoo(symbols, start, end)
period = 20
# Simple Moving Average
stock_df['SMA'] = stock_df['Close'].rolling(window=period).mean()
# Standard deviation
stock_df['STD'] = stock_df['Close'].rolling(window=period).std()
# Upper Bollinger Band
stock_df['Upper'] = stock_df['SMA'] + (stock_df['STD'] * 2)
# Lower Bollinger Band
stock_df['Lower'] = stock_df['SMA'] - (stock_df['STD'] * 2)
# List of columns
column_list = ['Close', 'SMA', 'Upper', 'Lower']
stock_df[column_list].plot(figsize=(12.2, 6.4))  # Plot the data
plt.title('ETH-USD')
plt.ylabel('USD Price ($)')
plt.show()
# Create a new data frame; slicing off the first period-1 rows removes the NaNs
bolldf = stock_df[period-1:]
# Show the new data frame
bolldf
Function: loop through the column rows, compare them, and append to the buy/sell signal list if the condition is met.
def signal(df):
    buy_signal = []
    sell_signal = []
    for i in range(len(df['Close'])):
        if df['Close'][i] > df['Upper'][i]:
            buy_signal.append(1)
    return buy_signal

buy_signal = signal(bolldf)
buy_signal
Info about the error (abbreviated traceback):

KeyError: 0

During handling of the above exception, another exception occurred:

---> 12 buy_signal = greaterthan(stock_df)
----> 8 if bolldf['Close'][i] > bolldf['Open'][i]:
KeyError: 0
When I attempt this function on the columns df['Upper'] > df['Lower'], or df['SMA'] < df['Lower'] for example, it works as expected; it is only when using the columns from the original data that it does not work.
Any help would be amazing. Thank you.

Since the columns are a MultiIndex, they must also be specified in MultiIndex format. You can check the contents of the columns with bolldf.columns, and the function works after modifying it as follows.
bolldf.columns

MultiIndex([('Adj Close', 'ETH-USD'),
            (    'Close', 'ETH-USD'),
            (     'High', 'ETH-USD'),
            (      'Low', 'ETH-USD'),
            (     'Open', 'ETH-USD'),
            (   'Volume', 'ETH-USD'),
            (      'SMA',        ''),
            (      'STD',        ''),
            (    'Upper',        ''),
            (    'Lower',        '')],
           names=['Attributes', 'Symbols'])
def signal(df):
    buy_signal = []
    sell_signal = []
    for i in range(len(df[('Close', 'ETH-USD')])):
        if df[('Close', 'ETH-USD')][i] > df[('Upper', '')][i]:
            buy_signal.append(1)
    return buy_signal

buy_signal = signal(bolldf)
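As a side note (not from the original answer): with a single ticker you can also drop the 'Symbols' level and compare the columns vectorized, avoiding both the MultiIndex keys and the Python loop. A minimal sketch, assuming bolldf from the question:
import numpy as np
flat = bolldf.copy()
# keep only the 'Attributes' level so plain names like 'Close' work again
flat.columns = flat.columns.get_level_values(0)
# 1 where Close breaks above the upper band, 0 elsewhere
flat['Buy'] = np.where(flat['Close'] > flat['Upper'], 1, 0)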

Related

cumsum() on running streak of data

I'm attempting to determine the sum of a column over a running period where there is a negative gain, i.e. I want to determine the total for a losing streak.
I've set up a column that gives the number of consecutive losing days (Consecutive Losses), but I want to sum up the total loss over the same streak. What I have (Aggregate Consecutive Loss) 1) doesn't work, because it just cumsum()s without resetting to zero at each streak, and 2) is incorrect anyway, as I should in fact take the Open value at the start of the streak and the Close value at the end.
How can I correctly set up this Aggregate Consecutive Loss value in pandas?
import pandas as pd
import numpy as np
import yfinance as yf
def get(symbols, group_by_ticker=False, **kwargs):
    if not isinstance(symbols, list):
        symbols = [symbols]
    kwargs['auto_adjust'] = True
    kwargs['prepost'] = True
    kwargs['threads'] = True
    df = None
    if group_by_ticker:
        kwargs['group_by'] = 'ticker'
    df = yf.download(symbols, **kwargs)
    for t in symbols:
        df["Change Percent", t] = df["Close", t].pct_change() * 100
        df["Gain", t] = np.where(df['Change Percent', t] > 0, True, False).astype('bool')
        a = df['Gain', t] != True
        df['Consecutive Losses', t] = a.cumsum() - a.cumsum().where(~a).ffill().fillna(0).astype(int)
        x = df['Change Percent', t].where(df['Consecutive Losses', t] > 0)
        df['Aggregate Consecutive Loss', t] = x.cumsum() - x.cumsum().where(~a).ffill().fillna(0).astype(float)
    return df

data = get(["DOW", "IDX"], period="6mo")
data[['Change Percent','Gain','Consecutive Losses','Aggregate Consecutive Loss']].head(50)
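No answer was posted in this excerpt, but a common pattern for resetting a running total at each streak boundary (my sketch, not from the thread, reusing the names built above for a single ticker t) is to derive a streak id from the flips of the Gain flag and let groupby().cumsum() do the resetting:
loss = ~df['Gain', t]                        # True on losing days
streak_id = (loss != loss.shift()).cumsum()  # new id at every win/loss flip
# running % change within each losing streak, zero outside of streaks
df['Aggregate Consecutive Loss', t] = (
    df['Change Percent', t].where(loss, 0).groupby(streak_id).cumsum()
)
Note this still sums daily percentage changes; the Open-at-start versus Close-at-end total the question asks for would instead group by the same streak_id and combine the first Open with the last Close of each group.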

Pandas accumulate data for linear regression

I am trying to adjust my data so that total_gross per day is accumulated. E.g.
Created   total_gross   total_gross_accumulated
Day 1     100           100
Day 2     100           200
Day 3     100           300
Day 4     100           400
Any idea how I have to change my code to make total_gross_accumulated available?
Here is my data.
my code:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

def load_event_data():
    df = pd.read_csv('sample-data.csv', usecols=['created', 'total_gross'])
    df['created'] = pd.to_datetime(df.created)
    return df.set_index('created').resample('D').sum().fillna(0)

event_data = load_event_data()
X = event_data.index
y = event_data.total_gross
plt.xticks(rotation=90)
plt.plot(X, y)
plt.show()
A list comprehension is a pythonic way to do this.
SHORT answer:
This should give you the new column that you want:
n = event_data.shape[0]
# skip line 0 and start by accumulating from 1 until the end
total_gross_accumulated = [event_data['total_gross'][:i].sum() for i in range(1, n+1)]
# add the new variable to the initial pandas dataframe
event_data['total_gross_accumulated'] = total_gross_accumulated
OR faster
event_data['total_gross_accumulated'] = event_data['total_gross'].cumsum()
LONG answer:
Full code using your data:
import pandas as pd
import matplotlib.pyplot as plt

def load_event_data():
    df = pd.read_csv('sample-data.csv', usecols=['created', 'total_gross'])
    df['created'] = pd.to_datetime(df.created)
    return df.set_index('created').resample('D').sum().fillna(0)

event_data = load_event_data()
n = event_data.shape[0]
# skip line 0 and start by accumulating from 1 until the end
total_gross_accumulated = [event_data['total_gross'][:i].sum() for i in range(1, n+1)]
# add the new variable to the initial pandas dataframe
event_data['total_gross_accumulated'] = total_gross_accumulated
Results:
event_data.head(6)
#             total_gross  total_gross_accumulated
# created
# 2019-03-01      3481810                  3481810
# 2019-03-02         4690                  3486500
# 2019-03-03            0                  3486500
# 2019-03-04            0                  3486500
# 2019-03-05            0                  3486500
# 2019-03-06            0                  3486500
X = event_data.index
y = event_data.total_gross_accumulated
plt.xticks(rotation=90)
plt.plot(X, y)
plt.show()
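Since the question imports sklearn but never reaches the regression, here is a minimal sketch of fitting a line to the accumulated totals (my addition; it assumes the event_data frame from above). Dates are converted to ordinals because LinearRegression expects a 2-D numeric X:
import numpy as np
from sklearn.linear_model import LinearRegression
# 2-D numeric feature matrix built from the DatetimeIndex
X = np.array([d.toordinal() for d in event_data.index]).reshape(-1, 1)
y = event_data['total_gross_accumulated'].to_numpy()
model = LinearRegression().fit(X, y)
trend = model.predict(X)  # fitted straight line, same length as y
plt.plot(event_data.index, y, label='accumulated')
plt.plot(event_data.index, trend, label='linear fit')
plt.legend()
plt.show()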

dask how to define a custom (time fold) function that operates in parallel and returns a dataframe with a different shape

I am trying to implement a time fold function to be 'map'ped to various partitions of a dask dataframe, which in turn changes the shape of the dataframe in question (or alternatively produces a new dataframe with the altered shape). This is how far I have gotten. The result 'res' returned on compute is a list of 3 delayed objects. When I try to compute each of them in a loop (last two lines of code), this results in "TypeError: 'DataFrame' object is not callable". After going through the examples for map_partitions, I also tried altering the input DF in place in the function with no return value, which causes a similar TypeError with NoneType. What am I missing?
Also, looking at the visualization (attached), I feel like there is a need to reduce the individually computed (folded) partitions into a single DF. How do I do this?
#! /usr/bin/env python
# Start dask scheduler and workers
# dask-scheduler &
# dask-worker --nthreads 1 --nprocs 6 --memory-limit 3GB localhost:8786 --local-directory /dev/shm &
from dask.distributed import Client
from dask.delayed import delayed
import pandas as pd
import numpy as np
import dask.dataframe as dd
import math
foldbucketsecs=30
periodicitysecs=15
secsinday=24 * 60 * 60
chunksizesecs=60 # 1 minute
numts = 5
start = 1525132800 # 01/05
end = 1525132800 + (3 * 60) # 3 minute
c = Client('127.0.0.1:8786')
def fold(df, start, bucket):
    return df

def reduce_folds(df):
    return df

def load(epoch):
    idx = []
    for ts in range(0, chunksizesecs, periodicitysecs):
        idx.append(epoch + ts)
    d = np.random.rand(chunksizesecs // periodicitysecs, numts)
    ts = []
    for i in range(0, numts):
        tsname = "ts_%s" % (i)
        ts.append(tsname)
        gts.append(tsname)
    res = pd.DataFrame(index=idx, data=d, columns=ts, dtype=np.float64)
    res.index = pd.to_datetime(arg=res.index, unit='s')
    return res

gts = []
load(start)
cols = len(gts)
idx1 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+periodicitysecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx1[:0], data=[], columns=gts, dtype=np.float64)
dfs = [delayed(load)(fn) for fn in range(start, end, chunksizesecs)]
from_delayed = dd.from_delayed(dfs, meta, 'sorted')
nfolds = int(math.ceil((end - start) / foldbucketsecs))
cprime = nfolds * cols
gtsnew = []
for i in range(0, cprime):
    gtsnew.append("ts_%s,fold=%s" % (i % cols, i // cols))
idx2 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+foldbucketsecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx2[:0], data=[], columns=gtsnew, dtype=np.float64)
folded_df = from_delayed.map_partitions(delayed(fold)(from_delayed, start, foldbucketsecs), meta=meta)
result = c.submit(reduce_folds, folded_df)
c.gather(result).visualize(filename='/usr/share/nginx/html/svg/df4.svg')
res = c.gather(result).compute()
for f in res:
    f.compute()
Never mind! It was my fault: instead of wrapping my function in delayed, I simply passed it to the map_partitions call like so, and it worked.
folded_df = from_delayed.map_partitions(fold, start, foldbucketsecs, nfolds, meta=meta)
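For readers landing here, a minimal self-contained illustration of the working pattern (my sketch, not the poster's code): map_partitions takes the plain function plus its extra arguments, meta describes the differently shaped output, and the per-partition results are concatenated back into a single dataframe on compute:
import dask.dataframe as dd
import pandas as pd
pdf = pd.DataFrame({'a': range(6)})
ddf = dd.from_pandas(pdf, npartitions=3)
def fold_partition(df, factor):
    # returns a frame with a different column set than its input
    return pd.DataFrame({'a_scaled': df['a'] * factor})
# meta: an empty frame describing the output schema
meta = pd.DataFrame({'a_scaled': pd.Series(dtype='int64')})
out = ddf.map_partitions(fold_partition, 2, meta=meta)
print(out.compute())  # one concatenated pandas DataFrame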

Iterating over columns in data frame by skipping first column and drawing multiple plots

I have a data frame as follows,
df.head()
ID    AS_FP     AC_FP  RP11_FP   RP11_be   AC_be     AS_be     Info
AE02  0.060233  0      0.682884  0.817115  0.591182  0.129252  SAP
AE03  0         0      0         0.889181  0.670113  0.766243  SAP
AE04  0         0      0.033256  0.726193  0.171861  0.103839  others
AE05  0         0      0.034988  0.451329  0.431836  0.219843  others
What I am aiming for is to plot each pair of columns from AS_FP through RP11_be as an lmplot, where each x axis is a column ending in _FP and the y axis is its corresponding column ending in _be.
And I wanted to save each plot as a separate file, so I started iterating through the columns, skipping the first column ID, like this,
for ind, column in enumerate(df.columns):
    if column.split('_')[0] == column.split('_')[0]:
But I got lost on how to continue. I need to plot
sns.lmplot(x, y, data=df, hue='Info', palette=colors, fit_reg=False,
           size=10, scatter_kws={"s": 700}, markers=["o", "v"])
and save each image as a separate file.
Straightforward solution:
1) Toy data:
import pandas as pd
from collections import OrderedDict
import matplotlib.pyplot as plt
import seaborn as sns
dct = OrderedDict()
dct["ID"] = ["AE02", "AE03", "AE04", "AE05"]
dct["AS_FP"] = [0.060233, 0, 0, 0]
dct["AC_FP"] = [0, 0,0, 0]
dct["RP11_FP"] = [0.682884, 0, 0.033256, 0.034988]
dct["AS_be"] = [0.129252, 0.766243, 0.103839, 0.219843]
dct["AC_be"] = [0.591182, 0.670113, 0.171861, 0.431836]
dct["RP11_be"] = [0.817115, 0.889181, 0.726193, 0.451329]
dct["Info"] = ["SAP", "SAP", "others", "others"]
df = pd.DataFrame(dct)
2) Iterating through pairs, saving each figure with unique filename:
graph_cols = [col for col in df.columns if ("_FP" in col) or ("_be" in col)]
fps = sorted([col for col in graph_cols if "_FP" in col])
bes = sorted([col for col in graph_cols if "_be" in col])
for x, y in zip(fps, bes):
    snsplot = sns.lmplot(x, y, data=df, fit_reg=False, hue='Info',
                         size=10, scatter_kws={"s": 700})
    snsplot.savefig(x.split("_")[0] + ".png")
You can add any other parameters you need to lmplot.
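A slightly more defensive variant (my sketch, not from the answer, assuming the same seaborn version and df as above): pair the columns by their shared prefix instead of relying on the two sorted lists lining up, which keeps working even if an _FP column has no _be counterpart:
prefixes = {c.rsplit('_', 1)[0] for c in df.columns if c.endswith('_FP')}
for p in sorted(prefixes):
    x, y = p + '_FP', p + '_be'
    if y not in df.columns:
        continue  # skip unpaired columns
    snsplot = sns.lmplot(x, y, data=df, fit_reg=False, hue='Info',
                         size=10, scatter_kws={"s": 700})
    snsplot.savefig(p + '.png')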

pandas histogram plot error: ValueError: num must be 1 <= num <= 0, not 1

I am drawing a histogram of a column from a pandas data frame:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib
df.hist(column='column_A', bins = 100)
but got the following errors:
     62                 raise ValueError(
     63                     "num must be 1 <= num <= {maxn}, not {num}".format(
---> 64                         maxn=rows*cols, num=num))
     65         self._subplotspec = GridSpec(rows, cols)[int(num) - 1]
     66         # num - 1 for converting from MATLAB to python indexing

ValueError: num must be 1 <= num <= 0, not 1
Does anyone know what this error mean? Thanks!
Problem
The problem you encounter arises when column_A does not contain numeric data. As you can see in the excerpt from pandas.plotting._core below, numeric data is essential to make the function hist_frame (which you call via DataFrame.hist()) work correctly.
def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
               xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
               sharey=False, figsize=None, layout=None, bins=10, **kwds):
    # skipping part of the code
    # ...
    if column is not None:
        if not isinstance(column, (list, np.ndarray, Index)):
            column = [column]
        data = data[column]
    data = data._get_numeric_data()  # there is no numeric data in the column
    naxes = len(data.columns)        # so the number of axes becomes 0
    # naxes is passed to the subplot generating function as 0 and later
    # determines the number of columns as 0
    fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
                          sharex=sharex, sharey=sharey, figsize=figsize,
                          layout=layout)
    # skipping the rest of the code
    # ...
Solution
If your problem is to represent numeric data (that is not of a numeric dtype yet) with a histogram, you need to cast your data to numeric, either with pd.to_numeric or df.astype(a_selected_numeric_dtype), e.g. 'float64', and then proceed with your code (see the sketch after this list).
If your problem is to represent non-numeric data in one column with a histogram, you can call the function hist_series with the following line: df['column_A'].hist(bins=100).
If your problem is to represent non-numeric data in many columns with a histogram, you may resort to a handful of options:
Use matplotlib and create subplots and histograms directly
Update pandas at least to version 0.25
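A minimal sketch of the casting option (the frame and column contents here are illustrative, not from the question):
import pandas as pd
df = pd.DataFrame({'column_A': ['1.5', '2.0', 'not_a_number']})
# errors='coerce' turns unparseable strings into NaN instead of raising
df['column_A'] = pd.to_numeric(df['column_A'], errors='coerce')
df.hist(column='column_A', bins=100)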
mta['penn'] = [mta_bystation[mta_bystation.STATION == "34 ST-PENN STA"], 'Penn Station']
mta['grdcntrl'] = [mta_bystation[mta_bystation.STATION == "GRD CNTRL-42 ST"], 'Grand Central']
mta['heraldsq'] = [mta_bystation[mta_bystation.STATION == "34 ST-HERALD SQ"], 'Herald Sq']
mta['23rd'] = [mta_bystation[mta_bystation.STATION == "23 ST"], '23rd St']
#mta['portauth'] = [mta_bystation[mta_bystation.STATION == "42 ST-PORT AUTH"], 'Port Auth']
#mta['unionsq'] = [mta_bystation[mta_bystation.STATION == "14 ST-UNION SQ"], 'Union Sq']
mta['timessq'] = [mta_bystation[mta_bystation.STATION == "TIMES SQ-42 ST"], 'Ti