How to add new columns to pandas data frame using .loc - pandas

I am computing simple daily stock indicators in a data frame (e.g. SMA, VWAP, RSI). After I upgraded to Anaconda 3.0, my code stopped working and gives the following error. I don't have much experience in coding and need some help.
KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['RSI', 'ZONE'], dtype='object'). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"
Here is the code:
import yfinance as yf
import pandas as pd
def convert_to_dataframe_daily(data):
    window = 10
    window20 = 20
    window50 = 50
    window100 = 100
    window200 = 200
    ema_time = 8
    #data = yf.download("googl", period="30d", interval="5m")
    #data = yf.download('TSLA', period='30d', interval='5m')
    pd.set_option('display.max_columns', None)
    # calculation for VWAP
    volumeC = data['Volume']
    priceC = data['Close']
    df = data.assign(VWAP=((volumeC * priceC).cumsum() / volumeC.cumsum()).ffill())
    # convert the timezone to Chicago central
    #df.index = pd.DatetimeIndex(df.index.tz_convert('US/Central'))  # aware --> aware
    # reset the dataframe index and separate time
    df.reset_index(inplace=True)
    #df.index.intersection
    #df2 = df[df.index.isin(dts)]
    #df['Date'] = pd.to_datetime(df['Datetime']).dt.date
    #df['Time'] = pd.to_datetime(df['Datetime']).dt.time
    # calculate stochastic
    df['low5'] = df['Low'].rolling(5).min()
    df['high5'] = df['High'].rolling(5).max()
    #k = 100 * (c - l) / (h - l)
    df['K'] = (df['Close'] - df['low5']) / (df['high5'] - df['low5'])
    #s.reindex([1, 2, 3])
    columns = df.columns.values.tolist()
    #df[columns[index]]
    #df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
    df = df.loc[:, ('Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE')]
    #df = df.reindex(['Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE'])
    df['RSI'] = calculate_rsi(df)
    filter_Z1 = df['K'] <= 0.1
    filter_Z2 = (df['K'] > 0.1) & (df['K'] <= 0.2)
    filter_Z3 = (df['K'] > 0.2) & (df['K'] <= 0.3)
    filter_Z4 = (df['K'] > 0.3) & (df['K'] <= 0.4)
    filter_Z5 = (df['K'] > 0.4) & (df['K'] <= 0.5)
    filter_Z6 = (df['K'] > 0.5) & (df['K'] <= 0.6)
    filter_Z7 = (df['K'] > 0.6) & (df['K'] <= 0.7)
    filter_Z8 = (df['K'] > 0.7) & (df['K'] <= 0.8)
    filter_Z9 = (df['K'] > 0.8) & (df['K'] <= 0.9)
    filter_Z10 = (df['K'] > 0.9) & (df['K'] <= 1)
    # plug in stochastic zones
    df['ZONE'].where(-filter_Z1, 'Z1', inplace=True)
    df['ZONE'].where(-filter_Z2, 'Z2', inplace=True)
    df['ZONE'].where(-filter_Z3, 'Z3', inplace=True)
    df['ZONE'].where(-filter_Z4, 'Z4', inplace=True)
    df['ZONE'].where(-filter_Z5, 'Z5', inplace=True)
    df['ZONE'].where(-filter_Z6, 'Z6', inplace=True)
    df['ZONE'].where(-filter_Z7, 'Z7', inplace=True)
    df['ZONE'].where(-filter_Z8, 'Z8', inplace=True)
    df['ZONE'].where(-filter_Z9, 'Z9', inplace=True)
    df['ZONE'].where(-filter_Z10, 'Z10', inplace=True)
    df = df['Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE']
    return df
data = yf.download('ba', period='500d', interval='1d')
df = convert_to_dataframe_daily(data)
print(df)

A few lines need to be tweaked.
Instead of
df = df.loc[:, ('Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE')]
use
df = df[['Date','Open','High','Low', 'Close','Volume','VWAP','K']]
before
df['ZONE'].where(-filter_Z1, 'Z1', inplace=True)
...
put a line
df['ZONE'] = 0
The line before return df should be changed to
df = df[['Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE']]
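Putting the three changes together, the tail of the function would look roughly like this (a sketch: calculate_rsi is the asker's own helper, and ~ is used instead of - for boolean negation, since unary minus on a boolean Series is not supported in recent pandas). Series.where(cond, other) keeps values where cond is True and writes other where it is False, so ~filter_Z1 writes 'Z1' exactly on the rows where filter_Z1 holds:
df = df[['Date','Open','High','Low', 'Close','Volume','VWAP','K']]
df['RSI'] = calculate_rsi(df)
# filter_Z1 through filter_Z10 defined as before
df['ZONE'] = 0
df['ZONE'].where(~filter_Z1, 'Z1', inplace=True)
# ... same pattern for filter_Z2 through filter_Z9 ...
df['ZONE'].where(~filter_Z10, 'Z10', inplace=True)
df = df[['Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE']]
return df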

Related

How to color plot lines based on amplitude

So I want to change the color of the blue vlines (matplotlib) in the above plot.
First I want to give the negative (< 0) values a different color and take their absolute value, so that only the amplitude is visible but they remain a different color than the positive ones. Positive values could remain unchanged.
Minimal reproducible code below:
import numpy as np
import random
import matplotlib.pyplot as plt
peakmzs = np.array([random.uniform(506, 2000) for i in range(2080)])
peakmzs = peakmzs[peakmzs.argsort()[::1]]
spec = np.zeros_like(peakmzs)
b = np.where((peakmzs > 1500) & (peakmzs < 1540))[0]
spec[b] = [random.uniform(0, 0.002) for i in range(len(b))]
b = np.where((peakmzs > 700) & (peakmzs < 820))[0]
spec[b] = [random.uniform(0, 0.05) for i in range(len(b))]
spec[300:302] = 0.07
b = np.where((peakmzs > 600) & (peakmzs < 650))[0]
spec[b] = [random.uniform(0, 0.03) for i in range(len(b))]
plt.vlines(peakmzs, spec, ymax=spec.max())
plt.show()
shp_values = np.zeros_like(peakmzs)
b = np.where((peakmzs > 1500) & (peakmzs < 1540))[0]
b_ = np.random.randint(1500, 1540, 10)
# print(b_)
shp_values[b] = [random.uniform(-0.003, 0.002) for i in range(len(b))]
shp_values[b_] = 0
b = np.where((peakmzs > 700) & (peakmzs < 820))[0]
shp_values[b] = [random.uniform(-0.004, 0.002) for i in range(len(b))]
b_ = np.random.randint(700, 820, 70)
shp_values[b_] = 0
# [random.uniform(-0.005, 0.003) for i in range(len(peakmzs))]
plt.plot(shp_values)
Based on the suggestion from @JohanC,
demo_shp = np.array(shapvalues[0][19])
colors = np.where(demo_shp < 0, 'cyan', 'pink')
plt.vlines(peakmzs, ymin=[0], ymax=demo_shp, colors=colors)
plt.show()
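For a self-contained variant of that idea using the shp_values array built above (a sketch; the color names are arbitrary):
# reuse peakmzs and shp_values from the snippet above
colors = np.where(shp_values < 0, 'cyan', 'pink')  # color each line by sign
plt.vlines(peakmzs, ymin=0, ymax=np.abs(shp_values), colors=colors)  # plot magnitudes only
plt.show()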

How to calculate all aggregations at once without using a loop over indices?

%%time
import random
import pandas as pd
random.seed(1)
df = pd.DataFrame({'val': random.sample(range(10), 10)})
for j in range(10):
    for i in df.index:
        df.loc[i, 'mean_last_{}'.format(j)] = df.loc[(df.index < i) & (df.index >= i - j), 'val'].mean()
        df.loc[i, 'std_last_{}'.format(j)] = df.loc[(df.index < i) & (df.index >= i - j), 'val'].std()
        df.loc[i, 'max_last_{}'.format(j)] = df.loc[(df.index < i) & (df.index >= i - j), 'val'].max()
        df.loc[i, 'min_last_{}'.format(j)] = df.loc[(df.index < i) & (df.index >= i - j), 'val'].min()
        df.loc[i, 'median_last_{}'.format(j)] = df.loc[(df.index < i) & (df.index >= i - j), 'val'].median()
You could use the rolling method; see for example:
import numpy as np
import pandas as pd

df = pd.DataFrame({'val': np.random.random(100)})
for i in range(10):
    agg = df["val"].rolling(i).aggregate(['mean', 'median'])
    df[[f"mean_{i}", f"median_{i}"]] = agg.values
I think what you're looking for is something like this:
import random
import pandas as pd

random.seed(1)
df = pd.DataFrame({'val': random.sample(range(10), 10)})
for j in range(1, 10):
    df[f'mean_last_{j}'] = df['val'].rolling(j, min_periods=1).mean()
    df[f'std_last_{j}'] = df['val'].rolling(j, min_periods=1).std()
    df[f'max_last_{j}'] = df['val'].rolling(j, min_periods=1).max()
    df[f'min_last_{j}'] = df['val'].rolling(j, min_periods=1).min()
    df[f'median_last_{j}'] = df['val'].rolling(j, min_periods=1).median()
However, my code is "off-by-one" relative to your example code. Do you intend for each aggregation to INCLUDE the value from the current row, or should it use only the previous j rows, without the current one? My code includes the current row, but yours does not. Your code results in NaN values for the first group of aggregations.
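If the current row should be excluded, matching the original loop, one option (a sketch) is to shift the series one step before rolling, so the window at row i covers rows i-j through i-1:
# exclude the current row: aggregate over rows i-j .. i-1
for j in range(1, 10):
    df[f'mean_last_{j}'] = df['val'].shift(1).rolling(j, min_periods=1).mean()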
Edit: The answer from @Carlos uses rolling(j).aggregate() to specify a list of aggregations in one line. Here's what that looks like:
import random
import pandas as pd

random.seed(1)
df = pd.DataFrame({'val': random.sample(range(10), 10)})
aggs = ['mean', 'std', 'max', 'min', 'median']
for j in range(10):
    stats = df["val"].rolling(j, min_periods=min(j, 1)).aggregate(aggs)
    df[[f"{a}_last_{j}" for a in aggs]] = stats.values

Binning data and plotting

I have a dataframe of essentially random numbers (except for one column), some of which are NaNs. MWE:
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
randomNumberGenerator = np.random.RandomState(1000)
z = 5 * randomNumberGenerator.rand(101)
A = 4 * z - 3 + randomNumberGenerator.randn(101)
B = 4 * z - 2 + randomNumberGenerator.randn(101)
C = 4 * z - 1 + randomNumberGenerator.randn(101)
D = 4 * z - 4 + randomNumberGenerator.randn(101)
A[50] = np.nan
A[:3] = np.nan
B[12:20] = np.nan
sources= pd.DataFrame({'z': z})
sources['A'] = A
sources['B'] = B
sources['C'] = C
sources['D'] = D
#sources= sources.dropna()
x = sources.z
y1 = sources.A
y2 = sources.B
y3 = sources.C
y4 = sources.D
for i in [y1, y2, y3, y4]:
    count = np.count_nonzero(~np.logical_or(np.isnan(x), np.isnan(i)))
    label = 'Points plotted: %d' % count
    plt.scatter(x, i, label=label)
plt.legend()
I need to bin the data according to x and plot different columns in each bin, in 3 side-by-side subplots:
x_1 <= 1 plot A-B | 1 < x_2 < 3 plot B+C | 3 < x_3 plot C-D
I've tried to bin the data with
x1 = sources[sources['z']<1] # z < 1
x2 = sources[sources['z']<3]
x2 = x2[x2['z']>=1] # 1<= z < 3
x3 = sources[sources['z']<max(z)]
x3 = x3[x3['z']>=3] # 3 <= z <= max(z)
x1 = x1['z']
x2 = x2['z']
x3 = x3['z']
but there's got to be a better way to go about it. What's the best way to produce something like this?
For binning in pandas, cut is used, so a solution is:
sources= pd.DataFrame({'z': z})
sources['A'] = A
sources['B'] = B
sources['C'] = C
sources['D'] = D
#sources= sources.dropna()
bins = pd.cut(sources['z'], [-np.inf, 1, 3, max(z)], labels=[1,2,3])
m1 = bins == 1
m2 = bins == 2
m3 = bins == 3
x11 = sources.loc[m1, 'z']
x12 = sources.loc[m1, 'z']
x21 = sources.loc[m2, 'z']
x22 = sources.loc[m2, 'z']
x31 = sources.loc[m3, 'z']
x32 = sources.loc[m3, 'z']
y11 = sources.loc[m1, 'A']
y12 = sources.loc[m1, 'B']
y21 = sources.loc[m2, 'B']
y22 = sources.loc[m2, 'C']
y31 = sources.loc[m3, 'C']
y32 = sources.loc[m3, 'D']
tups = [(x11, x12, y11, y12), (x21, x22, y21, y22), (x31, x32, y31, y32)]
fig, ax = plt.subplots(1, 3)
ax = ax.flatten()
for k, (i1, i2, j1, j2) in enumerate(tups):
    count1 = np.count_nonzero(~np.logical_or(np.isnan(i1), np.isnan(j1)))
    count2 = np.count_nonzero(~np.logical_or(np.isnan(i2), np.isnan(j2)))
    label1 = 'Points plotted: %d' % count1
    label2 = 'Points plotted: %d' % count2
    ax[k].scatter(i1, j1, label=label1)
    ax[k].scatter(i2, j2, label=label2)
    ax[k].legend()
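The same figure can be built a bit more compactly by looping over the masks and column pairs directly (a sketch reusing sources and the masks m1, m2, m3 from above):
pairs = [('A', 'B'), ('B', 'C'), ('C', 'D')]
fig, axes = plt.subplots(1, 3)
for axis, m, (c1, c2) in zip(axes, [m1, m2, m3], pairs):
    x = sources.loc[m, 'z']
    for c in (c1, c2):
        y = sources.loc[m, c]
        n = np.count_nonzero(~np.logical_or(np.isnan(x), np.isnan(y)))  # points with both coords present
        axis.scatter(x, y, label='Points plotted: %d' % n)
    axis.legend()
plt.show()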

zscore vs. minmax normalization: why their results look the same

I'm trying to normalize my time series with two different normalization methods, minmax and zscore, and compare the results. Here is my code:
import pandas as pd
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def scale_raw_data_zscore(raw_data):
    scaled_zscore = pd.DataFrame()
    idx = 514844
    values = raw_data.loc[idx]['d_column'].values
    values = values.reshape((len(values), 1))
    scaler = StandardScaler()
    scaler = scaler.fit(values)
    normalized = scaler.transform(values)
    normalized = normalized.reshape(normalized.shape[0])
    normalized = pd.DataFrame(normalized, index=raw_data.loc[idx].index, columns=raw_data.columns)
    scaled_zscore = scaled_zscore.append(normalized)
    return scaled_zscore

def scale_raw_data_minmax(raw_data):
    scaled_minmax = pd.DataFrame()
    idx = 514844
    values = raw_data.loc[idx]['d_column'].values
    values = values.reshape((len(values), 1))
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(values)
    normalized = scaler.transform(values)
    normalized = normalized.reshape(normalized.shape[0])
    normalized = pd.DataFrame(normalized, index=raw_data.loc[idx].index, columns=raw_data.columns)
    scaled_minmax = scaled_minmax.append(normalized)
    return scaled_minmax
def plot_data(raw_data, scaled_zscore, scaled_minmax):
    fig = pyplot.figure()
    idx = 514844
    ax1 = fig.add_subplot(311)
    ax2 = fig.add_subplot(312)
    ax3 = fig.add_subplot(313)
    raw_data.loc[idx].plot(kind='line', x='date', y='d_column', ax=ax1, title='ID: ' + str(idx), legend=False, figsize=(20, 5))
    scaled_zscore.reset_index(drop=True).plot(kind='line', y='d_column', ax=ax2, title='zscore', color='green', legend=False, figsize=(20, 5))
    scaled_minmax.reset_index(drop=True).plot(kind='line', y='d_column', ax=ax3, title='minmax', color='red', legend=False, figsize=(20, 5))
    pyplot.show()

scaled_zscore = scale_raw_data_zscore(raw_data)
scaled_minmax = scale_raw_data_minmax(raw_data)
plot_data(raw_data, scaled_zscore, scaled_minmax)
I'm adding the plot of the results. Why are the results of both scaling methods exactly the same? And why do they have a different pattern from the raw data?
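Worth noting: StandardScaler and MinMaxScaler both apply an affine map of the form (x - a) / b, so their outputs always have identical shape and differ only in y-axis scale and offset. A minimal check with synthetic data (a sketch, not the poster's raw_data):
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

x = np.random.rand(100, 1)
z = StandardScaler().fit_transform(x)  # (x - mean) / std
m = MinMaxScaler().fit_transform(x)    # (x - min) / (max - min)
print(np.corrcoef(z.ravel(), m.ravel())[0, 1])  # prints 1.0: perfectly correlated
The different pattern relative to the raw plot is likely just the x-axis: the raw series is plotted against 'date', while the scaled ones are plotted against a reset integer index.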

How to format data in Pandas to allow for the correct representation using Seaborn?

I am trying to generate the heatmap below. I have generated a figure, but I need help formatting the x-axis on the bottom so the names show up in the appropriate order. Any help is appreciated. Thank you!
Proposed Heatmap
Generated Heatmap
import numpy as np
import pandas as pd
import seaborn as sns

df = pd.read_table('/srv/data/shared/virus_data.txt', header=None)
df.set_index(0, inplace=True)
df_bgs = df.loc[:, df.isna().any(axis=0)].iloc[3:]
df.dropna(axis=1, inplace=True)
df.sort_values(['Treatment', 'Time'], axis=1, inplace=True)
dfn = df.iloc[3:].astype('float')
dfn.index.name = 'Gene'
col_names = df.loc['Time'] + ' ' + df.loc['Treatment'] + 'HR ' + df.loc['Replicate']
dfn.columns = col_names
df.columns = col_names
dfn = dfn.sub(df_bgs.mean(axis=1), axis=0)
dfn[dfn < 0] = 0
dfn = dfn.div(dfn.loc['HPRT1'], axis=1)
dfn.drop('HPRT1', axis=0, inplace=True)
dfn = np.log2(dfn + 0.01)
treatment2select = ['M', 'M+SNV', 'M+ANDV']
df_ec = dfn.loc[:, df.loc['Treatment'].str.contains('EC')]
df_m = dfn.loc[:, df.loc['Treatment'].isin(treatment2select)]

def row_z_score(df):
    return df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1), axis=0)

df_ec = row_z_score(df_ec)
df_m = row_z_score(df_m)
uni_treatment = df.loc['Treatment'].unique()
treatment2color = dict(zip(uni_treatment, sns.color_palette(palette="YlGn", n_colors=len(uni_treatment))))
col_colors = df.loc['Treatment'].map(treatment2color)
g = sns.clustermap(df_m, col_cluster=True,
                   col_colors=col_colors,
                   cmap='RdBu_r', method='ward')
g.ax_col_dendrogram.set_visible(False)
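If the names on the bottom should keep the sorted Treatment/Time order of df_m rather than the order chosen by the clustering, a minimal tweak (a sketch) is to disable column clustering, since col_cluster=True reorders columns by similarity:
g = sns.clustermap(df_m, col_cluster=False,  # keep df_m's own column order
                   col_colors=col_colors,
                   cmap='RdBu_r', method='ward')
g.ax_col_dendrogram.set_visible(False)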