I am calculating some very simple daily stock indicators in a data frame (e.g. SMA, VWAP, RSI). After I upgraded to Anaconda 3.0, my code stopped working and now gives the following error. I don't have much experience in coding and need some help.
KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['RSI', 'ZONE'], dtype='object'). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"
The code follows.
import yfinance as yf
import pandas as pd
def convert_to_dataframe_daily(data):
    """Add VWAP, stochastic %K, RSI and a stochastic zone to daily OHLCV data.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history with 'Open', 'High', 'Low', 'Close' and 'Volume'
        columns. Assumes the index is named 'Date' so that it becomes a
        'Date' column after ``reset_index`` (TODO confirm for the data
        source in use; intraday downloads may name it differently).

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
        'K', 'RSI', 'ZONE', in that order. 'ZONE' is 'Z1'..'Z10' and is
        missing wherever 'K' is NaN (e.g. the first four rows).

    Notes
    -----
    Relies on ``calculate_rsi`` being defined elsewhere in the module.
    """
    # Kept from the original: show every column when the frame is printed.
    pd.set_option('display.max_columns', None)

    # Cumulative volume-weighted average price.
    volume = data['Volume']
    close = data['Close']
    vwap = ((volume * close).cumsum() / volume.cumsum()).ffill()
    df = data.assign(VWAP=vwap)

    # Turn the date index into an ordinary column.
    df.reset_index(inplace=True)

    # Stochastic %K over a 5-row window, expressed as a 0..1 fraction.
    low5 = df['Low'].rolling(5).min()
    high5 = df['High'].rolling(5).max()
    df['K'] = (df['Close'] - low5) / (high5 - low5)

    # Only select columns that already exist: asking for labels that are not
    # there yet ('RSI', 'ZONE') is what raised the KeyError.
    df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'K']].copy()
    df['RSI'] = calculate_rsi(df)

    # Right-inclusive bins reproduce the original thresholds:
    # K <= 0.1 -> Z1, 0.1 < K <= 0.2 -> Z2, ..., 0.9 < K <= 1 -> Z10.
    zone_edges = [float('-inf'), 0.1, 0.2, 0.3, 0.4, 0.5,
                  0.6, 0.7, 0.8, 0.9, 1]
    zone_labels = [f'Z{n}' for n in range(1, 11)]
    df['ZONE'] = pd.cut(df['K'], bins=zone_edges, labels=zone_labels)

    # A list (not a tuple) is required to select several columns.
    return df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume',
               'VWAP', 'K', 'RSI', 'ZONE']]
# Download 500 daily bars for ticker 'ba', derive the indicator columns and
# print the resulting table.
data = yf.download('ba', interval='1d', period='500d')
df = convert_to_dataframe_daily(data)
print(df)
A few lines need to be tweaked
Instead of
df = df.loc[:, ('Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE')]
use
df = df[['Date','Open','High','Low', 'Close','Volume','VWAP','K']]
before
df['ZONE'].where(-filter_Z1, 'Z1', inplace=True)
...
put a line
df['ZONE'] = 0
The line before return df should be changed to
df = df[['Date','Open','High','Low', 'Close','Volume','VWAP','K','RSI', 'ZONE']]
Related
So I want to change the color of the blue vlines(matplotlib) in the above plot.
First, I want to give the negative (< 0) values a different color and plot their absolute value, so that only the amplitude is visible but they are still distinguishable by color from the positive ones. Positive values can remain unchanged.
minimum reproducible code as below:
import numpy as np
import random
import matplotlib.pyplot as plt
# 2080 random m/z positions in [506, 2000], sorted ascending.
peakmzs = np.sort(np.array([random.uniform(506, 2000) for _ in range(2080)]))

# Synthetic spectrum: zero everywhere except three m/z bands and two spikes.
spec = np.zeros_like(peakmzs)
band = np.flatnonzero((peakmzs > 1500) & (peakmzs < 1540))
spec[band] = [random.uniform(0, 0.002) for _ in band]
band = np.flatnonzero((peakmzs > 700) & (peakmzs < 820))
spec[band] = [random.uniform(0, 0.05) for _ in band]
spec[300:302] = 0.07
band = np.flatnonzero((peakmzs > 600) & (peakmzs < 650))
spec[band] = [random.uniform(0, 0.03) for _ in band]

plt.vlines(peakmzs, spec, ymax=spec.max())
plt.show()

# Synthetic signed values in two of the bands, with some entries zeroed.
# NOTE: the zeroed positions are array indices drawn from the given integer
# ranges, not m/z values.
shp_values = np.zeros_like(peakmzs)
band = np.flatnonzero((peakmzs > 1500) & (peakmzs < 1540))
zeroed = np.random.randint(1500, 1540, 10)
shp_values[band] = [random.uniform(-0.003, 0.002) for _ in band]
shp_values[zeroed] = 0
band = np.flatnonzero((peakmzs > 700) & (peakmzs < 820))
shp_values[band] = [random.uniform(-0.004, 0.002) for _ in band]
zeroed = np.random.randint(700, 820, 70)
shp_values[zeroed] = 0

plt.plot(shp_values)
Based on the suggestion from @JohanC:
# One vertical line per peak, from 0 to the value; negative values are drawn
# in cyan and all others in pink.
demo_shp = np.array(shapvalues[0][19])
is_negative = demo_shp < 0
colors = np.where(is_negative, 'cyan', 'pink')
plt.vlines(x=peakmzs, ymin=[0], ymax=demo_shp, colors=colors)
plt.show()
How to calculate all aggregations at once without using a loop over indices?
%%time
import random
random.seed(1)
df = pd.DataFrame({'val': random.sample(range(10), 10)})

# For every look-back length j and every row i, aggregate the (up to) j rows
# that immediately precede row i; row i itself is excluded, so j == 0 and
# the first row always produce NaN.
for j in range(10):
    for i in df.index:
        in_window = (df.index >= i - j) & (df.index < i)
        previous = df.loc[in_window, 'val']
        df.loc[i, f'mean_last_{j}'] = previous.mean()
        df.loc[i, f'std_last_{j}'] = previous.std()
        df.loc[i, f'max_last_{j}'] = previous.max()
        df.loc[i, f'min_last_{j}'] = previous.min()
        df.loc[i, f'median_last_{j}'] = previous.median()
You could use the rolling method, see for example:
df = pd.DataFrame({'val': np.random.random(100)})
# One rolling window per size 0..9; both statistics come from a single
# aggregate call and are stored as two new columns.
for window in range(10):
    rolled = df['val'].rolling(window).agg(['mean', 'median'])
    df[[f"mean_{window}", f"median_{window}"]] = rolled.to_numpy()
I think what you're looking for is something like this:
import random
random.seed(1)
df = pd.DataFrame({'val': random.sample(range(10), 10)})

# Rolling windows of 1..9 rows that include the current row; min_periods=1
# lets shorter windows at the top of the frame produce a value.
for j in range(1, 10):
    window = df['val'].rolling(j, min_periods=1)
    for stat in ('mean', 'std', 'max', 'min', 'median'):
        df[f'{stat}_last_{j}'] = getattr(window, stat)()
However, my code is "off by one" relative to your example code. Do you intend for each aggregation to include the value from the current row, or should it only use the previous j rows, without the current one? My code includes the current row, but yours does not. Your code results in NaN values for the first group of aggregations.
Edit: The answer from @Carlos uses rolling(j).aggregate() to specify a list of aggregations in one line. Here's what that looks like:
import random
random.seed(1)
df = pd.DataFrame({'val': random.sample(range(10), 10)})
aggs = ['mean', 'std', 'max', 'min', 'median']

# For each window size 0..9, compute every statistic in one aggregate call
# and store the results as '<stat>_last_<j>' columns.
for j in range(10):
    min_obs = 1 if j >= 1 else 0  # same value as min(j, 1) for j >= 0
    rolled = df['val'].rolling(window=j, min_periods=min_obs).agg(aggs)
    new_columns = [f"{name}_last_{j}" for name in aggs]
    df[new_columns] = rolled.to_numpy()
I have a dataframe of essentially random numbers, (except for one column), some of which are NaNs. MWE:
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Seeded generator so the example is reproducible.
randomNumberGenerator = np.random.RandomState(1000)
z = 5 * randomNumberGenerator.rand(101)

# Four noisy linear functions of z (draw order A, B, C, D matters for the
# seeded generator).
A = 4 * z - 3 + randomNumberGenerator.randn(101)
B = 4 * z - 2 + randomNumberGenerator.randn(101)
C = 4 * z - 1 + randomNumberGenerator.randn(101)
D = 4 * z - 4 + randomNumberGenerator.randn(101)

# Knock out a few values to simulate missing data.
A[50] = np.nan
A[:3] = np.nan
B[12:20] = np.nan

sources = pd.DataFrame({'z': z, 'A': A, 'B': B, 'C': C, 'D': D})
#sources= sources.dropna()

x = sources.z
y1, y2, y3, y4 = sources.A, sources.B, sources.C, sources.D

# Scatter each series against z; the legend reports how many points have
# both coordinates present.
for i in (y1, y2, y3, y4):
    both_present = ~(np.isnan(x) | np.isnan(i))
    count = np.count_nonzero(both_present)
    plt.scatter(x, i, label='Points plotted: %d' % count)
plt.legend()
I need to bin the data according to x and plot different columns in each bin, in 3 side-by-side subplots:
x_1 <= 1 plot A-B | 1 < x_2 < 3 plot B+C | 3 < x_3 plot C-D
I've tried to bin the data with
# Split the z values into three bins with boolean masks.
# The original upper bin used `z < max(z)`, which silently dropped the row
# holding the maximum even though the comment promised `3 <= z <= max(z)`.
z_values = sources['z']
x1 = z_values[z_values < 1]                        # z < 1
x2 = z_values[(z_values >= 1) & (z_values < 3)]    # 1 <= z < 3
x3 = z_values[z_values >= 3]                       # 3 <= z <= max(z)
but there's got to be a better way to go about it. What's the best way to produce something like this?
For binning, pandas provides cut, so a solution is:
sources = pd.DataFrame({'z': z, 'A': A, 'B': B, 'C': C, 'D': D})
#sources= sources.dropna()

# Right-inclusive bins: z <= 1 -> 1, 1 < z <= 3 -> 2, z > 3 -> 3.
# An open upper edge is used instead of max(z) so the result does not depend
# on how max() treats NaN.
bins = pd.cut(sources['z'], [-np.inf, 1, 3, np.inf], labels=[1, 2, 3])

# (bin label, first column, second column) for each of the three panels.
# NOTE(review): the request "plot A-B" is read here as "plot A and B";
# confirm that a difference/sum of the columns was not intended.
panels = [(1, 'A', 'B'), (2, 'B', 'C'), (3, 'C', 'D')]

fig, ax = plt.subplots(1, 3)
ax = ax.flatten()
for k, (bin_label, first, second) in enumerate(panels):
    in_bin = sources[bins == bin_label]
    # The x-coordinate is z; the original took x from the same column as y,
    # which plotted every series against itself.
    x = in_bin['z']
    for column in (first, second):
        y = in_bin[column]
        count = np.count_nonzero(~np.logical_or(np.isnan(x), np.isnan(y)))
        ax[k].scatter(x, y, label='Points plotted: %d' % count)
    ax[k].legend()
I'm trying to normalize my time series with two different normalization methods, minmax and zscore, and compare the results. Here is my code:
def scale_raw_data_zscore(raw_data, idx=514844):
    """Z-score normalize the 'd_column' values of one series.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame with a 'd_column' column; ``raw_data.loc[idx]`` must select
        several rows (presumably one series out of a multi-series frame —
        confirm against the caller).
    idx : label, optional
        Index label of the series to scale. Defaults to the previously
        hard-coded 514844.

    Returns
    -------
    pandas.DataFrame
        Single column 'd_column' holding the scaled values, indexed like
        ``raw_data.loc[idx]``.

    Notes
    -----
    The original built the result with ``DataFrame.append`` (removed in
    pandas 2.0) and labelled the one-column result with all of
    ``raw_data.columns``, which fails when the input has other columns.
    """
    subset = raw_data.loc[idx]
    # scikit-learn scalers expect a 2-D (n_samples, n_features) array.
    values = subset['d_column'].values.reshape(-1, 1)
    normalized = StandardScaler().fit_transform(values).ravel()
    return pd.DataFrame({'d_column': normalized}, index=subset.index)
def scale_raw_data_minmax(raw_data, idx=514844):
    """Min-max normalize the 'd_column' values of one series to [0, 1].

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame with a 'd_column' column; ``raw_data.loc[idx]`` must select
        several rows (presumably one series out of a multi-series frame —
        confirm against the caller).
    idx : label, optional
        Index label of the series to scale. Defaults to the previously
        hard-coded 514844.

    Returns
    -------
    pandas.DataFrame
        Single column 'd_column' holding the scaled values, indexed like
        ``raw_data.loc[idx]``.

    Notes
    -----
    The original built the result with ``DataFrame.append`` (removed in
    pandas 2.0) and labelled the one-column result with all of
    ``raw_data.columns``, which fails when the input has other columns.
    """
    subset = raw_data.loc[idx]
    # scikit-learn scalers expect a 2-D (n_samples, n_features) array.
    values = subset['d_column'].values.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    normalized = scaler.fit_transform(values).ravel()
    return pd.DataFrame({'d_column': normalized}, index=subset.index)
def plot_data(raw_data, scaled_zscore, scaled_minmax, idx=514844):
    """Plot one raw series and its two scaled versions in stacked panels.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame with 'date' and 'd_column' columns; ``raw_data.loc[idx]`` is
        the series drawn in the top panel.
    scaled_zscore, scaled_minmax : pandas.DataFrame
        Frames with a 'd_column' column, drawn in the middle and bottom
        panels against their row position (the index is reset).
    idx : label, optional
        Index label of the series to plot. Defaults to the previously
        hard-coded 514844.
    """
    fig = pyplot.figure()
    ax1 = fig.add_subplot(311)
    ax2 = fig.add_subplot(312)
    ax3 = fig.add_subplot(313)

    # Options shared by all three panels.
    common = dict(kind='line', y='d_column', legend=False, figsize=(20, 5))

    raw_data.loc[idx].plot(x='date', ax=ax1, title='ID: ' + str(idx), **common)
    scaled_zscore.reset_index(drop=True).plot(
        ax=ax2, title='zscore', color='green', **common)
    scaled_minmax.reset_index(drop=True).plot(
        ax=ax3, title='minmax', color='red', **common)
    pyplot.show()
# Scale the same raw series both ways, then draw raw / z-score / min-max.
scaled_zscore, scaled_minmax = (
    scale_raw_data_zscore(raw_data),
    scale_raw_data_minmax(raw_data),
)
plot_data(raw_data, scaled_zscore, scaled_minmax)
I'm adding the plot of the results. Why are the results of both scaling methods exactly the same? And why do they have a different pattern from the raw data?
I am trying to generate the heatmap below. I have generated a figure, but I need help formatting the x-axis on the bottom to have the names show up in the appropriate order. Any help is appreciated. Thank You!
Proposed Heatmap
Generated Heatmap
# Load the table without a header row and use its first column as the index.
df = pd.read_table('/srv/data/shared/virus_data.txt', header=None)
df.set_index(0, inplace=True)
# Columns containing any NaN are split off (rows from position 3 onward) and
# then removed from df; presumably these are the background samples — TODO confirm.
df_bgs = df.loc[:, df.isna().any(axis=0)].iloc[3:]
df.dropna(axis=1, inplace=True)
# Reorder the columns by the values in the 'Treatment' and 'Time' rows.
df.sort_values(['Treatment', 'Time'], axis=1, inplace=True)
# Rows from position 3 onward are the numeric part; presumably the first three
# rows are the Treatment/Time/Replicate metadata — TODO confirm.
dfn = df.iloc[3:].astype('float')
dfn.index.name = 'Gene'
# Build one label per column from the three metadata rows.
# NOTE(review): 'HR ' is appended after Treatment, not after Time — confirm
# that the Time/Treatment order in the label is intended.
col_names = df.loc['Time'] + ' ' + df.loc['Treatment'] + 'HR ' + \
df.loc['Replicate']
dfn.columns = col_names
df.columns = col_names
# Subtract the per-row mean of df_bgs from every column, then clip negatives to 0.
dfn = dfn.sub(df_bgs.mean(axis=1), axis=0)
dfn[dfn<0] = 0
# Divide every column by its 'HPRT1' value, then drop the 'HPRT1' row.
dfn = dfn.div(dfn.loc['HPRT1'], axis=1)
dfn.drop('HPRT1', axis=0, inplace=True)
# log2 transform; the 0.01 offset keeps zero entries finite.
dfn = np.log2(dfn+0.01)
treatment2select = ['M', 'M+SNV', 'M+ANDV']
# Two column subsets: treatments containing 'EC', and the treatments listed above.
df_ec = dfn.loc[:, df.loc['Treatment'].str.contains('EC')]
df_m = dfn.loc[:, df.loc['Treatment'].isin(treatment2select)]
def row_z_score(df):
    """Return *df* with each row centered on its mean and divided by its std."""
    row_mean = df.mean(axis=1)
    row_std = df.std(axis=1)
    centered = df.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)
# Standardize every row (gene) within each of the two column subsets.
df_ec = row_z_score(df_ec)
df_m = row_z_score(df_m)
# Map each distinct treatment to one color of the "YlGn" palette and build
# the per-column color annotation from it.
uni_treatment = df.loc['Treatment'].unique()
treatment2color = dict(zip(uni_treatment,sns.color_palette(palette="YlGn",
n_colors=len(uni_treatment))))
col_colors = df.loc['Treatment'].map(treatment2color)
# NOTE(review): col_cluster=True asks clustermap to cluster the columns, which
# presumably reorders them on the x-axis even though the column dendrogram is
# hidden below. If the sorted column order must be kept, col_cluster=False is
# probably what is wanted — confirm against the seaborn documentation.
g = sns.clustermap(df_m, col_cluster=True,
col_colors = col_colors,
cmap='RdBu_r', method='ward')
g.ax_col_dendrogram.set_visible(False)