plotting a beautiful timeseries plot - pandas

I have three columns of time series data. I would like to plot the three series neatly, one above the other, in a single figure. A star marker needs to be placed at the 5th, 8th, and 10th data points of the first, second, and third series respectively. My goal is to implement this in Python. I would appreciate a more efficient way of doing it.
My code:
import matplotlib.pyplot as plt
import numpy as np
data=np.loadtxt("data_3_timeseries")
data.plot()
plt.show()
data_3_timeseries is attached here
-0.00831 -0.0213 -0.0182
0.0105 -0.00767 -0.012
0.00326 0.0148 -0.00471
-0.0263 -0.00318 0.011
0.012 0.0398 0.0117
-0.0156 -0.0133 0.02
-0.0482 -0.00783 -0.0162
0.0103 -0.00639 0.0103
0.0158 0.000788 -0.00484
-0.000704 -0.0236 0.00579
0.00151 -0.0135 -0.0195
-0.0163 -0.00185 0.00722
0.0207 0.00998 -0.0387
-0.0246 -0.0274 -0.0108
0.0123 -0.0155 0.0137
-0.00963 0.0023 0.0305
-0.0147 0.0255 -0.00806
0.000488 -0.0187 5.29e-05
-0.0167 0.0105 -0.0204
0.00653 0.0176 -0.00643
0.0154 -0.0136 0.00415
-0.0147 -0.00339 0.0175
-0.0238 -0.00284 0.0204
-0.00629 0.0205 -0.017
0.00449 -0.0135 -0.0127
0.00843 -0.0167 0.00903
-0.00331 7.2e-05 -0.00281
-0.0043 0.0047 0.00681
-0.0356 0.0214 0.0158
-0.0104 -0.0165 0.0092
0.00599 -0.0128 -0.0202
0.015 -0.0272 0.0117
0.012 0.0258 -0.0154
-0.00509 -0.0194 0.00219
-0.00154 -0.00778 -0.00483
-0.00152 -0.0451 0.0187
0.0271 0.0186 -0.0133
-0.0146 -0.0416 0.0154
-0.024 0.00295 0.006
-0.00889 -0.00501 -0.028
-0.00555 0.0124 -0.00406
-0.0185 -0.0114 0.0224
0.0143 0.0204 -0.0193
-0.0168 -0.00608 0.00178
-0.0159 0.0189 0.0109
-0.0213 -0.007 -0.0323
0.0031 0.0207 -0.00333
-0.0202 -0.0157 -0.0105
0.0159 0.00216 -0.0262
0.0105 -0.00292 0.00447
0.0126 0.0163 -0.0141
0.01 0.00679 0.025
0.0237 -0.0142 -0.0149
0.00394 -0.0379 0.00905
-0.00803 0.0186 -0.0176
-0.013 0.0162 0.0208
-0.00197 0.0313 -0.00804
0.0218 -0.0249 0.000412
-0.0164 0.00681 -0.0109
-0.0162 -0.00795 -0.00279
-0.01 -0.00977 -0.0194
-0.00723 -0.0464 0.00453
-0.000533 0.02 -0.0193
0.00706 0.0391 0.0194

Because your three columns contain values in the same small range, the lines plot almost on top of one another.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
ax = data.plot()
for col, place, line in zip(list('ABC'), [5, 8, 10], ax.lines):
    ax.plot(place, data[col][place], marker="*", c=line.get_color())
plt.show()
"NORMALIZED"
data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
data['B'] = data['B'].apply(lambda x: x + 0.3)
data['C'] = data['C'].apply(lambda x: x + 0.6)
ax = data.plot()
for col, place, line in zip(list('ABC'), [5, 8, 10], ax.lines):
    ax.plot(place, data[col][place], marker="*", c=line.get_color())
plt.show()
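As a compact alternative (a sketch, not part of the answer above, assuming the same data_3_timeseries file), matplotlib's markevery parameter can place the star directly on each line at the requested index:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
fig, ax = plt.subplots()
for col, place in zip('ABC', [5, 8, 10]):
    # markevery=[place] draws a single star marker at that index of the line
    ax.plot(data.index, data[col], marker='*', markevery=[place], label=col)
ax.legend()
plt.show()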

Something like this? Note: I did not use your columns correctly, but the principle stands.
markers = [5, 8, 10]
plt.plot(data[:len(data) // 3],
         color='blue')
for marker in markers:
    plt.plot(marker,
             data[:len(data) // 3][marker],
             marker="*",
             color='blue')
plt.plot(data[len(data) // 3:len(data) // 3 * 2],
         color='orange')
plt.plot(data[len(data) // 3 * 2:len(data) // 3 * 3],
         color='green')
I only placed markers for the first data column, but you can do the same for all the columns.
You can of course prettify the plot further; matplotlib has plenty of options.
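For completeness, here is a sketch (not from the original answer) that applies the same idea to each of the three columns directly, with one star per column:
import matplotlib.pyplot as plt
import numpy as np

data = np.loadtxt("data_3_timeseries")   # shape (74, 3): one column per series
marker_positions = [5, 8, 10]
colors = ['blue', 'orange', 'green']

for col, pos, color in zip(range(data.shape[1]), marker_positions, colors):
    plt.plot(data[:, col], color=color)
    # star at the requested index of this series
    plt.plot(pos, data[pos, col], marker='*', color=color)
plt.show()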

Related

changing the axis values in matplotlib plot

In continuation of the accepted answer to "plotting a beautiful timeseries plot", I want to rescale the y-axis tick values after plotting by dividing them by a floating point number (0.2), so that the y-axis reads 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, without changing the time series themselves.
The code is
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
data['B'] = data['B'].apply(lambda x: x + 0.3)
data['C'] = data['C'].apply(lambda x: x + 0.6)
ax = data.plot()
for col, place, line in zip(list('ABC'), [5, 8, 10], ax.lines):
    ax.plot(place, data[col][place], marker="*", c=line.get_color())
plt.show()
data_3_timeseries is the same data file shown in the linked question above.
I've tried to be as detailed with the comments as I can, I hope this will be clear:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

COLUMNS = ['A', 'B', 'C']  # If you have more columns you can add them here
MARKS = [('A', 5), ('B', 8), ('C', 10), ('A', 20), ('C', 25)]  # You can add more marks here
# Here you can add/edit colors for the lines and the markers, and add new columns if they exist
COLORS_DICT = {'A': {'Line': 'Purple', 'Marker': 'Blue'},
               'B': {'Line': 'Red', 'Marker': 'Green'},
               'C': {'Line': 'Brown', 'Marker': 'Orange'}}
FACTOR = 6        # the factor
SPACER = 1        # this spacer together with the factor gives the y axis 0.5 gaps
MARKER = '*'      # star marker, can be altered
LINE_WIDTH = 0.5  # the width of the lines
COLORS = True     # True for colors, False for black

data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=COLUMNS)
for i, col in enumerate(COLUMNS):  # iterate through the columns
    data[col] = data[col].apply(lambda x: x * FACTOR + i * SPACER)  # apply the factor and the spacer to each column
ax = data.plot()
ax.get_legend().remove()  # remove the columns' legend (if COLORS is False there's no need for a legend)
for col, line in zip(COLUMNS, ax.lines):  # iterate through the columns and lines
    if COLORS:
        line.set_color(COLORS_DICT[col]['Line'])
    else:
        line.set_color('Black')
    line.set_linewidth(LINE_WIDTH)
for col, mark in MARKS:
    ax.plot(mark, data[col][mark], marker=MARKER, c=COLORS_DICT[col]['Marker'] if COLORS else 'Black')
plt.show()
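If the goal is only to relabel the y-axis ticks by the 0.2 divisor mentioned in the question, without rescaling the data at all, a FuncFormatter can do that. This is a sketch of an alternative approach, not part of the answer above:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter

data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
data['B'] += 0.3
data['C'] += 0.6
ax = data.plot()
# Only the tick labels change: each tick value is divided by 0.2 for display.
ax.yaxis.set_major_formatter(FuncFormatter(lambda val, pos: f'{val / 0.2:g}'))
plt.show()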

How to normalize the values between zero and one?

I have an ndarray of shape (74,):
[-1.995 1.678 -2.535 1.739 -1.728 -1.268 -0.727 -3.385 -2.348
-3.021 0.5293 -0.4573 0.5137 -3.047 -4.75 -1.847 2.922 -0.989
-1.507 -0.9224 -2.545 6.957 0.9985 -2.035 -3.234 -2.848 -1.971
-3.246 2.057 -1.991 -6.27 9.22 0.4045 -2.703 -1.577 4.066
7.215 -4.07 12.98 -3.02 1.456 9.44 6.49 0.272 2.07
1.625 -3.531 -2.846 -4.914 -0.536 -3.496 -1.095 -2.719 -0.5825
5.535 -0.1753 3.658 4.234 4.543 -0.8384 -2.705 -2.012 -6.56
10.5 -2.021 -2.48 1.725 5.69 3.672 -6.855 -3.887 1.761
6.926 -4.848 ]
I need to normalize this vector so that its values lie between [0, 1], and then so that the sum of the values in the vector equals 1.
You can use min-max scaling to bring the values into [0, 1]:
import numpy as np

min_val = np.min(original_arr)
max_val = np.max(original_arr)
normalized_arr = (original_arr - min_val) / (max_val - min_val)
And you can divide by the sum to make the array sum to 1:
new_arr = original_arr / original_arr.sum()
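If both constraints are needed at once (values in [0, 1] and a total of 1), one option is to min-max scale first and then divide by the sum; after the division the entries are still non-negative and each is at most the total, so they stay in [0, 1]. A minimal sketch with a shortened stand-in for the (74,) array:
import numpy as np

# Hypothetical input standing in for the array from the question.
original_arr = np.array([-1.995, 1.678, -2.535, 1.739, -1.728, -1.268])

# Min-max scale to [0, 1], then normalize so the entries sum to 1.
scaled = (original_arr - original_arr.min()) / (original_arr.max() - original_arr.min())
weights = scaled / scaled.sum()

print(weights, weights.sum())  # values in [0, 1], sum == 1.0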

Doing an ML prediction but the prices appear the same as in the CSV

I am learning ML and running my code to make a prediction. When I run the code, the predicted prices are identical to the prices in the CSV. What am I doing wrong?
----CODE---
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data = melbourne_data.dropna(axis=0)
y = melbourne_data.Price
melbourne_features = ['Rooms', 'Price', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]
print(X.describe())
print(X.head())
melbourne_model = DecisionTreeRegressor(random_state=1)
melbourne_model.fit(X, y)
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(melbourne_model.predict(X.head()))
-----OUTPUT----
Rooms Price ... Lattitude Longtitude
count 6196.000000 6.196000e+03 ... 6196.000000 6196.000000
mean 2.931407 1.068828e+06 ... -37.807904 144.990201
std 0.971079 6.751564e+05 ... 0.075850 0.099165
min 1.000000 1.310000e+05 ... -38.164920 144.542370
25% 2.000000 6.200000e+05 ... -37.855438 144.926198
50% 3.000000 8.800000e+05 ... -37.802250 144.995800
75% 4.000000 1.325000e+06 ... -37.758200 145.052700
max 8.000000 9.000000e+06 ... -37.457090 145.526350
[8 rows x 6 columns]
Rooms Price Bathroom Landsize Lattitude Longtitude
1 2 1035000.0 1.0 156.0 -37.8079 144.9934
2 3 1465000.0 2.0 134.0 -37.8093 144.9944
4 4 1600000.0 1.0 120.0 -37.8072 144.9941
6 3 1876000.0 2.0 245.0 -37.8024 144.9993
7 2 1636000.0 1.0 256.0 -37.8060 144.9954
Making predictions for the following 5 houses:
Rooms Price Bathroom Landsize Lattitude Longtitude
1 2 1035000.0 1.0 156.0 -37.8079 144.9934
2 3 1465000.0 2.0 134.0 -37.8093 144.9944
4 4 1600000.0 1.0 120.0 -37.8072 144.9941
6 3 1876000.0 2.0 245.0 -37.8024 144.9993
7 2 1636000.0 1.0 256.0 -37.8060 144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]
First, split your data into a train set and a test set.
Next, train the model by calling .fit() on your x_train and y_train datasets.
Then, call .predict() on x_test and assign the resulting predictions to the y_pred variable.
Finally, make sure not to include the column you are trying to predict (Price) in melbourne_features.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data = melbourne_data.dropna(axis=0)
y = melbourne_data.Price
# Make sure not to include the column that you are trying to predict.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]
print(X.describe())
print(X.head())
# Use test_size=0.50 to hold out 50 percent of the data for testing and train on the other 50 percent.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.50)
melbourne_model = DecisionTreeRegressor(random_state=1)
# Alternatively, RandomForestRegressor usually gives a lower mean absolute error than DecisionTreeRegressor.
# melbourne_model = RandomForestRegressor(n_estimators=1000)
# Fit on the training data only. In other words, train the model.
melbourne_model.fit(x_train, y_train)
# Finally, make a prediction on the held-out test data.
y_pred = melbourne_model.predict(x_test)
print("Making predictions for the following 5 houses:")
print(x_test.head())
print("The predictions are")
print(pd.DataFrame({'Actual Price': y_test,
                    'Predicted Price': y_pred}))
# The mean absolute error is a single number you can add to or subtract from a
# predicted price to get a sense of how far it is from the actual price.
# Your goal is to have as low a mean absolute error as possible.
print(f'Mean Absolute Error : {mean_absolute_error(y_test, y_pred)}')
Source:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
https://www.geeksforgeeks.org/python-decision-tree-regression-using-sklearn/
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html
Additional Reference:
https://www.youtube.com/watch?v=PaFPbb66DxQ
https://www.youtube.com/watch?v=YSB7FtzeicA
https://www.youtube.com/watch?v=BFaadIqWlAg
https://www.youtube.com/watch?v=ENvSybznF_o
https://www.youtube.com/watch?v=yXoxdXMvD7c

Jupyter "TypeError: invalid type comparison"

Alright, I just started a new job and I have been tasked with writing a simple notebook in Jupyter. I really want to impress my supervisor and have been working on this code for hours and can't get it to work; hopefully somebody here can help me.
Here is the code I have been working on:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
df = pd.read_csv(r'C:\Users\jk2588\Documents\EDA\EDA Practice\top1000_dataset.csv', converters={'GENDER': lambda x: int(x == 'Male')}, usecols = ['MEMBER_ID', 'GENDER', 'Age', 'Dement'])
df_gp_1 = df[['MEMBER_ID', 'Dement']].groupby('MEMBER_ID').agg(np.mean).reset_index()
df_gp_2 = df[['MEMBER_ID', 'GENDER', 'Age']].groupby('MEMBER_ID').agg(max).reset_index()
df_gp = pd.merge(df_gp_1, df_gp_2, on = ['MEMBER_ID'])
df.head()
Output:
MEMBER_ID Age Dement GENDER
0 000000002 01 36 NaN 0
1 000000002 01 36 NaN 0
2 000000002 01 36 NaN 0
3 000000002 01 36 NaN 0
4 000000002 01 36 NaN 0
df['Dement'] = df['Dement'].fillna(0)
df['Dement'] = df['Dement'].astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
MEMBER_ID 999 non-null object
Age 999 non-null int64
Dement 999 non-null int64
GENDER 999 non-null int64
dtypes: int64(3), object(1)
memory usage: 31.3+ KB
freq = ((df_gp.Age.value_counts(normalize=True).reset_index().sort_values(by='index').Age) * 100).tolist()
number_gp = 7

def ax_settings(ax, var_name, x_min, x_max):
    ax.set_xlim(x_min, x_max)
    ax.set_yticks([])
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_edgecolor('#444444')
    ax.spines['bottom'].set_linewidth(2)
    ax.text(0.02, 0.05, var_name, fontsize=17, fontweight="bold", transform=ax.transAxes)
    return None

fig = plt.figure(figsize=(12, 7))
gs = gridspec.GridSpec(nrows=number_gp,
                       ncols=2,
                       figure=fig,
                       width_ratios=[3, 1],
                       height_ratios=[1] * number_gp,
                       wspace=0.2, hspace=0.05)
ax = [None] * (number_gp + 1)
features = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
for i in range(number_gp):
    ax[i] = fig.add_subplot(gs[i, 0])
    ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement,
                ax=ax[i], shade=True, color="blue", bw=300, legend=False)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement,
                ax=ax[i], shade=True, color="red", bw=300, legend=False)
    if i < (number_gp - 1):
        ax[i].set_xticks([])
ax[0].legend(['Male', 'Female'], facecolor='w')
ax[number_gp] = fig.add_subplot(gs[:, 1])
ax[number_gp].spines['right'].set_visible(False)
ax[number_gp].spines['top'].set_visible(False)
ax[number_gp].barh(features, freq, color='#004c99', height=0.4)
ax[number_gp].set_xlim(0, 100)
ax[number_gp].invert_yaxis()
ax[number_gp].text(1.09, -0.04, '(%)', fontsize=10, transform=ax[number_gp].transAxes)
ax[number_gp].tick_params(axis='y', labelsize=14)
plt.show()
I am then met with:
C:\Users\jk2588\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py:1167: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
result = method(y)
--------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-38-8665030edb1c> in <module>()
24 ax[i] = fig.add_subplot(gs[i, 0])
25 ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
---> 26 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
27 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
28 if i < (number_gp - 1): ax[i].set_xticks([])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1281
1282 with np.errstate(all='ignore'):
-> 1283 res = na_op(values, other)
1284 if is_scalar(res):
1285 raise TypeError('Could not compare {typ} type with Series'
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in na_op(x, y)
1167 result = method(y)
1168 if result is NotImplemented:
-> 1169 raise TypeError("invalid type comparison")
1170 else:
1171 result = op(x, y)
TypeError: invalid type comparison
Please help, I have been faced with an absurd number of errors this week.

Lookup value in dataframe using multiindex

I am looking up a value in a DataFrame using a MultiIndex: df[value1, value2]. This works, but throws a KeyError if the value is not in the index. I can handle the exception, but is there an equivalent to Python's dict.get()? That is, I would prefer the lookup to return None if the value is not found.
Mark
Just call DataFrame.get():
In [50]: from pandas.util.testing import makeCustomDataframe as mkdf
In [51]: df = mkdf(5, 2, c_idx_nlevels=2, data_gen_f=lambda *args: rand())
In [52]: df
Out[52]:
C0 C_l0_g0 C_l0_g1
C1 C_l1_g0 C_l1_g1
R0
R_l0_g0 0.155 0.989
R_l0_g1 0.427 0.330
R_l0_g2 0.951 0.720
R_l0_g3 0.745 0.485
R_l0_g4 0.674 0.841
In [53]: level = df.columns[0]
In [54]: level
Out[54]: ('C_l0_g0', 'C_l1_g0')
In [55]: df.get(level)
Out[55]:
R0
R_l0_g0 0.155
R_l0_g1 0.427
R_l0_g2 0.951
R_l0_g3 0.745
R_l0_g4 0.674
Name: (C_l0_g0, C_l1_g0), dtype: float64
In [56]: df.get('how are you?')
In [57]: df.get('how are you?', 'Fine')
Out[57]: 'Fine'
You can also just define a function:
def get_from_index(df, key, default=None):
    try:
        return df.loc[key]
    except KeyError:
        return default
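A minimal usage sketch of the get_from_index helper above, with hypothetical data, showing the dict.get()-like behaviour:
import pandas as pd

# Hypothetical MultiIndex Series standing in for the question's DataFrame.
s = pd.Series([1.0, 2.0],
              index=pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y')]))

print(get_from_index(s, ('a', 'x')))        # 1.0
print(get_from_index(s, ('missing', 'z')))  # None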
If your df has a MultiIndex with levels 'key1' and 'key2' and you want to look up value xxx on key1 and yyy on key2, try this (note the parentheses around each comparison, and .loc in place of the deprecated .ix):
df.loc[(df.index.get_level_values('key1') == xxx) &
       (df.index.get_level_values('key2') == yyy)]