changing the axis values in matplotlib plot - pandas

Following up on the accepted answer to "plotting a beautiful timeseries plot", I want to rescale the y-axis values after plotting, by dividing the y-axis values by a floating-point number (0.2), so that the y-axis ticks read 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, without changing the time series themselves.
The code is
# Load the three whitespace-separated series into columns A, B and C.
data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
# Shift B and C upwards so the three series are stacked instead of overlapping.
data['B'] = data['B'] + 0.3
data['C'] = data['C'] + 0.6
ax = data.plot()
# Star row 5 of A, row 8 of B and row 10 of C, each in its own line's colour.
for col, place, line in zip(list('ABC'), [5, 8, 10], ax.lines):
    ax.plot(place, data.loc[place, col], marker="*", c=line.get_color())
plt.show()
data_3_timeseries
-0.00831 -0.0213 -0.0182
0.0105 -0.00767 -0.012
0.00326 0.0148 -0.00471
-0.0263 -0.00318 0.011
0.012 0.0398 0.0117
-0.0156 -0.0133 0.02
-0.0482 -0.00783 -0.0162
0.0103 -0.00639 0.0103
0.0158 0.000788 -0.00484
-0.000704 -0.0236 0.00579
0.00151 -0.0135 -0.0195
-0.0163 -0.00185 0.00722
0.0207 0.00998 -0.0387
-0.0246 -0.0274 -0.0108
0.0123 -0.0155 0.0137
-0.00963 0.0023 0.0305
-0.0147 0.0255 -0.00806
0.000488 -0.0187 5.29e-05
-0.0167 0.0105 -0.0204
0.00653 0.0176 -0.00643
0.0154 -0.0136 0.00415
-0.0147 -0.00339 0.0175
-0.0238 -0.00284 0.0204
-0.00629 0.0205 -0.017
0.00449 -0.0135 -0.0127
0.00843 -0.0167 0.00903
-0.00331 7.2e-05 -0.00281
-0.0043 0.0047 0.00681
-0.0356 0.0214 0.0158
-0.0104 -0.0165 0.0092
0.00599 -0.0128 -0.0202
0.015 -0.0272 0.0117
0.012 0.0258 -0.0154
-0.00509 -0.0194 0.00219
-0.00154 -0.00778 -0.00483
-0.00152 -0.0451 0.0187
0.0271 0.0186 -0.0133
-0.0146 -0.0416 0.0154
-0.024 0.00295 0.006
-0.00889 -0.00501 -0.028
-0.00555 0.0124 -0.00406
-0.0185 -0.0114 0.0224
0.0143 0.0204 -0.0193
-0.0168 -0.00608 0.00178
-0.0159 0.0189 0.0109
-0.0213 -0.007 -0.0323
0.0031 0.0207 -0.00333
-0.0202 -0.0157 -0.0105
0.0159 0.00216 -0.0262
0.0105 -0.00292 0.00447
0.0126 0.0163 -0.0141
0.01 0.00679 0.025
0.0237 -0.0142 -0.0149
0.00394 -0.0379 0.00905
-0.00803 0.0186 -0.0176
-0.013 0.0162 0.0208
-0.00197 0.0313 -0.00804
0.0218 -0.0249 0.000412
-0.0164 0.00681 -0.0109
-0.0162 -0.00795 -0.00279
-0.01 -0.00977 -0.0194
-0.00723 -0.0464 0.00453
-0.000533 0.02 -0.0193
0.00706 0.0391 0.0194

I've tried to be as detailed with the comments as I can, I hope this will be clear:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
COLUMNS = ['A', 'B', 'C']  # If you have more columns you can add them here
MARKS = [('A', 5), ('B', 8), ('C', 10), ('A', 20), ('C', 25)]  # (column, row) pairs to mark; add more here
# Line and marker colours per column; add an entry here for every new column.
COLORS_DICT = {
    'A': {'Line': 'Purple', 'Marker': 'Blue'},
    'B': {'Line': 'Red', 'Marker': 'Green'},
    'C': {'Line': 'Brown', 'Marker': 'Orange'},
}
FACTOR = 6  # the factor every value is multiplied by
SPACER = 1  # This spacer together with the factor will have the y axes with 0.5 gaps
MARKER = '*'  # star marker, can be altered
LINE_WIDTH = 0.5  # the width of the lines
COLORS = True  # True for colours, False for black

data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=COLUMNS)
# Scale every column by FACTOR and lift column number i by i * SPACER so the
# series are stacked; plain column arithmetic replaces a per-element apply().
for i, col in enumerate(COLUMNS):
    data[col] = data[col] * FACTOR + i * SPACER

ax = data.plot()
ax.get_legend().remove()  # removing the columns' legend (if COLORS is False there's no need for a legend)

# ax.lines holds one line per plotted column, in column order.
for col, line in zip(COLUMNS, ax.lines):
    line.set_color(COLORS_DICT[col]['Line'] if COLORS else 'Black')
    line.set_linewidth(LINE_WIDTH)

# Draw one marker per (column, row) pair on top of the matching series.
for col, mark in MARKS:
    marker_color = COLORS_DICT[col]['Marker'] if COLORS else 'Black'
    ax.plot(mark, data.loc[mark, col], marker=MARKER, c=marker_color)
plt.show()

Related

plotting a beautiful timeseries plot

I have three columns of time series data. I would like to plot the three time series one upon another beautifully in one plot. A star needs to be placed in the respective 5th, 8th, and 10th data points of each time series. My goal is to implement it in Python. I would appreciate if Experts could offer a more efficient method of doing it.
My code:
import matplotlib.pyplot as plt
import numpy as np
# Read the whitespace-separated file; np.loadtxt returns a NumPy array.
data=np.loadtxt("data_3_timeseries")
# NOTE(review): NumPy arrays have no .plot() method, so this call fails with
# AttributeError; the data has to be wrapped (e.g. in a pandas DataFrame) first.
data.plot()
plt.show()
data_3_timeseries is attached here
-0.00831 -0.0213 -0.0182
0.0105 -0.00767 -0.012
0.00326 0.0148 -0.00471
-0.0263 -0.00318 0.011
0.012 0.0398 0.0117
-0.0156 -0.0133 0.02
-0.0482 -0.00783 -0.0162
0.0103 -0.00639 0.0103
0.0158 0.000788 -0.00484
-0.000704 -0.0236 0.00579
0.00151 -0.0135 -0.0195
-0.0163 -0.00185 0.00722
0.0207 0.00998 -0.0387
-0.0246 -0.0274 -0.0108
0.0123 -0.0155 0.0137
-0.00963 0.0023 0.0305
-0.0147 0.0255 -0.00806
0.000488 -0.0187 5.29e-05
-0.0167 0.0105 -0.0204
0.00653 0.0176 -0.00643
0.0154 -0.0136 0.00415
-0.0147 -0.00339 0.0175
-0.0238 -0.00284 0.0204
-0.00629 0.0205 -0.017
0.00449 -0.0135 -0.0127
0.00843 -0.0167 0.00903
-0.00331 7.2e-05 -0.00281
-0.0043 0.0047 0.00681
-0.0356 0.0214 0.0158
-0.0104 -0.0165 0.0092
0.00599 -0.0128 -0.0202
0.015 -0.0272 0.0117
0.012 0.0258 -0.0154
-0.00509 -0.0194 0.00219
-0.00154 -0.00778 -0.00483
-0.00152 -0.0451 0.0187
0.0271 0.0186 -0.0133
-0.0146 -0.0416 0.0154
-0.024 0.00295 0.006
-0.00889 -0.00501 -0.028
-0.00555 0.0124 -0.00406
-0.0185 -0.0114 0.0224
0.0143 0.0204 -0.0193
-0.0168 -0.00608 0.00178
-0.0159 0.0189 0.0109
-0.0213 -0.007 -0.0323
0.0031 0.0207 -0.00333
-0.0202 -0.0157 -0.0105
0.0159 0.00216 -0.0262
0.0105 -0.00292 0.00447
0.0126 0.0163 -0.0141
0.01 0.00679 0.025
0.0237 -0.0142 -0.0149
0.00394 -0.0379 0.00905
-0.00803 0.0186 -0.0176
-0.013 0.0162 0.0208
-0.00197 0.0313 -0.00804
0.0218 -0.0249 0.000412
-0.0164 0.00681 -0.0109
-0.0162 -0.00795 -0.00279
-0.01 -0.00977 -0.0194
-0.00723 -0.0464 0.00453
-0.000533 0.02 -0.0193
0.00706 0.0391 0.0194
Because your three columns cover almost the same range of values, the lines are drawn on top of one another.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the three whitespace-separated series into columns A, B and C.
data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
ax = data.plot()
# Star row 5 of A, row 8 of B and row 10 of C, each in its own line's colour.
for col, place, line in zip(list('ABC'), [5, 8, 10], ax.lines):
    ax.plot(place, data.loc[place, col], marker="*", c=line.get_color())
plt.show()
"NORMALIZED"
# Load the three whitespace-separated series into columns A, B and C.
data = pd.DataFrame(np.loadtxt("data_3_timeseries"), columns=list('ABC'))
# Shift B and C upwards so the three series are stacked instead of overlapping.
data['B'] = data['B'] + 0.3
data['C'] = data['C'] + 0.6
ax = data.plot()
# Star row 5 of A, row 8 of B and row 10 of C, each in its own line's colour.
for col, place, line in zip(list('ABC'), [5, 8, 10], ax.lines):
    ax.plot(place, data.loc[place, col], marker="*", c=line.get_color())
plt.show()
Something like this? Note: I did not use your columns correctly, but the principle stands.
markers = [5, 8, 10]
# NOTE(review): this treats data as three equal-length segments laid end to
# end rather than as three columns — confirm against how data was loaded.
third = len(data) // 3
first = data[:third]

plt.plot(first, color='Blue')
# Stars are only placed on the first segment.
for marker in markers:
    plt.plot(marker, first[marker], marker="*", color='blue')
plt.plot(data[third:third * 2], color='orange')
plt.plot(data[third * 2:third * 3], color='green')
I only placed markers for the first data column, but you can do the same for all the columns.
Output:
You can of course further prettify your plot. Enough options with Matplot.

How to remove the overlapping in threshold_scale of Choropleth Map?

#ChoroplethMap
# Colour-scale breakpoints: the minimum, the three quartiles and the maximum
# of the "price" column.
bins = list(state_avg_value["price"].quantile([0, 0.25, 0.5, 0.75, 1]))
#state_avg_value_max= state_avg_value['price'].max()
m = folium.Map(location=[48, -102], zoom_start=3)
# Shade each feature of state_geo by "price"; rows are matched to features by
# comparing the "state" column with feature.properties.name.
folium.Choropleth(
geo_data=state_geo,
data=state_avg_value,
columns=["state", "price"],
key_on="feature.properties.name",
fill_color="BuPu",
fill_opacity=0.7,
line_opacity=0.5,
legend_name="Price (in dollars)",
bins=bins,
reset=True,
).add_to(m)
# presumably left as the last expression so the notebook renders the map — confirm
m
#Q. How to remove the overlapping in threshold_scale?
#[ChoroplethMap][1]
#[1]: https://i.stack.imgur.com/Xi4A6.jpg

Matplotlib Colorbar missing 1 required positional argument: 'mappable'

I would like to create a scatter chart for dataframe below:
df_sample.head(10)
duration distance speed
0 26.299999 3.569 8.1
1 6.000000 0.739 7.4
2 25.700001 2.203 5.1
3 34.400002 2.876 5.0
4 3.000000 0.656 13.1
5 29.299999 3.704 7.6
6 10.200000 2.076 12.2
7 4.000000 0.774 11.6
8 9.200000 1.574 10.3
9 10.800000 0.782 4.3
I almost got it done with the code below. I want to add a colorbar to the figure based on speed (yellow: slowest, blue: fastest), but I get an error at fig.colorbar(ax=ax) on the last line. Please advise: what is a mappable?
# Scatter plot of duration (x) against distance (y), coloured by speed with
# the YlGnBu colormap, followed by manual spine and tick styling.
with plt.style.context('seaborn-ticks'):
fig, ax = plt.subplots(figsize = (10, 6))
ax.set_title('Relationship between Distance & Duration', fontdict={'fontsize': 18, 'fontweight': 'bold'}, loc='left', pad=20)
# NOTE: the object returned by scatter() is discarded here.
ax.scatter(x=df_sample.duration.values, y=df_sample.distance.values, c=df_sample.speed.values, cmap=cm.YlGnBu)
# remove top & right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# equivalent to 'sns.despine(offset = 5)'
ax.spines['left'].set_position(('outward', 5))
ax.spines['left'].set_linewidth(1.2)
ax.spines['bottom'].set_position(('outward', 5))
ax.spines['bottom'].set_linewidth(1.2)
# set ticks & ticklabels
xloc = np.arange(0, 45, 5)
ax.set_xticks(xloc)
ax.set_xticklabels(labels=xloc, fontdict={'fontsize': 12})
ax.set_xlabel('Minute(s)', fontdict={'fontsize': 14, 'fontweight': 'bold'})
yloc = np.arange(6)
ylab = [f"{int(num)}" for num in yloc]
ax.set_yticks(yloc)
ax.set_yticklabels(labels=ylab, fontdict={'fontsize': 12})
ax.set_ylabel("Distance (KM)" , fontdict={'fontsize': 14, 'fontweight': 'bold'})
# NOTE: this is the failing call — only ax= is given, so colorbar() receives
# no mappable (the coloured artist whose colormap the bar should describe).
fig.colorbar(ax=ax);
You can assign your scatterplot to a variable, for instance:
sp=ax.scatter(x=df_sample.duration.values, y=df_sample.distance.values, c=df_sample.speed.values, cmap=cm.YlGnBu)
and then pass it as a mappable object to the colorbar:
fig.colorbar(sp)

Jupyter "TypeError: invalid type comparison"

Alright, I just started a new job and have been tasked with writing a simple notebook in Jupyter. I really want to impress my supervisor and have been working on this code for hours, but I can't get it to work; hopefully somebody here can help me.
Here is the code I have been working on:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
df = pd.read_csv(r'C:\Users\jk2588\Documents\EDA\EDA Practice\top1000_dataset.csv', converters={'GENDER': lambda x: int(x == 'Male')}, usecols = ['MEMBER_ID', 'GENDER', 'Age', 'Dement'])
df_gp_1 = df[['MEMBER_ID', 'Dement']].groupby('MEMBER_ID').agg(np.mean).reset_index()
df_gp_2 = df[['MEMBER_ID', 'GENDER', 'Age']].groupby('MEMBER_ID').agg(max).reset_index()
df_gp = pd.merge(df_gp_1, df_gp_2, on = ['MEMBER_ID'])
df.head()
Output: MEMBER_ID Age Dement GENDER
0 000000002 01 36 NaN 0
1 000000002 01 36 NaN 0
2 000000002 01 36 NaN 0
3 000000002 01 36 NaN 0
4 000000002 01 36 NaN 0
df['Dement'] = df['Dement'].fillna(0)
df['Dement'] = df['Dement'].astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
MEMBER_ID 999 non-null object
Age 999 non-null int64
Dement 999 non-null int64
GENDER 999 non-null int64
dtypes: int64(3), object(1)
memory usage: 31.3+ KB
freq = ((df_gp.Age.value_counts(normalize = True).reset_index().sort_values(by = 'index').Age)*100).tolist()
number_gp = 7
def ax_settings(ax, var_name, x_min, x_max):
    """Style one axes in place and label it.

    Sets the x-range, removes the y ticks, hides the left, right and top
    spines, draws the bottom spine in dark grey (#444444) at width 2 and
    writes ``var_name`` in bold near the lower-left corner of the axes.

    Args:
        ax: Axes-like object to modify; it is mutated in place.
        var_name: Text written inside the axes.
        x_min: Lower x-axis limit.
        x_max: Upper x-axis limit.

    Returns:
        None.
    """
    ax.set_xlim(x_min, x_max)
    ax.set_yticks([])
    for side in ('left', 'right', 'top'):
        ax.spines[side].set_visible(False)
    bottom = ax.spines['bottom']
    bottom.set_edgecolor('#444444')
    bottom.set_linewidth(2)
    # transAxes: the position is a fraction of the axes, not data units.
    ax.text(0.02, 0.05, var_name, fontsize=17, fontweight="bold",
            transform=ax.transAxes)
fig = plt.figure(figsize=(12,7))
gs = gridspec.GridSpec(nrows=number_gp,
ncols=2,
figure=fig,
width_ratios= [3, 1],
height_ratios= [1]*number_gp,
wspace=0.2, hspace=0.05
)
ax = [None]*(number_gp + 1)
features = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
# One subplot row per age-group label: a blue and a red density of Dement.
for i in range(number_gp):
ax[i] = fig.add_subplot(gs[i, 0])
ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
# NOTE(review): the read_csv converter turned GENDER into 0/1 integers and
# df.info() reports Age as int64, yet both are compared with strings here
# ('M'/'F' and labels such as '0-17'). This is the likely source of
# "invalid type comparison" — confirm and compare like with like.
sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
# Only the last row keeps its x ticks.
if i < (number_gp - 1): ax[i].set_xticks([])
ax[0].legend(['Male', 'Female'], facecolor='w')
ax[number_gp] = fig.add_subplot(gs[:, 1])
ax[number_gp].spines['right'].set_visible(False)
ax[number_gp].spines['top'].set_visible(False)
ax[number_gp].barh(features, freq, color='#004c99', height=0.4)
ax[number_gp].set_xlim(0,100)
ax[number_gp].invert_yaxis()
ax[number_gp].text(1.09, -0.04, '(%)', fontsize=10, transform = ax[number_gp].transAxes)
ax[number_gp].tick_params(axis='y', labelsize = 14)
plt.show()
I am then met with:
C:\Users\jk2588\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py:1167: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
result = method(y)
--------------------------------------------------------------------------
TypeError Traceback (most recent call last
<ipython-input-38-8665030edb1c> in <module>()
24 ax[i] = fig.add_subplot(gs[i, 0])
25 ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
---> 26 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
27 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
28 if i < (number_gp - 1): ax[i].set_xticks([])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1281
1282 with np.errstate(all='ignore'):
-> 1283 res = na_op(values, other)
1284 if is_scalar(res):
1285 raise TypeError('Could not compare {typ} type with Series'
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in na_op(x, y)
1167 result = method(y)
1168 if result is NotImplemented:
-> 1169 raise TypeError("invalid type comparison")
1170 else:
1171 result = op(x, y)
TypeError: invalid type comparison
Please help, I have been faced with an absurd number of errors this week.

Find x,y coordinate of Dataframe.plot.kde()

I am trying to identify the peak coordinates (x,y) of a kde/gaussian
curve
How can I get the X values and Y values from
losing_mae.plot.kde(...) so that I can get the argmax()
losing_mae.tail(10)
238 -500.0
239 -637.5
240 -412.5
242 -1062.5
243 -562.5
245 -412.5
247 -437.5
252 -800.0
254 -662.5
255 -1062.5
Name: mae, Length: 113, dtype: float64
losing_mae.hist(ax=ax, bins=25, color='c', alpha=0.5)
losing_mae.plot.kde(color='c', ax=ax2, lw=1)
Set up:
import numpy as np
import pandas as pd
# Ten sample values in a single column labelled 1, indexed 0-9.
# (The closing parenthesis of from_dict() was missing in the original.)
losing_mae = pd.DataFrame.from_dict({
    1: {
        0: -500.0,
        1: -637.5,
        2: -412.5,
        3: -1062.5,
        4: -562.5,
        5: -412.5,
        6: -437.5,
        7: -800.0,
        8: -662.5,
        9: -1062.5,
    },
})
The kde plot returns an axes object. You can drill down to find the x and y:
d = losing_mae.plot.kde()
print(d.get_children())
Which gives a list of the objects. You probably want to drill down into the Line2D:
[<matplotlib.lines.Line2D at 0x7fb82ce67550>,
<matplotlib.spines.Spine at 0x7fb82d237e80>,
<matplotlib.spines.Spine at 0x7fb84003cd30>,
<matplotlib.spines.Spine at 0x7fb82d221b38>,
<matplotlib.spines.Spine at 0x7fb82d221748>,
<matplotlib.axis.XAxis at 0x7fb82d2590f0>,
<matplotlib.axis.YAxis at 0x7fb82d221400>,
Text(0.5, 1.0, ''),
Text(0.0, 1.0, ''),
Text(1.0, 1.0, ''),
<matplotlib.legend.Legend at 0x7fb82ce67400>,
<matplotlib.patches.Rectangle at 0x7fb82cea6940>]
Now grab the line and its path and then you can get the vertices:
l = d.get_children()[0].get_path()
l = l.vertices
print(l)
array([[-1.38750000e+03, 5.87608940e-05],
[-1.38619870e+03, 5.97906082e-05],
[-1.38489740e+03, 6.08341884e-05],
.... # and so on for ~2000 points
Separate the X and Y:
# l was already rebound to the vertex array (l = l.vertices), so split it
# directly; asking the array for .vertices again raises AttributeError.
x, y = np.split(l, 2, 1)
And then you can just call max on both to get the points you want:
# The peak is where the density y is largest, so read x at that same index.
# x.max() would only return the right-hand end of the x grid, which is what
# the first number in the sample output shown after this snippet is.
peak = y.argmax()
peakX, peakY = x.ravel()[peak], y.ravel()[peak]
print(peakX, peakY)
87.5 0.0015392054229208412