plot a groupby object with bokeh - pandas

Consider the following MWE.
from pandas import DataFrame
from bokeh.plotting import figure
data = dict(x = [0,1,2,0,1,2],
y = [0,1,2,4,5,6],
g = [1,1,1,2,2,2])
df = DataFrame(data)
p = figure()
p.line( 'x', 'y', source=df[ df.g == 1 ] )
p.line( 'x', 'y', source=df[ df.g == 2 ] )
Ideally, I would like to compress the last to lines in one:
p.line( 'x', 'y', source=df.groupby('g') )
(Real life examples have a large and variable number of groups.) Is there any concise way to do this?

I just found out that the following works
gby = df.groupby('g')
gby.apply( lambda d: p.line( 'x', 'y', source=d ) )
(it has some drawbacks, though).
Any better idea?

I didn't come out with df.groupby so I used df.loc but maybe multi_line is what you are after:
from pandas import DataFrame
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
data = dict(x = [0, 1, 2, 0, 1, 2],
y = [0, 1, 2, 4, 5, 6],
g = [1, 1, 1, 2, 2, 2])
df = DataFrame(data, index = data['g'])
dfs = [DataFrame(df.loc[i].values, columns = df.columns) for i in df['g'].unique()]
source = ColumnDataSource(dict(x = [df['x'].values for df in dfs], y = [df['y'].values for df in dfs]))
p = figure()
p.multi_line('x', 'y', source = source)
show(p)
Result:

This is Tony's solution slightly simplified.
import pandas as pd
from bokeh.plotting import figure
data = dict(x = [0, 1, 2, 0, 1, 2],
y = [0, 1, 2, 4, 5, 6],
g = [1, 1, 1, 2, 2, 2])
df = pd.DataFrame(data)
####################### So far as in the OP
gby = df.groupby('g')
p = figure()
x = [list( sdf['x'] ) for i,sdf in gby]
y = [list( sdf['y'] ) for i,sdf in gby]
p.multi_line( x, y )

from pandas import DataFrame
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
data = dict(x = [0, 1, 2, 0, 1, 2],
y = [0, 1, 2, 4, 5, 6],
g = [1, 1, 1, 2, 2, 2])
df = DataFrame(data)
plt = figure()
for i, group in df.groupby(['g']):
source = ColumnDataSource(group)
plt.line('x','y', source=source, legend_group='g')
show(plt)

Related

Cublic spline interpolation produces straight lines

I would like to obtain a smooth curve going through specific points with integer coordinates. Instead of that I get straight line segments between the points. I tried interp1d(x,y,kind='cubic') and also CubicSpline, nothing works. Here is my code:
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d,CubicSpline
x = np.arange(34)
y = [8,3,0,1,6,2,1,7,6,2,0,2,6,0,1,6,2,2,0,2,7,0,2,8,6,3,6,2,0,1,6,2,7,2]
f = CubicSpline(x, y)
plt.figure(figsize=(10,3))
plt.plot(x, y, 'o', x, f(x))
plt.show()
and here is the result:
Can you tell me how to get smooth curves instead?
Now you are using the original x-values to draw the curve. You need a new array with much more intermediate x-values. Numpy's np.linspace() creates such an array between a given minimum and maximum.
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d, CubicSpline
y = [8, 3, 0, 1, 6, 2, 1, 7, 6, 2, 0, 2, 6, 0, 1, 6, 2, 2, 0, 2, 7, 0, 2, 8, 6, 3, 6, 2, 0, 1, 6, 2, 7, 2]
x = np.arange(len(y))
f = CubicSpline(x, y)
plt.figure(figsize=(10, 3))
xs = np.linspace(x.min(), x.max(), 500)
plt.plot(x, y, 'o', xs, f(xs))
plt.tight_layout()
plt.show()

order plotly legend by custom order

I have a plotly figure to which I add values from two dataframes, df_a and df_b. I display both dataframes in different subplots, but they share a legend. How can I order the shared legend? The expected order is: [a, b, c, f, g]. Please see below for the current implementation - it seems to pick the order from input data in some way.
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
buckets = ["a", "b", "c", "f", "g"]
fig_subplots = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=["df_a", "df_b"])
def get_chart(df, buckets, d_legend):
fig = go.Figure()
colorscale = px.colors.qualitative.Pastel
i_color = 0
unique_dates = [str(dt) for dt in df.date.unique()]
for bucket in sorted(buckets, reverse=True):
y_values = df[df['variable'] == bucket]["value"].to_list()
enable_legend_for_bucket = False if bucket in d_legend else True
fig.add_trace(go.Bar(
name=bucket,
x=unique_dates,
y=y_values,
marker_color=colorscale[i_color],
legendgroup=bucket,
showlegend=enable_legend_for_bucket
))
if len(y_values) != 0:
d_legend[bucket] = True # store first time this bucket was populated for legend
i_color += 1
fig.update_layout(barmode="stack")
return fig, d_legend
list_df = [df_a.melt(id_vars="date"), df_b.melt(id_vars="date")]
d_legend = {}
iRow = 1
for df in list_df:
fig, d_legend = get_chart(df, buckets, d_legend)
for el in fig['data']:
fig_subplots.append_trace(el, iRow, 1)
iRow += 1
fig_subplots.update_layout(barmode='stack', legend={'traceorder':'normal'})
Without finding an easy built-in method to order the legend, and to have the same stacking of values with the legend sorted in any desired order.
new_order = sorted(['a','b','c','g','f'],reverse=True)
print(new_order)
ordered_object_list =[]
for i in new_order:
item = [obj for obj in fig_subplots.data if obj['name'] == i]
ordered_object_list += item
fig_subplots.data = ordered_object_list
fig_subplots.update_layout(legend={'traceorder':'reversed'})
There is also an approach with plotly express:
import pandas as pd
import plotly.express as px
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_a["id"] = "df_a"
df_b["id"] = "df_b"
df_c = df_a.append(df_b)
px.bar(df_c.melt(id_vars=["date", "id"]), x="date", y="value", color="variable", facet_col="id", range_y=[0,20], facet_col_wrap=1)

Pandas groupby when only one value groupedby

I want to compute the cumulative count of a given variable. So I expect that the following code works
import pandas as pd
import numpy as np
df = pd.DataFrame.from_records({'x': [0, 1, 0, 1, 1]})
df2 = pd.DataFrame.from_records({'x': [0, 0, 0, 0, 0]})
result = df.groupby('x').apply(lambda x: pd.Series(np.arange(len(x)), index=x.index)).reset_index(level=0, drop=True).sort_index()
assert (result == [0, 0, 1, 1, 2]).all()
result2 = df2.groupby('x').apply(lambda x: pd.Series(np.arange(len(x)))).reset_index(level=0, drop=True).sort_index()
assert (result2 == [0, 1, 2, 3, 4]).all()
The first assert is True but not the second one.
Why ?
This seems to be an open issue.
See BUG: inconsistent return format of Dataframe group apply function.
A workaround can be:
assert (result2.values == [0, 1, 2, 3, 4]).all()

Vectorization of selective cumulative sum

I have a pandas Series where each element is a list with indices:
series_example = pd.Series([[1, 3, 2], [1, 2]])
In addition, I have an array with values associated to every index:
arr_example = np.array([3., 0.5, 0.25, 0.1])
I want to create a new Series with the cumulative sums of the elements of the array given by the indices in the row of the input Series. In the example, the output Series would have the following contents:
0 [0.5, 0.6, 0.85]
1 [0.5, 0.75]
dtype: object
The non-vectorized way to do it would be the following:
def non_vector_transform(series, array):
series_output = pd.Series(np.zeros(len(series_example)), dtype = object)
for i in range(len(series)):
element_list = series[i]
series_output[i] = []
acum = 0
for element in element_list:
acum += array[element]
series_output[i].append(acum)
return series_output
I would like to do this in a vectorized way. Any vectorization magician to help me in here?
Use Series.apply and np.cumsum:
import numpy as np
import pandas as pd
series_example = pd.Series([[1, 3, 2], [1, 2]])
arr_example = np.array([3., 0.5, 0.25, 0.1])
result = series_example.apply(lambda x: np.cumsum(arr_example[x]))
print(result)
Or if you prefer a for loop:
import numpy as np
import pandas as pd
series_example = pd.Series([[1, 3, 2], [1, 2]])
arr_example = np.array([3., 0.5, 0.25, 0.1])
# Copy only if you do not want to overwrite the original series
result = series_example.copy()
for i, x in result.iteritems():
result[i] = np.cumsum(arr_example[x])
print(result)
Output:
0 [0.5, 0.6, 0.85]
1 [0.5, 0.75]
dtype: object

Two Lower X axes matplot lib

I was wondering if it is possible to have two distinct X axes in matplotlib, but not so that they are on opposite sides of the graph. Instead, would it be possible to put them next to each other?
Is this what you're looking for?
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
import matplotlib.pyplot as plt
host = host_subplot(111, axes_class=AA.Axes)
plt.subplots_adjust(bottom=0.2)
par2 = host.twiny()
offset = -40
new_fixed_axis = par2.get_grid_helper().new_fixed_axis
par2.axis["bottom"] = new_fixed_axis(loc="bottom",
axes=par2,
offset=(0, offset))
par2.axis["top"].toggle(all=False)
host.set_xlim(0, 2)
host.set_ylim(0, 2)
host.set_ylabel("Distance")
host.set_xlabel("Density")
par2.set_xlabel("Velocity")
p1, = host.plot([0, 1, 2], [0, 1, 2], label="Density")
p3, = par2.plot([50, 30, 15], [0, 1, 2], label="Velocity")
par2.set_xlim(1, 65)
host.legend()
host.axis["bottom"].label.set_color(p1.get_color())
par2.axis["bottom"].label.set_color(p3.get_color())