Make a vbar with stdev from dataframes for mean and stdeviation - pandas

I have 2 data frames, 1 for mean and standard deviation and I am trying to make them into a bar chart in bokeh with error bars but I am stuck at how to ??groupby?? the 'Design' and Treatment'.
Basically, I am trying to get 3 bars per x-value (T0 to T2). The legend should show something like: 'mouse-yes', 'mouse-no', and 'cat-no'. How do I restructure the dict or dataframe to convert to something for vbar? And then how do I couple that with the stdev dataframe?
Also, is there a way to make the x_range in the figure automatically taking all original values from the 'Time' column? I'd like to be able to interchange the 'Time', 'Design', and 'Treatment' columns for the x-axis. I'm guessing this is where a pivot table comes in handy.
from bokeh.core.properties import value
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
output_file("dodged_bars.html")
import pandas as pd
dat_mean=[['T0','mouse','yes',25],['T0','mouse','no',24],['T0','cat','no',23],['T1','mouse','yes',15],['T1','mouse','no',14],['T1','cat','no',13],['T2','mouse','yes',5],['T2','mouse','no',4],['T2','cat','no',3]]
df_mean= pd.DataFrame(dat_mean,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
dat_std=[['T0','mouse','yes',5],['T0','mouse','no',5],['T0','cat','no',5],['T1','mouse','yes',2.5],['T1','mouse','no',2.5],['T1','cat','no',2.5],['T2','mouse','yes',1],['T2','mouse','no',1],['T2','cat','no',1]]
df_std= pd.DataFrame(dat_std,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
data = df_mean.to_dict(orient='list')*
dates = df_mean['Time'].tolist()
source = ColumnDataSource(data=data)
p = figure(x_range=['T0', 'T1', 'T2'], y_range=(0, 30), plot_height=250, title="Bokeh plot",
toolbar_location=None, tools="")
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
show(p)

from bokeh.core.properties import value
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
output_file("dodged_bars.html")
import pandas as pd
dat_mean=[['T0','mouse','yes',25],['T0','mouse','no',24],['T0','cat','no',23],['T1','mouse','yes',15],['T1','mouse','no',14],['T1','cat','no',13],['T2','mouse','yes',5],['T2','mouse','no',4],['T2','cat','no',3]]
df_mean= pd.DataFrame(dat_mean,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
dat_std=[['T0','mouse','yes',5],['T0','mouse','no',5],['T0','cat','no',5],['T1','mouse','yes',2.5],['T1','mouse','no',2.5],['T1','cat','no',2.5],['T2','mouse','yes',1],['T2','mouse','no',1],['T2','cat','no',1]]
df_std= pd.DataFrame(dat_std,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
time_seq=df_mean['Time'].drop_duplicates()
time_vals=time_seq.tolist()
MEANs=df_mean.groupby(["Design", "Treatment"])["Mean for Cmpd1"].apply(list).to_dict()
keys=[]
for h in range(len(MEANs)):
raw_key=list(MEANs.keys())[h]
keys.append(raw_key[0]+'_'+raw_key[1])
results = {'time_vals' : time_vals,
keys[0] : list(MEANs.values())[0],
keys[1] : list(MEANs.values())[1],
keys[2] : list(MEANs.values())[2]}
source = ColumnDataSource(data=results)
p = figure(x_range=['T0', 'T1', 'T2'], y_range=(0, 30), plot_height=250, title="Bokeh plot",
toolbar_location=None, tools="")
for hh in range(len(MEANs)):
p.vbar(x=dodge('time_vals', -0.25+.2*hh, range=p.x_range), top=keys[hh], width=0.2,
source=source,color=color[hh], legend=value(keys[hh]))
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
show(p)
This code works. Errorbars can be done with add_layout and Whisker functions

Related

How to wrap text in a dataframe's table (converted to .png)

I am having an issue where I cannot format my tables. The text is too long to just edit the dimensions or the text size. How can I quickly change this so you can see all the text when I have the data for each column more filled in? I am looking for a wrap text kind of function but I don't know if that is possible the way I'm doing it. Is there another way you'd recommend? I'm changing the table into a .png to insert into an Excel file. It has to be a .png so it's an object and doesn't mess with the size of the rows and columns in Excel.
import matplotlib.pyplot as plt
import xlsxwriter as xl
import numpy as np
import yfinance as yf
import pandas as pd
import datetime as dt
import mplfinance as mpf
import pandas_datareader
from pandas_datareader import data as pdr
yf.pdr_override()
import numpy as np
Individualreport = "C:\\Users\\Ashley\\FromPython.xlsx"
Ticklist = pd.read_excel("C:\\Users\\Ashley\\Eyes Trial Data Center.xlsx",sheet_name='Tickers', header=None)
stocks = Ticklist.values.ravel()
PipelineData = pd.read_excel("C:\\Users\\Ashley\\Eyes Trial Data Center.xlsx", sheet_name='Pipeline', header=None)
writer = pd.ExcelWriter(Individualreport, engine='xlsxwriter')
for i in stocks:
#write pipeline data
t = PipelineData.loc[(PipelineData[0]==i)]
print(t)
def render_mpl_table(data, col_width=10, row_height=1, font_size=10, wrap=True,
header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
bbox=[0, 0, 1, 1], header_columns=0,
ax=None, **kwargs):
if ax is None:
size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
fig, ax = plt.subplots(figsize=size)
ax.axis('off')
mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)
mpl_table.auto_set_font_size(False)
#mpl_table.set_fontsize(font_size)
for k, cell in mpl_table._cells.items():
cell.set_edgecolor(edge_color)
if k[0] == 0 or k[1] < header_columns:
cell.set_text_props(weight='bold', color='w')
cell.set_facecolor(header_color)
else:
cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
return ax.get_figure(), ax
fig,ax = render_mpl_table(t, header_columns=0, col_width=2.0)
fig.savefig(str(i)+'pipe.png')
I think I needed to use an additional package, haven't tried with this example, but worked in another similar example I did.
from textwrap import wrap
label = ("label text that is getting put in the graph")
label = [ '\n'.join(wrap(l, 20)) for l in label ]
#20 is number of characters per line

pulling data out of bins in density map created with matplotlib

I have created a lightning density map using lines of data representing lightning strikes. One line is shown below:
1996-01-17 03:54:35.853 44.9628 -78.9399 -37.9
Now that I have applied these lines of data to the density map and distributed them into their appropriate bins based on Lat/Long, I would like to pull the data back out specific to the bin that it fell into so that I can manipulate that data further.
I have tried to find answers to this online but have failed to find anything that is specific to what I am trying to do. Any and all help is greatly appreciated!
my code:
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.axes as ax
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from metpy.plots import USCOUNTIES
from matplotlib.axes import Axes
from cartopy.mpl.geoaxes import GeoAxes
GeoAxes._pcolormesh_patched = Axes.pcolormesh
import datetime
fig, ax = plt.subplots(figsize=(15,15),subplot_kw=dict(projection=ccrs.Stereographic(central_longitude=-76, central_latitude=43)))
ax.set_extent([-79, -73, 42, 45],ccrs.Geodetic())
ax.add_feature(USCOUNTIES.with_scale('500k'), edgecolor='gray', linewidth=0.25)
ax.add_feature(cfeature.STATES.with_scale('50m'))
winter = [12, 1, 2]
summer = [6, 7, 8]
seasondata = []
lons=[]
lats=[]
f = open("2007-2016.txt", "r")
for line in f.readlines():
parts = line.split()
dates = parts[0]
charges = float(parts[4])
date = datetime.datetime.strptime(dates, "%Y-%m-%d")
#if date.month in summer:
if date.month in winter:
seasondata.append(line)
if charges <= 0:
seasondata.append(line)
lon = float(parts[3])
lat = float(parts[2])
lons.append(lon)
lats.append(lat)
if charges >= 15:
seasondata.append(line)
lon = float(parts[3])
lat = float(parts[2])
lons.append(lon)
lats.append(lat)
lons=np.array(lons)
lats=np.array(lats)
ax.set_title('2007-2016 Jan, Feb, Dec: Lightning Density', loc ='Left')
xynps = (ax.projection.transform_points(ccrs.Geodetic(), lons, lats))
bins=[300,240]
h2d, xedges, yedges, im = ax.hist2d(xynps[:,0], xynps[:,1], bins=bins, cmap=plt.cm.YlOrRd, zorder=10, alpha=0.4)
lons=[]
lats=[]
f = open("turbine.txt", "r")
for line in f.readlines():
parts = line.split()
lat=float(parts[0])
lon=float(parts[1])
lats.append(lat)
lons.append(lon)
markerSymbol='o'
markerSize=10
ax.scatter(lons, lats, transform=ccrs.PlateCarree(), marker = markerSymbol, s=markerSize, c='b')
cbar = plt.colorbar(im, fraction=0.046, pad=0.04)
cbar.set_label("Flashes per 2km^2")
plt.show()

using ipywidgets SelectMultiple on a dataframe

import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
a = ['Banking', 'Auto', 'Life', 'Electric', 'Technology', 'Airlines',
'Healthcare']
df = pd.DataFrame(np.random.randn(7, 4), columns = list('ABCD'))
df.index = a
df.head(7)
dropdown = widgets.SelectMultiple(
options=df.index,
description='Sector',
disabled=False,
layout={'height':'100px', 'width':'40%'})
display(dropdown)
I want to create a function where I can filter the df by Sector. i.e say I select Airlines, Banking and Electric from the display(dropdown) and it returns a dataframe of the selected sectors only.
Try something like this, I have used a global variable to demonstrate in this case, but I would normally wrap up the functionality in a class so you always have access to the filtered dataframe.
Rather than use interact I have used .observe on the Selection widget.
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
a = ['Banking', 'Auto', 'Life', 'Electric', 'Technology', 'Airlines',
'Healthcare']
df = pd.DataFrame(np.random.randn(7, 4), columns = list('ABCD'), index=a)
filtered_df = None
dropdown = widgets.SelectMultiple(
options=df.index,
description='Sector',
disabled=False,
layout={'height':'100px', 'width':'40%'})
def filter_dataframe(widget):
global filtered_df
selection = list(widget['new'])
with out:
clear_output()
display(df.loc[selection])
filtered_df = df.loc[selection]
out = widgets.Output()
dropdown.observe(filter_dataframe, names='value')
display(dropdown)
display(out)

seaborn.swarmplot problem with symlog scale: zero's are not expanded

I have a data set of positive values and zero's that I would like to show on the log scale. To represent zero's I use 'symlog' option, but all zero values are mapped into one point on swarmplot. How to fix it?
import numpy as np
import seaborn as sns
import pandas as pd
import random
import matplotlib.pyplot as plt
n = 100
x = np.concatenate(([0]*n,np.linspace(0,1,n),[5]*n,np.linspace(10,100,n),np.linspace(100,1000,n)),axis=None)
data = pd.DataFrame({'value': x, 'category': random.choices([0,1,2,3], k=len(x))})
f, ax = plt.subplots(figsize=(10, 6))
ax.set_yscale("symlog",linthreshy=1.e-2)
ax.set_ylim(ymax=1000)
sns.swarmplot(x="category", y="value", data=data)
sns.despine(left=True)
link to the resulting plot

Time series plot of categorical or binary variables in pandas or matplotlib

I have data that represent a time series of categorical variables. I want to display the transitions in categories below a traditional line plot of related continuous time series to show off context as time evolves. I'd like to know the best way to do this. My attempt was in terms of Rectangles. The appearance is a bit weird, and importantly the axis labels for the x axis don't render as dates.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from pandas.plotting import register_matplotlib_converters
import matplotlib.dates as mdates
register_matplotlib_converters()
t0 = pd.DatetimeIndex(["2017-06-01 00:00","2017-06-17 00:00","2017-07-03 00:00","2017-08-02 00:00","2017-08-09 00:00","2017-09-01 00:00"])
t1 = pd.DatetimeIndex(["2017-06-01 00:00","2017-08-15 00:00","2017-09-01 00:00"])
df0 = pd.DataFrame({"cat":[0,2,1,2,0,1]},index = t0)
df1 = pd.DataFrame({"op":[0,1,0]},index=t1)
# Create new plot
fig,ax = plt.subplots(1,figsize=(8,3))
data_layout = {
"cat" : {0: ('bisque','Low'),
1: ('lightseagreen','Medium'),
2: ('rebeccapurple','High')},
"op" : {0: ('darkturquoise','Open'),
1: ('tomato','Close')}
}
vars =("cat","op")
dfs = [df0,df1]
all_ticks = []
leg = []
for j,(v,d) in enumerate(zip(vars,dfs)):
dvals = d[v][:].astype("d")
normal = mpl.colors.Normalize(vmin=0, vmax=2.)
colors = plt.cm.Set1(0.75*normal(dvals.as_matrix()))
handles = []
for i in range(d.count()-1):
s = d[v].index.to_pydatetime()
level = d[v][i]
base = d[v].index[i]
w = s[i+1] - s[i]
patch=mpl.patches.Rectangle((base,float(j)),width=w,color=data_layout[v][level][0],height=1,fill=True)
ax.add_patch(patch)
for lev in data_layout[v]:
print data_layout[v][level]
handles.append(mpl.patches.Patch(color=data_layout[v][lev][0],label=data_layout[v][lev][1]))
all_ticks.append(j+0.5)
leg.append( plt.legend(handles=handles,loc = (3-3*j+1)))
plt.axhline(y=1.,linewidth=3,color="gray")
plt.xlim(pd.Timestamp(2017,6,1).to_pydatetime(),pd.Timestamp(2017,9,1).to_pydatetime())
plt.ylim(0,2)
ax.add_artist(leg[0]) # two legends on one axis
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d') # This fails
plt.yticks(all_ticks,vars)
plt.show()
which produces this with no dates and has jittery lines:. How do I fix this? Is there a better way entirely?
This is a way to display dates on x-axis:
In your code substitute the line that fails with this one:
ax.xaxis.set_major_formatter((mdates.DateFormatter('%Y-%m-%d')))
But I don't remember how it should look like, can you show us the end-result again?