Multiple grouped charts with altair - dataframe

My data has 4 attributes: dataset (D1/D2), model (M1/M2), layer (L1/L2), scene (S1/S2). I can make a chart grouped by scenes and then merge plots horizontally and vertically (pic above).
However, I would like to have 'double grouping' by scene and dataset, like merging the D1 and D2 plots by placing blue/orange bars from next to each other but with different opacity or pattern/hatch.
Basically something like this (pretend that the black traits are a hatch pattern).
Here is the code to reproduce the first plot
import numpy as np
import itertools
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import os
import altair as alt
alt.renderers.enable('altair_viewer')
np.random.seed(0)
################################################################################
model_keys = ['M1', 'M2']
data_keys = ['D1', 'D2']
scene_keys = ['S1', 'S2']
layer_keys = ['L1', 'L2']
ys = []
models = []
dataset = []
layers = []
scenes = []
for sc in scene_keys:
for m in model_keys:
for d in data_keys:
for l in layer_keys:
for s in range(10):
data_y = list(np.random.rand(10) / 10)
ys += data_y
scenes += [sc] * len(data_y)
models += [m] * len(data_y)
dataset += [d] * len(data_y)
layers += [l] * len(data_y)
# ------------------------------------------------------------------------------
df = pd.DataFrame({'Y': ys,
'Model': models,
'Dataset': dataset,
'Layer': layers,
'Scenes': scenes})
bars = alt.Chart(df, width=100, height=90).mark_bar().encode(
# field to group columns on
x=alt.X('Scenes:N',
title=None,
axis=alt.Axis(
grid=False,
title=None,
labels=False,
),
),
# field to use as Y values and how to calculate
y=alt.Y('Y:Q',
aggregate='mean',
axis=alt.Axis(
grid=True,
title='Y',
titleFontWeight='normal',
),
),
# field to use for sorting
order=alt.Order('Scenes',
sort='ascending',
),
# field to use for color segmentation
color=alt.Color('Scenes',
legend=alt.Legend(orient='bottom',
padding=-10,
),
title=None,
),
)
error_bars = alt.Chart(df).mark_errorbar(extent='ci').encode(
x=alt.X('Scenes:N'),
y=alt.Y('Y:Q'),
)
text = alt.Chart(df).mark_text(align='center',
baseline='line-bottom',
color='black',
dy=-5 # y-shift
).encode(
x=alt.X('Scenes:N'),
y=alt.Y('mean(Y):Q'),
text=alt.Text('mean(Y):Q', format='.1f'),
)
chart_base = bars + error_bars + text
chart_base = chart_base.facet(
# field to use to use as the set of columns to be represented in each group
column=alt.Column('Layer:N',
# header=alt.Header(
# labelFontStyle='bold',
# ),
title=None,
sort=list(set(models)), # get unique indices
),
spacing={"row": 0, "column": 15},
)
def unique(sequence):
seen = set()
return [x for x in sequence if not (x in seen or seen.add(x))]
for i, m in enumerate(unique(models)):
chart_imnet = chart_base.transform_filter(
alt.FieldEqualPredicate(field='Dataset', equal='D1'),
).transform_filter(
alt.FieldEqualPredicate(field='Model', equal=m)
)
chart_places = chart_base.transform_filter(
alt.FieldEqualPredicate(field='Dataset', equal='D2')
).transform_filter(
alt.FieldEqualPredicate(field='Model', equal=m)
)
if i == 0:
title_params = dict({'align': 'center', 'anchor': 'middle', 'dy': -10})
chart_imnet = chart_imnet.properties(title=alt.TitleParams('D1', **title_params))
chart_places = chart_places.properties(title=alt.TitleParams('D2', **title_params))
chart_places = alt.concat(chart_places,
title=alt.TitleParams(
m,
baseline='middle',
orient='right',
anchor='middle',
angle=90,
# dy=10,
dx=30 if i == 0 else 0,
),
)
if i == 0:
chart = (chart_imnet | chart_places).resolve_scale(x='shared')
else:
chart = (chart & (chart_imnet | chart_places).resolve_scale(x='shared'))
chart.save('test.html')

For now, I don't know a good answer, but once https://github.com/altair-viz/altair/pull/2528 is accepted you can use the xOffset encoding channel as such:
alt.Chart(df, height=90).mark_bar(tooltip=True).encode(
x=alt.X("Scenes:N"),
y=alt.Y("mean(Y):Q"),
color=alt.Color("Scenes:N"),
opacity=alt.Opacity("Dataset:N"),
xOffset=alt.XOffset("Dataset:N"),
column=alt.Column('Layer:N'),
row=alt.Row("Model:N")
).resolve_scale(x='independent')
Which will result in:
See Colab Notebook or Vega Editor
EDIT
To control the opacity and legend names one can do as such
alt.Chart(df, height=90).mark_bar(tooltip=True).encode(
x=alt.X("Scenes:N"),
y=alt.Y("mean(Y):Q"),
color=alt.Color("Scenes:N"),
opacity=alt.Opacity("Dataset:N",
scale=alt.Scale(domain=['D1', 'D2'],
range=[0.2, 1.0]),
legend=alt.Legend(labelExpr="datum.label == 'D1' ? 'D1 - transparent' : 'D2 - full'")),
xOffset=alt.XOffset("Dataset:N"),
column=alt.Column('Layer:N'),
row=alt.Row("Model:N")
).resolve_scale(x='independent')

Related

3d interactive graph linked with three sliders

I’m trying to create a 3d interactive graph linked with three sliders. I used dash plotly. But when I run this code, I get a blank 2d graph with sliders. If anyone can help me to find mistakes in my code, it would be very helpful. Thank you
I coded this in python.
Below is my data alonwith code
|A |C |B|
|191|11870402.57|150927.156|
|194|11534176.96|150926.613|
|200|8791715.569|150309.893|
|219|9058784.693|130344.409|
|193|11710374.76|150993.204|
|230|8966576.793|121803.204|
|196|11563137.82|147352.525|
|197|11559778.19|147360.662|
|232|8145250.015|134850.363|
|230|8960357.94|122119.87|
|241|8343604.908|118177.929|
'''
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import pandas as pd
app = Dash(name)
app.layout = html.Div([
html.H4(‘Illustrations’),
dcc.Graph(id=“graph_scatter”),
html.P(“A:”),
dcc.Slider(
id=‘3d-scatter-plot-x-range-slider’,
min=df[‘A’].min(), max=df[‘A’].max(),
value=df[‘A’].max()),
html.P(“B:”),
dcc.Slider(
id=‘3d-scatter-plot-y-range-slider’,
min=df[‘B’].min(), max=df[‘B’].max(),
value=df[‘B’].max()),
html.P(“C:”),
dcc.Slider(
id=‘3d-scatter-plot-z-range-slider’,
min=df[‘C’].min(), max=df[‘C’].max(),
value=df[‘C’].max())
])
#app.callback(
Output(‘graph’, ‘figure’),
[Input(‘3d-scatter-plot-x-range-slider’, ‘value’),
Input(‘3d-scatter-plot-y-range-slider’, ‘value’),
Input(‘3d-scatter-plot-z-range-slider’, ‘value’)
])
def update_bar_chart(slider_range_x, slider_range_y, slider_range_z):
df = pd.read_csv(‘ABC.csv’) # replace with your own data source
low_x, high_x = slider_range_x
low_y, high_y = slider_range_y
low_z, high_z = slider_range_z
mask = (df.A > low_x) &
(df.A < high_x) & (df.B > low_y) & (df.B < high_y) & (df.C > low_z) & (df.C <
high_z)
fig = px.scatter_3d(mask,
x ='A', z='C',y='B')
return fig
if name == “main”:
app.run_server(debug=False)
'''
I see some problem in your code.
First: You set id in your dcc.Graph is graph_scatter but in you callback you set it is graph
Second: You are using Slider so that you can not change the low_x, low_y, low_z but high_x, high_y, high_y. So that you can not use something like low_x, high_x = slider_range_x. Based on your code I revised as below:
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import pandas as pd
import dash_bootstrap_components as dbc
app = Dash(__name__, external_stylesheets=[dbc.themes.LUX])
app.layout = html.Div([
html.H4('Illustrations'),
dcc.Graph(id='graph_scatter',figure={}),
html.P('A:'),
dcc.Slider(
id='3d-scatter-plot-x-range-slider',
min=df['A'].min(), max=df['A'].max(),
value=df['A'].max()),
html.P('B:'),
dcc.Slider(
id='3d-scatter-plot-y-range-slider',
min=df['B'].min(), max=df['B'].max(),
value=df['B'].max()),
html.P('C:'),
dcc.Slider(
id='3d-scatter-plot-z-range-slider',
min=df['C'].min(), max=df['C'].max(),
value=df['C'].max())
])
#app.callback(
Output('graph_scatter', 'figure'),
[Input('3d-scatter-plot-x-range-slider', 'value'),
Input('3d-scatter-plot-y-range-slider', 'value'),
Input('3d-scatter-plot-z-range-slider', 'value')
])
def update_bar_chart(slider_range_x, slider_range_y, slider_range_z):
high_x = slider_range_x
high_y = slider_range_y
high_z = slider_range_z
dff = df[(df['A'] < high_x)&(df['B'] < high_y)&(df['C'] < high_z)]
print(high_x)
fig = px.scatter_3d(dff,x ='A', z='C',y='B')
return fig
if __name__ == "__main__":
app.run_server(debug=False)

Altering the X-axis in Altair

I'd like to fill the charts with selectors like the example below. Any tips on how to get this to work in a faceted chart?
np.random.seed(42)
source = pd.DataFrame(np.cumsum(np.random.rand(8, 4), 0).round(2),
columns=['A', 'B', 'C', 'D'], index=pd.RangeIndex(8, name='x'))
source = source.reset_index().melt('x', var_name='category', value_name='y')
xRange= pd.DataFrame(np.linspace(min(source['x']), max(source['x']), num=100), columns=['x'])
pts = alt.selection_multi(fields=['x'], nearest=True, on='click',empty='none')
# The basic line
main = alt.Chart(source).mark_line(interpolate='basis').encode(
x='x:Q',
y='y:Q',
).transform_filter(
alt.FieldEqualPredicate(field='category', equal='A')
)
line = alt.Chart(source).mark_line(color='Maroon').encode(
x='x:Q',
y='y:Q',
).transform_filter(
alt.FieldEqualPredicate(field='category', equal='B')
)
# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart(xRange).mark_rule(size=2).encode(
x='x:Q',
#y='y:Q',
#opacity=alt.value(0.4),
opacity = alt.condition(pts, alt.value(1.0), alt.value(0.2))
).add_selection(pts)
position = alt.Chart(xRange).mark_text(
align='right', dy=140, dx=-8, fontSize=14).encode(
x=alt.X('x'),
text=alt.Text('x',format='.1f')
).transform_filter(pts)
alt.vconcat(
main + selectors + position,
line + selectors + position
)
But ideally using facet, however i have not found a way around that you can only use a single DataFrame/source. Is there a way to use alt.sequence of impute to generate additional points on the x-axis?
pts = alt.selection_multi(fields=['x'], nearest=True, on='click',empty='none')
# The basic line
line = alt.Chart().mark_line(interpolate='basis').encode(
x='x:Q',
y='y:Q',
)
# Transparent rules across the chart.
rules = alt.Chart().mark_rule(size=2).encode(
x='x:Q',
opacity = alt.condition(pts, alt.value(1.0), alt.value(0.3))
).add_selection(pts)
text = alt.Chart().mark_text(
align='right', dy=140, dx=-8, fontSize=14).encode(
x=alt.X('x'),
text=alt.Text('x',format='.1f')
).transform_filter(pts)
alt.layer(line, rules, text, data=source).facet(
'category:N',
columns=2
)
You can use the sequence generator. It is almost the same to what you had already:
import numpy as np
import pandas as pd
import altair as alt
np.random.seed(42)
source = pd.DataFrame(np.cumsum(np.random.rand(8, 4), 0).round(2),
columns=['A', 'B', 'C', 'D'], index=pd.RangeIndex(8, name='x'))
source = source.reset_index().melt('x', var_name='category', value_name='y')
# xRange= pd.DataFrame(np.linspace(min(source['x']), max(source['x']), num=100), columns=['x'])
xRange = alt.sequence(0, 7.1, 0.1, as_='x')
pts = alt.selection_multi(fields=['x'], nearest=True, on='mouseover',empty='none')
# The basic line
line = alt.Chart().mark_line(interpolate='linear').encode(
x='x:Q',
y='y:Q',
)
# Transparent rules across the chart.
rules = alt.Chart(xRange).mark_rule(size=2).encode(
x='x:Q',
opacity = alt.condition(pts, alt.value(1.0), alt.value(0.3))
).add_selection(pts)
text = alt.Chart(xRange).mark_text(
align='right', dy=140, dx=-8, fontSize=14).encode(
x=alt.X('x:Q'),
text=alt.Text('x:Q',format='.1f')
).transform_filter(pts)
alt.layer(line, rules, text, data=source).facet(
'category:N',
columns=2
)

How to annotate subplots in Plotly inside a for loop

I am trying to annotate my subplots inside a for loop. Each subplot will have RMS value printed on the plot. I tried to do it the following way:
from plotly import tools
figg = tools.make_subplots(rows=4, cols=1)
fake_date = {"X": np.arange(1, 101, 0.5), "Y": np.sin(x), "Z": [x + 1 for x in range(10)] * 20}
fake_date = pd.DataFrame(fake_date)
fake_date.sort_values("Z")
unique_ids = fake_date['Z'].unique()
train_id, test_id = np.split(np.random.permutation(unique_ids), [int(.6 * len(unique_ids))])
for i, j in enumerate(test_id):
x_test = fake_date[fake_date['Z'].isin([test_id[i]])]
y_test = fake_date[fake_date['Z'].isin([test_id[i]])]
# Evaluate
rms_test = 0.04
r_test = 0.9
Real = {'type' : 'scatter',
'x' : x_test.X,
'y' : x_test.Y,
"mode" : 'lines+markers',
"name" : 'Real'}
figg.append_trace(Real, i+1, 1)
figg['layout'].update( annotations=[dict(x = 10,y = 0.2, text= rms_test, xref= "x1",yref="y1")] )
figg['layout'].update(height=1800, width=600, title='Testing')
pyo.iplot(figg)
This does not work, although the answer given here seems to work for others. Can anyone point out what wrong am I doing?
I generated fake date for reproducibility
I am not sure where to exactly place the RMS value, but below is a sample code which will help you achieve what you want.
We create an array annotation_arr where we store the annotations using the for loop.
We need to set the xval and yval for each of the individual axes. Remember, first axis will be x, second will be x2 so, I have written a ternary condition for that, please checkout the below code and let me know if there is any issues!
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
from plotly import tools
import numpy as np
import pandas as pd
init_notebook_mode(connected=True)
rows = 4
figg = tools.make_subplots(rows=rows, cols=1)
fake_date = {"X": np.arange(0, 100, 0.5), "Y": [np.sin(x) for x in range(200)], "Z": [x + 1 for x in range(10)] * 20}
fake_date = pd.DataFrame(fake_date)
fake_date.sort_values("Z")
unique_ids = fake_date['Z'].unique()
train_id, test_id = np.split(np.random.permutation(unique_ids), [int(.6 * len(unique_ids))])
top = 0
annotation_arr = []
for i, j in enumerate(test_id):
x_test = fake_date[fake_date['Z'].isin([test_id[i]])]
y_test = fake_date[fake_date['Z'].isin([test_id[i]])]
# Evaluate
rms_test = 0.04
r_test = 0.9
Real = {'type' : 'scatter',
'x' : x_test.X,
'y' : x_test.Y,
"mode" : 'lines+markers',
"name" : 'Real'}
top = top + 1/rows
i_val = "" if i == 0 else i + 1
annotation_arr.append(dict(x = r_test,y = top, text= rms_test, xref= "x"+str(i_val),yref="y"+str(i_val)))
figg.append_trace(Real, i+1, 1)
figg['layout'].update( annotations=annotation_arr )
figg['layout'].update(height=1800, width=600, title='Testing')
iplot(figg)

Curve fitting a function of functions in matplotlib

I have 3 functions: E (energy), P (pressure) and H (enthalpy).
The given data data.dat, contains the variables V (volume) and E (energy):
# Volume: V_C_I Energy: E_C_I
111.593876 -1.883070511360E+03
113.087568 -1.883074916825E+03
114.632273 -1.883078906679E+03
116.184030 -1.883082373429E+03
117.743646 -1.883085344696E+03
119.326853 -1.883087860954E+03
120.927806 -1.883089938181E+03
122.538335 -1.883091557526E+03
124.158641 -1.883092750745E+03
125.789192 -1.883093540824E+03
125.790261 -1.883093800747E+03
127.176327 -1.883094160364E+03
128.358654 -1.883094017730E+03
128.542807 -1.883094255789E+03
129.977279 -1.883094094751E+03
131.390610 -1.883093689121E+03
132.812287 -1.883093053342E+03
134.242765 -1.883092185844E+03
135.682211 -1.883091101112E+03
137.130792 -1.883089807766E+03
138.588565 -1.883088314435E+03
The following script, performs:
1) a curve_fit of E versus V,
2) calculates P (pressure) using the def(P) function,
3) calculates H (enthalpy) using the def(H) function. (H = E + PV).
4) Performs 3 plots with the fitting curves: E versus P, P versus V and H versus P.
When plotting the fitting curve, I have used the following:
For example, for the E versus V curve:
plt.plot(V_C_I_lin, E(V_C_I_lin, *popt_C_I)
where V_C_I_lin is a linspace of volumes between the data points.
This seems fine:
Similarly, for the P versus V curve, the analogous scheme:
plt.plot(V_C_I_lin, P(V_C_I_lin, *popt_C_I), color='grey', label='E fit Data' )
produces the desired result:
The pressure for each data point is saved in the file ./E_V_P_H__C_I.dat by the script.
However, for the H versus P curve, the analogous scheme:
plt.plot(xp_C_I, H(V_C_I_lin, *popt_C_I), color='grey', label='H fit Data' )
where xp_C_I is a linspace of pressures, does not produce the correct fit:
Why this is happening in the third case?
Code:
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import sys
import os
# Intial candidates for fit
E0_init = -941.510817926696
V0_init = 63.54960592453
B0_init = 76.3746233515232
B0_prime_init = 4.05340727164527
def E(V, E0, V0, B0, B0_prime):
return E0+ (2.293710449E+17)*(1E-21)*( (9.0/16.0)*(V0*B0) * ( (((V0/V)**(2.0/3.0)-1.0)**3.0)*B0_prime + ((V0/V)**(2.0/3.0)-1)**2 * (6.0-4.0*(V0/V)**(2.0/3.0)) ))
def P(V, E0, V0, B0, B0_prime):
f0=(3.0/2.0)*B0
f1=((V0/V)**(7.0/3.0))-((V0/V)**(5.0/3.0))
f2=((V0/V)**(2.0/3.0))-1
pressure= f0*f1*(1+(3.0/4.0)*(B0_prime-4)*f2)
return pressure
def H(V, E0, V0, B0, B0_prime):
return E(V, E0, V0, B0, B0_prime) + P(V, E0, V0, B0, B0_prime) * V
# Data (Red triangles):
V_not_p_f_unit_C_I, E_not_p_f_unit_C_I = np.loadtxt('data.dat', skiprows = 1).T
# A minor conversion of the data:
nFU_C_I = 2.0
nFU_C_II = 4.0
E_C_I = E_not_p_f_unit_C_I/nFU_C_I
V_C_I = V_not_p_f_unit_C_I/nFU_C_I
######
# Fitting and obtaining the parameters:
init_vals = [E0_init, V0_init, B0_init, B0_prime_init]
popt_C_I, pcov_C_I = curve_fit(E, V_C_I, E_C_I, p0=init_vals)
# Calculation of P:
pressures_per_F_unit_C_I = P(V_C_I, *popt_C_I)
# Calculation of H: H = E + PV
H_C_I = E_C_I + pressures_per_F_unit_C_I * V_C_I
# We save E, P and H into a file:
output_array_3 = np.vstack((E_C_I, V_C_I, pressures_per_F_unit_C_I, H_C_I)).T
np.savetxt('E_V_P_H__C_I.dat', output_array_3, header="Energy / FU (a.u.) \t Volume / FU (A^3) \t Pressure / F.U. (GPa) \t Enthalpy (a.u.)", fmt="%0.13f")
EnergyCI, VolumeCI, PressureCI, EnthalpyCI = np.loadtxt('./E_V_P_H__C_I.dat', skiprows = 1).T
# Plotting E vs V:
fig = plt.figure()
V_C_I_lin = np.linspace(VolumeCI[0], VolumeCI[-1], 100)
# Linspace for plotting the fitting curve:
V_C_I_lin = np.linspace(VolumeCI[0], VolumeCI[-1], 100)
p2, = plt.plot(V_C_I_lin, E(V_C_I_lin, *popt_C_I), color='grey', label='E fit Data' )
# Plotting the scattered points:
p1 = plt.scatter(VolumeCI, EnergyCI, color='red', marker="^", label='Data', s=100)
fontP = FontProperties()
fontP.set_size('small')
plt.legend((p1, p2 ), ("Data", 'E fit Data' ), prop=fontP)
plt.xlabel('V / Formula unit (Angstrom$^{3}$)')
plt.ylabel('E / Formula unit (a.u.)')
plt.ticklabel_format(useOffset=False)
plt.savefig('E_vs_V.pdf', bbox_inches='tight')
# Plotting P vs V:
fig = plt.figure()
# Linspace for plotting the fitting curve:
xp_C_I = np.linspace(PressureCI[-1], PressureCI[0], 100)
# Plotting the fitting curves:
p2, = plt.plot(V_C_I_lin, P(V_C_I_lin, *popt_C_I), color='grey', label='E fit Data' )
# Plotting the scattered points:
p1 = plt.scatter(VolumeCI, PressureCI, color='red', marker="^", label='Data', s=100)
fontP = FontProperties()
fontP.set_size('small')
plt.legend((p1, p2), ("Data", "E fit Data"), prop=fontP)
plt.xlabel('V / Formula unit (Angstrom$^{3}$)')
plt.ylabel('P (GPa)')
plt.ticklabel_format(useOffset=False)
plt.savefig('P_vs_V.pdf', bbox_inches='tight')
# Plotting H vs P:
fig = plt.figure()
xp_C_I = np.linspace(PressureCI[0], PressureCI[-1], 100)
V_C_I_lin = np.linspace(VolumeCI[0], VolumeCI[-1], 100)
# Linspace for plotting the fitting curve:
V_C_I_lin = np.linspace(VolumeCI[0], VolumeCI[-1], 100)
p2, = plt.plot(xp_C_I, H(V_C_I_lin, *popt_C_I), color='grey', label='H fit Data' )
# Plotting the scattered points:
p1 = plt.scatter(pressures_per_F_unit_C_I, H_C_I, color='red', marker="^", label='Data', s=100)
fontP = FontProperties()
fontP.set_size('small')
plt.legend((p1, p2 ), ("Data", 'H fit Data' ), prop=fontP)
plt.xlabel('P / Formula unit (GPa)')
plt.ylabel('H / Formula unit (a.u.)')
plt.ticklabel_format(useOffset=False)
plt.savefig('H_vs_V.pdf', bbox_inches='tight')
plt.show()
You cannot just plot H(V) versus some completely uncorrelated pressures xp_C_I.
plt.plot(xp_C_I, H(V_C_I_lin, *popt_C_I), )
Instead you need to plot H(V) against P(V), such that exactly the same V values are used for the pressure and the enthalpy:
plt.plot(P(V_C_I_lin, *popt_C_I), H(V_C_I_lin, *popt_C_I), )

Scikit-pandas, cross val score number of features

I am getting familiar with scikit and its pandas integration using the Titanic tutorial on Kaggle. I have cleaned my data and would like to make some prediction. I can do it calling a pipeline fit and transform - unfortunately I get an error trying to do the same with cross_val_score.
I am using the sklearn-pandas cross_val_score
The code is as follows:
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline([
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
X = df_train[df_train.columns.drop('Survived')]
y = df_train['Survived']
#model = pipe.fit(X = X, y = y)
#prediction = model.predict(df_train)
score = cross_val_score(pipe, X = X, y = y, scoring = 'accuracy')
df_train is a Pandas dataframe containing all my training set, including outcomes. The two commented lines:
model = pipe.fit(X = X, y = y)
prediction = model.predict(df_train)
Work fine and prediction returns me an array with predicted outcomes. Using the same with cross_val_score, I get the following error:
X has 20 features per sample; expecting 19
Full code below, can be run with the Titanic CSV files on Kaggle (https://www.kaggle.com/c/titanic/data)
#%% Libraries import
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#%% Read the data
path = 'E:/Kaggle/Titanic/Data/'
file_training = 'train.csv'
file_test = 'test.csv'
#Import the training and test dataset and concatenate them
df_training = pd.read_csv(path + file_training, header = 0, index_col = 'PassengerId')
df_test = pd.read_csv(path + file_test, header = 0, index_col = 'PassengerId')
# Work on the concatenated training and test data for feature engineering and clean-up
df = pd.concat([df_training, df_test], keys = ['train','test'])
#%% Initial data exploration and cleaning
df.describe(include = 'all')
pd.isnull(df).sum() > 0
#%% Preprocesing and Cleanup
#Create new columns with the name (to identify individuals part of a family)
df['LName'] = df['Name'].apply(lambda x:x.split(',')[0].strip())
df['FName'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[1].strip())
#Get the title
df['Title'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titleDic = {
'Master' : 'kid',
'Mlle' : 'unmarriedWoman',
'Miss' : 'unmarriedWoman',
'Ms' : 'unmarriedWoman',
'Jonkheer' : 'noble',
'Don' : 'noble',
'Dona' : 'noble',
'Sir' : 'noble',
'Lady' : 'noble',
'the Countess' : 'noble',
'Capt' : 'ranked',
'Major' : 'ranked',
'Col' : 'ranked',
'Mr' : 'standard',
'Mme' : 'standard',
'Mrs' : 'standard',
'Dr' : 'academic',
'Rev' : 'academic'
}
df['Group'] = df['Title'].map(titleDic)
#%% Working with the family size
#Get the family size
df['familySize'] = df['Parch'] + df['SibSp'] + 1
#Add a family tag (single, couple, small, large)
df['familyType'] = pd.cut(df['familySize'],
[1,2,3,5,np.inf],
labels = ['single','couple','sFamily','bFamily'],
right = False)
#%% Filling empty values
#Fill empty values with the mean or mode for the column
#Fill the missing values with mean for age per title, class and gender. Store value in AgeFull variable
agePivot = pd.DataFrame(df.groupby(['Group', 'Sex'])['Age'].median())
agePivot.columns = ['AgeFull']
df = pd.merge(df, agePivot, left_on = ['Group', 'Sex'], right_index = True)
df.loc[df['Age'].isnull(),['Age']] = df['AgeFull']
#Embark location missing values
embarkPivot = pd.DataFrame(df.groupby(['Group'])['Embarked'].agg(lambda x:x.value_counts().index[0]))
embarkPivot.columns = ['embarkFull']
df = pd.merge(df, embarkPivot, left_on = ['Group'], right_index = True)
df.loc[df['Embarked'].isnull(),['Embarked']] = df['embarkFull']
#Fill the missing fare value
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mean()
#%% Final clean-up (drop temporary columns)
df = df.drop(['AgeFull', 'embarkFull'], 1)
#%% Preparation for training
df_train = df.loc['train']
df_test = df.loc['test']
#Creation of dummy variables
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline(steps = [
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
#Uncommenting the line below fixes the code - why?
#df_train = df_train.sort_index()
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train.Survived
score = cross_val_score(pipe, X = df_train, y = df_train.Survived, scoring = 'accuracy')
This is very interesting. I have solved the issue just by sorting using the index the DataFrame before passing it to the cross_val_score in the pipeline.
df_train = df_train.sort_index()
Could anyone explain me why this would have an impact on how Scikit is working?