I am trying to place two subplots beside a dropdown but for some reason they keep going to the next row. Could someone tell me what I'm doing wrong?
import pandas as pd
pd.set_option('display.max_rows', None)
# Run this app with `python app.py`
from dash import Dash, dcc, html
#import plotly.express as px
from plotly.subplots import make_subplots
from plotly import graph_objects as go
import dash_bootstrap_components as dbc
import pandas as pd
# Create the single Dash application instance.
# - assets_ignore is passed to the app that is actually served; the original
#   set it on a separate Dash() object that was immediately discarded.
# - dash-bootstrap-components ships no CSS of its own, so a Bootstrap
#   stylesheet must be linked for dbc.Row / dbc.Col to lay out as a grid.
#   Without it the columns render as plain block elements and stack
#   vertically instead of sitting side by side.
app = Dash(
    __name__,
    assets_ignore='.*ignored.*',
    external_stylesheets=[dbc.themes.BOOTSTRAP],
)
colors = {
'grey_plot_bg':'#353535',
'background': '#111111',
'text': 'teal',
'plottext':'crimson',
"0":"silver",
"1":"#FBEC5D",
"5":"#50C878",
"10":"#40E0D0",
"15":"#A23D60",
0:"silver",
1:"#FBEC5D",
5:"#50C878",
10:"#40E0D0",
15:"#A23D60",
'Linux':'#50C878',
'Windows':'#66D3F4',
'Primer':'#FFD700',
}
# pandas dataframes used for plots
current_merged_df = pd.DataFrame({
    'student': ['scooby', 'doobie', 'doo'],
    'completed': [55, 55, 100],
})
students_topics_earned = pd.DataFrame({
    'topic': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
              'A', 'A', 'B', 'B', 'C', 'C'],
    'completed': [55, 55, 100, 95, 45, 99, 75, 64, 93, 10,
                  15, 55, 45, 78, 98, 33],
    'platform': ['Primer', 'Primer', 'Primer', 'Primer', 'Primer',
                 'Primer', 'Primer', 'Primer', 'Primer', 'Primer',
                 'Linux', 'Windows', 'Linux', 'Windows', 'Linux', 'Windows'],
})

# Class-wide average completion (rounded) per platform and topic, used for
# the scatter polar graphs of the class.
# NOTE(review): the original also computed `class_category_earned` from
# `students_category_earned`, a dataframe that is never defined in this file,
# so the script stopped with a NameError on that line. The result was not
# used anywhere visible, so it is omitted; restore it once the source
# dataframe exists.
class_topics_earned = (
    students_topics_earned
    .groupby(['platform', 'topic'])['completed']
    .mean()
    .round()
    .reset_index()
)
# functions to creating the plots
def student_topic_scatter_polar_graph(df, platform):
    """Build the scatter-polar trace for a single platform.

    Prints ``df``, keeps the rows whose ``platform`` column equals
    *platform* (ordered by ``topic``), and returns a filled
    ``go.Scatterpolar`` trace drawn in ``colors[platform]``.
    """
    print(df)
    is_platform = df['platform'] == platform
    platform_rows = df[is_platform].sort_values(by='topic', ascending=True)
    platform_color = colors[platform]
    trace = go.Scatterpolar(
        r=platform_rows.completed,
        theta=platform_rows.topic,
        fill='toself',
        name="%s - Focused Topics" % platform,
        fillcolor=platform_color,
        opacity=0.6,
        line={'color': platform_color},
        mode='markers',
    )
    return trace
fig = go.Figure()
# calling the plot functions
class_linux_topic_scatter_polar_fig = student_topic_scatter_polar_graph(class_topics_earned,'Linux')
class_windows_topic_scatter_polar_fig = student_topic_scatter_polar_graph(class_topics_earned,'Windows')
class_Primer_topic_scatter_polar_fig = student_topic_scatter_polar_graph(class_topics_earned,'Primer')
class_topic_subplot = make_subplots(rows=2, cols=2,
subplot_titles=['Primer','Linux','Windows'],
specs=[
[{"rowspan": 1,"colspan":2,"type": "polar"},None],
[ {"rowspan": 1,"type": "polar"}, {"rowspan": 1,"type": "polar"}]
])
class_topic_subplot.add_trace(class_Primer_topic_scatter_polar_fig,row=1,col=1)
class_topic_subplot.add_trace(class_windows_topic_scatter_polar_fig,row=2,col=1)
class_topic_subplot.add_trace(class_linux_topic_scatter_polar_fig,row=2,col=2)
class_topic_subplot.update_layout(
autosize=False,
width=800,
height=600,)
def populate_student_selection_dropdown(current_merged_df):
    """Return the distinct ``student`` values found in the given records.

    The records are loaded with ``pd.DataFrame.from_records``; the result
    lists each student once, in order of first appearance. Returns ``None``
    when no records are supplied.
    """
    if current_merged_df is None:
        return None
    student_column = pd.DataFrame.from_records(current_merged_df)['student']
    return list(student_column.unique())
student_selection_card = dbc.Card(
[
html.H6("Select Individual Students for Analysis:",className="card-text"),
dcc.Dropdown(['scoobie','doobie','doo'],id="student_selection_dropdown",multi=True,style={'display':False},persistence=True)
]
)
# creating card that the subplots will be in
card2 = dbc.Card(
[
dbc.CardBody([
html.H4("Student Name", className="card-title"),
dcc.Graph(figure=class_topic_subplot),
])
]
)
# second page layout
class_analysis_layout = dbc.Container([
dbc.Row(
[
dbc.Col([card2],width=3),
dbc.Col([student_selection_card],width=3),
dbc.Col([card2],width=3)
])
])
app.layout = html.Div([
class_analysis_layout
],
)
if __name__ == '__main__':
app.run_server(host="0.0.0.0", port=8070, debug=True)
If you take a look at the class_analysis_layout, that is where I believe I'm having the issue. I looked to see if there was a width property for the dropdown element but did not find one. I would have thought that setting the elements up in columns and setting the column width would do it, but no luck. Any advice?
Related
I'd like to fill the charts with selectors like the example below. Any tips on how to get this to work in a faceted chart?
np.random.seed(42)
source = pd.DataFrame(np.cumsum(np.random.rand(8, 4), 0).round(2),
columns=['A', 'B', 'C', 'D'], index=pd.RangeIndex(8, name='x'))
source = source.reset_index().melt('x', var_name='category', value_name='y')
xRange= pd.DataFrame(np.linspace(min(source['x']), max(source['x']), num=100), columns=['x'])
pts = alt.selection_multi(fields=['x'], nearest=True, on='click',empty='none')
# The basic line
main = alt.Chart(source).mark_line(interpolate='basis').encode(
x='x:Q',
y='y:Q',
).transform_filter(
alt.FieldEqualPredicate(field='category', equal='A')
)
line = alt.Chart(source).mark_line(color='Maroon').encode(
x='x:Q',
y='y:Q',
).transform_filter(
alt.FieldEqualPredicate(field='category', equal='B')
)
# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart(xRange).mark_rule(size=2).encode(
x='x:Q',
#y='y:Q',
#opacity=alt.value(0.4),
opacity = alt.condition(pts, alt.value(1.0), alt.value(0.2))
).add_selection(pts)
position = alt.Chart(xRange).mark_text(
align='right', dy=140, dx=-8, fontSize=14).encode(
x=alt.X('x'),
text=alt.Text('x',format='.1f')
).transform_filter(pts)
alt.vconcat(
main + selectors + position,
line + selectors + position
)
Ideally I would use facet, but I have not found a way around the restriction that a faceted chart can only use a single DataFrame/source. Is there a way to use alt.sequence or impute to generate additional points on the x-axis?
pts = alt.selection_multi(fields=['x'], nearest=True, on='click',empty='none')
# The basic line
line = alt.Chart().mark_line(interpolate='basis').encode(
x='x:Q',
y='y:Q',
)
# Transparent rules across the chart.
rules = alt.Chart().mark_rule(size=2).encode(
x='x:Q',
opacity = alt.condition(pts, alt.value(1.0), alt.value(0.3))
).add_selection(pts)
text = alt.Chart().mark_text(
align='right', dy=140, dx=-8, fontSize=14).encode(
x=alt.X('x'),
text=alt.Text('x',format='.1f')
).transform_filter(pts)
alt.layer(line, rules, text, data=source).facet(
'category:N',
columns=2
)
You can use the sequence generator. It is almost the same as what you already had:
import numpy as np
import pandas as pd
import altair as alt
np.random.seed(42)
source = pd.DataFrame(np.cumsum(np.random.rand(8, 4), 0).round(2),
columns=['A', 'B', 'C', 'D'], index=pd.RangeIndex(8, name='x'))
source = source.reset_index().melt('x', var_name='category', value_name='y')
# xRange= pd.DataFrame(np.linspace(min(source['x']), max(source['x']), num=100), columns=['x'])
xRange = alt.sequence(0, 7.1, 0.1, as_='x')
pts = alt.selection_multi(fields=['x'], nearest=True, on='mouseover',empty='none')
# The basic line
line = alt.Chart().mark_line(interpolate='linear').encode(
x='x:Q',
y='y:Q',
)
# Transparent rules across the chart.
rules = alt.Chart(xRange).mark_rule(size=2).encode(
x='x:Q',
opacity = alt.condition(pts, alt.value(1.0), alt.value(0.3))
).add_selection(pts)
text = alt.Chart(xRange).mark_text(
align='right', dy=140, dx=-8, fontSize=14).encode(
x=alt.X('x:Q'),
text=alt.Text('x:Q',format='.1f')
).transform_filter(pts)
alt.layer(line, rules, text, data=source).facet(
'category:N',
columns=2
)
My data has 4 attributes: dataset (D1/D2), model (M1/M2), layer (L1/L2), scene (S1/S2). I can make a chart grouped by scenes and then merge plots horizontally and vertically (pic above).
However, I would like to have 'double grouping' by scene and dataset, i.e. merging the D1 and D2 plots by placing the blue/orange bars next to each other but with a different opacity or pattern/hatch.
Basically something like this (pretend that the black lines are a hatch pattern).
Here is the code to reproduce the first plot
import numpy as np
import itertools
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import os
import altair as alt
alt.renderers.enable('altair_viewer')
np.random.seed(0)
################################################################################
model_keys = ['M1', 'M2']
data_keys = ['D1', 'D2']
scene_keys = ['S1', 'S2']
layer_keys = ['L1', 'L2']
ys = []
models = []
dataset = []
layers = []
scenes = []
for sc in scene_keys:
for m in model_keys:
for d in data_keys:
for l in layer_keys:
for s in range(10):
data_y = list(np.random.rand(10) / 10)
ys += data_y
scenes += [sc] * len(data_y)
models += [m] * len(data_y)
dataset += [d] * len(data_y)
layers += [l] * len(data_y)
# ------------------------------------------------------------------------------
df = pd.DataFrame({'Y': ys,
'Model': models,
'Dataset': dataset,
'Layer': layers,
'Scenes': scenes})
bars = alt.Chart(df, width=100, height=90).mark_bar().encode(
# field to group columns on
x=alt.X('Scenes:N',
title=None,
axis=alt.Axis(
grid=False,
title=None,
labels=False,
),
),
# field to use as Y values and how to calculate
y=alt.Y('Y:Q',
aggregate='mean',
axis=alt.Axis(
grid=True,
title='Y',
titleFontWeight='normal',
),
),
# field to use for sorting
order=alt.Order('Scenes',
sort='ascending',
),
# field to use for color segmentation
color=alt.Color('Scenes',
legend=alt.Legend(orient='bottom',
padding=-10,
),
title=None,
),
)
error_bars = alt.Chart(df).mark_errorbar(extent='ci').encode(
x=alt.X('Scenes:N'),
y=alt.Y('Y:Q'),
)
text = alt.Chart(df).mark_text(align='center',
baseline='line-bottom',
color='black',
dy=-5 # y-shift
).encode(
x=alt.X('Scenes:N'),
y=alt.Y('mean(Y):Q'),
text=alt.Text('mean(Y):Q', format='.1f'),
)
chart_base = bars + error_bars + text
chart_base = chart_base.facet(
    # field whose values define the facet columns
    column=alt.Column(
        'Layer:N',
        # header=alt.Header(
        #     labelFontStyle='bold',
        # ),
        title=None,
        # Order the facets by the layer values themselves. The original
        # passed list(set(models)): model names, in arbitrary set order,
        # none of which is a value of the Layer field.
        sort=layer_keys,
    ),
    spacing={"row": 0, "column": 15},
)
def unique(sequence):
    """Return the distinct items of *sequence* in first-seen order."""
    # dict keys are unique and keep insertion order, so this drops repeats
    # while preserving the position of each item's first occurrence.
    return list(dict.fromkeys(sequence))
for i, m in enumerate(unique(models)):
chart_imnet = chart_base.transform_filter(
alt.FieldEqualPredicate(field='Dataset', equal='D1'),
).transform_filter(
alt.FieldEqualPredicate(field='Model', equal=m)
)
chart_places = chart_base.transform_filter(
alt.FieldEqualPredicate(field='Dataset', equal='D2')
).transform_filter(
alt.FieldEqualPredicate(field='Model', equal=m)
)
if i == 0:
title_params = dict({'align': 'center', 'anchor': 'middle', 'dy': -10})
chart_imnet = chart_imnet.properties(title=alt.TitleParams('D1', **title_params))
chart_places = chart_places.properties(title=alt.TitleParams('D2', **title_params))
chart_places = alt.concat(chart_places,
title=alt.TitleParams(
m,
baseline='middle',
orient='right',
anchor='middle',
angle=90,
# dy=10,
dx=30 if i == 0 else 0,
),
)
if i == 0:
chart = (chart_imnet | chart_places).resolve_scale(x='shared')
else:
chart = (chart & (chart_imnet | chart_places).resolve_scale(x='shared'))
chart.save('test.html')
For now, I don't know a good answer, but once https://github.com/altair-viz/altair/pull/2528 is accepted you can use the xOffset encoding channel as such:
alt.Chart(df, height=90).mark_bar(tooltip=True).encode(
x=alt.X("Scenes:N"),
y=alt.Y("mean(Y):Q"),
color=alt.Color("Scenes:N"),
opacity=alt.Opacity("Dataset:N"),
xOffset=alt.XOffset("Dataset:N"),
column=alt.Column('Layer:N'),
row=alt.Row("Model:N")
).resolve_scale(x='independent')
Which will result in:
See Colab Notebook or Vega Editor
EDIT
To control the opacity and legend names one can do as such
alt.Chart(df, height=90).mark_bar(tooltip=True).encode(
x=alt.X("Scenes:N"),
y=alt.Y("mean(Y):Q"),
color=alt.Color("Scenes:N"),
opacity=alt.Opacity("Dataset:N",
scale=alt.Scale(domain=['D1', 'D2'],
range=[0.2, 1.0]),
legend=alt.Legend(labelExpr="datum.label == 'D1' ? 'D1 - transparent' : 'D2 - full'")),
xOffset=alt.XOffset("Dataset:N"),
column=alt.Column('Layer:N'),
row=alt.Row("Model:N")
).resolve_scale(x='independent')
I have a data frame consisting of a mixture of NaN's and strings e.g
data = {'String1':['NaN', 'tree', 'car', 'tree'],
'String2':['cat','dog','car','tree'],
'String3':['fish','tree','NaN','tree']}
ddf = pd.DataFrame(data)
I want to
1:count the total number of items and put in a new data frame e.g
NaN=2
tree=5
car=2
fish=1
cat=1
dog=1
2:Count the total number of items when compared to a separate, longer list (a column of another data frame), e.g
df['compare'] =
NaN
tree
car
fish
cat
dog
rabbit
Pear
Orange
snow
rain
Thanks
Jason
For the first question:
from collections import Counter

data = {
    "String1": ["NaN", "tree", "car", "tree"],
    "String2": ["cat", "dog", "car", "tree"],
    "String3": ["fish", "tree", "NaN", "tree"],
}
ddf = pd.DataFrame(data)

# Question 1: flatten every cell into one list and tally the values.
all_cells = ddf.stack().tolist()
a = Counter(all_cells)
# One row per distinct value, with a single 'Count' column.
df_result = pd.Series(a, name='Count').to_frame()

# Question 2: look up the count of each entry of the longer comparison list;
# entries that never occur in ddf map to NaN.
df = pd.DataFrame({
    'vals': ['NaN', 'tree', 'car', 'fish', 'cat', 'dog',
             'rabbit', 'Pear', 'Orange', 'snow', 'rain'],
})
count_by_value = df_result['Count'].to_dict()
df_counts = df['vals'].map(count_by_value)
This should do it :)
You can use the following code to count the items across the whole data frame.
import pandas as pd
data = {'String1':['NaN', 'tree', 'car', 'tree'],
'String2':['cat','dog','car','tree'],
'String3':['fish','tree','NaN','tree']}
df = pd.DataFrame(data)
def get_counts(df: pd.DataFrame) -> dict:
    """Tally how often each value occurs across all columns of *df*.

    Each column's ``value_counts()`` is folded into one running total,
    keyed by value.
    """
    totals = {}
    for column in df.columns:
        per_column = df[column].value_counts().to_dict()
        for value, occurrences in per_column.items():
            totals[value] = totals.get(value, 0) + occurrences
    return totals
counts = get_counts(df)
Output
>>> print(counts)
{'tree': 5, 'car': 2, 'NaN': 2, 'cat': 1, 'dog': 1, 'fish': 1}
I am trying to annotate my subplots inside a for loop. Each subplot will have RMS value printed on the plot. I tried to do it the following way:
from plotly import tools
figg = tools.make_subplots(rows=4, cols=1)
fake_date = {"X": np.arange(1, 101, 0.5), "Y": np.sin(x), "Z": [x + 1 for x in range(10)] * 20}
fake_date = pd.DataFrame(fake_date)
fake_date.sort_values("Z")
unique_ids = fake_date['Z'].unique()
train_id, test_id = np.split(np.random.permutation(unique_ids), [int(.6 * len(unique_ids))])
for i, j in enumerate(test_id):
x_test = fake_date[fake_date['Z'].isin([test_id[i]])]
y_test = fake_date[fake_date['Z'].isin([test_id[i]])]
# Evaluate
rms_test = 0.04
r_test = 0.9
Real = {'type' : 'scatter',
'x' : x_test.X,
'y' : x_test.Y,
"mode" : 'lines+markers',
"name" : 'Real'}
figg.append_trace(Real, i+1, 1)
figg['layout'].update( annotations=[dict(x = 10,y = 0.2, text= rms_test, xref= "x1",yref="y1")] )
figg['layout'].update(height=1800, width=600, title='Testing')
pyo.iplot(figg)
This does not work, although the answer given here seems to work for others. Can anyone point out what I am doing wrong?
I generated fake data for reproducibility.
I am not sure exactly where to place the RMS value, but below is some sample code which will help you achieve what you want.
We create an array annotation_arr in which we store the annotations inside the for loop.
We need to set the xref and yref for each of the individual axes. Remember, the first axis will be x, the second will be x2, and so on, so I have written a ternary condition for that. Please check out the code below and let me know if there are any issues!
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
from plotly import tools
import numpy as np
import pandas as pd
init_notebook_mode(connected=True)
rows = 4
figg = tools.make_subplots(rows=rows, cols=1)
fake_date = {"X": np.arange(0, 100, 0.5), "Y": [np.sin(x) for x in range(200)], "Z": [x + 1 for x in range(10)] * 20}
fake_date = pd.DataFrame(fake_date)
fake_date.sort_values("Z")
unique_ids = fake_date['Z'].unique()
train_id, test_id = np.split(np.random.permutation(unique_ids), [int(.6 * len(unique_ids))])
top = 0
annotation_arr = []
for i, j in enumerate(test_id):
x_test = fake_date[fake_date['Z'].isin([test_id[i]])]
y_test = fake_date[fake_date['Z'].isin([test_id[i]])]
# Evaluate
rms_test = 0.04
r_test = 0.9
Real = {'type' : 'scatter',
'x' : x_test.X,
'y' : x_test.Y,
"mode" : 'lines+markers',
"name" : 'Real'}
top = top + 1/rows
i_val = "" if i == 0 else i + 1
annotation_arr.append(dict(x = r_test,y = top, text= rms_test, xref= "x"+str(i_val),yref="y"+str(i_val)))
figg.append_trace(Real, i+1, 1)
figg['layout'].update( annotations=annotation_arr )
figg['layout'].update(height=1800, width=600, title='Testing')
iplot(figg)
I am getting familiar with scikit and its pandas integration using the Titanic tutorial on Kaggle. I have cleaned my data and would like to make some prediction. I can do it calling a pipeline fit and transform - unfortunately I get an error trying to do the same with cross_val_score.
I am using the sklearn-pandas cross_val_score
The code is as follows:
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline([
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
X = df_train[df_train.columns.drop('Survived')]
y = df_train['Survived']
#model = pipe.fit(X = X, y = y)
#prediction = model.predict(df_train)
score = cross_val_score(pipe, X = X, y = y, scoring = 'accuracy')
df_train is a Pandas dataframe containing all my training set, including outcomes. The two commented lines:
model = pipe.fit(X = X, y = y)
prediction = model.predict(df_train)
Work fine and prediction returns me an array with predicted outcomes. Using the same with cross_val_score, I get the following error:
X has 20 features per sample; expecting 19
Full code below, can be run with the Titanic CSV files on Kaggle (https://www.kaggle.com/c/titanic/data)
#%% Libraries import
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#%% Read the data
path = 'E:/Kaggle/Titanic/Data/'
file_training = 'train.csv'
file_test = 'test.csv'
#Import the training and test dataset and concatenate them
df_training = pd.read_csv(path + file_training, header = 0, index_col = 'PassengerId')
df_test = pd.read_csv(path + file_test, header = 0, index_col = 'PassengerId')
# Work on the concatenated training and test data for feature engineering and clean-up
df = pd.concat([df_training, df_test], keys = ['train','test'])
#%% Initial data exploration and cleaning
df.describe(include = 'all')
pd.isnull(df).sum() > 0
#%% Preprocesing and Cleanup
#Create new columns with the name (to identify individuals part of a family)
df['LName'] = df['Name'].apply(lambda x:x.split(',')[0].strip())
df['FName'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[1].strip())
#Get the title
df['Title'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titleDic = {
'Master' : 'kid',
'Mlle' : 'unmarriedWoman',
'Miss' : 'unmarriedWoman',
'Ms' : 'unmarriedWoman',
'Jonkheer' : 'noble',
'Don' : 'noble',
'Dona' : 'noble',
'Sir' : 'noble',
'Lady' : 'noble',
'the Countess' : 'noble',
'Capt' : 'ranked',
'Major' : 'ranked',
'Col' : 'ranked',
'Mr' : 'standard',
'Mme' : 'standard',
'Mrs' : 'standard',
'Dr' : 'academic',
'Rev' : 'academic'
}
df['Group'] = df['Title'].map(titleDic)
#%% Working with the family size
#Get the family size
df['familySize'] = df['Parch'] + df['SibSp'] + 1
#Add a family tag (single, couple, small, large)
df['familyType'] = pd.cut(df['familySize'],
[1,2,3,5,np.inf],
labels = ['single','couple','sFamily','bFamily'],
right = False)
#%% Filling empty values
# Missing values are filled from group-level statistics.
# Age: median age per (Group, Sex), merged in as the temporary AgeFull column.
# NOTE(review): these merges may change the row order of df - the question
# reports that sorting df_train by index afterwards makes cross_val_score
# work. Confirm before relying on row order downstream.
agePivot = pd.DataFrame(df.groupby(['Group', 'Sex'])['Age'].median())
agePivot.columns = ['AgeFull']
df = pd.merge(df, agePivot, left_on = ['Group', 'Sex'], right_index = True)
df.loc[df['Age'].isnull(),['Age']] = df['AgeFull']
# Embarked: most frequent port within each Group (temporary embarkFull column)
embarkPivot = pd.DataFrame(df.groupby(['Group'])['Embarked'].agg(lambda x:x.value_counts().index[0]))
embarkPivot.columns = ['embarkFull']
df = pd.merge(df, embarkPivot, left_on = ['Group'], right_index = True)
df.loc[df['Embarked'].isnull(),['Embarked']] = df['embarkFull']
# Fare: overall mean fare
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mean()
#%% Final clean-up (drop temporary columns)
# Use the columns= keyword: passing the axis positionally (drop(labels, 1))
# was deprecated in pandas 1.3 and removed in pandas 2.0.
df = df.drop(columns = ['AgeFull', 'embarkFull'])
#%% Preparation for training
df_train = df.loc['train']
df_test = df.loc['test']
#Creation of dummy variables
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline(steps = [
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
#Uncommenting the line below fixes the code - why?
#df_train = df_train.sort_index()
# Features are every column except the target. Pass exactly these to
# cross_val_score rather than the full frame, which still contains Survived;
# this also matches the shorter listing of the same call earlier in the post.
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train['Survived']
score = cross_val_score(pipe, X = X, y = y, scoring = 'accuracy')
This is very interesting. I have solved the issue just by sorting the DataFrame by its index before passing it to cross_val_score with the pipeline.
df_train = df_train.sort_index()
Could anyone explain to me why this would have an impact on how scikit-learn works?