median imputation by groups in pandas (handling group medians that are NaN)

I have the following DataFrame train:
train = {'NAME_EDUCATION_TYPE': {5: 'Secondary / secondary special',
6: 'Higher education',
7: 'Higher education',
8: 'Secondary / secondary special',
9: 'Secondary / secondary special',
10: 'Higher education',
11: 'Secondary / secondary special',
12: 'Secondary / secondary special',
13: 'Secondary / secondary special',
14: 'Secondary / secondary special'},
'OCCUPATION_TYPE': {5: 'Laborers',
6: 'Accountants',
7: 'Managers',
8: nan,
9: 'Laborers',
10: 'Core staff',
11: nan,
12: 'Laborers',
13: 'Drivers',
14: 'Laborers'},
'AGE_GROUP': {5: '45-60',
6: '21-45',
7: '45-60',
8: '45-60',
9: '21-45',
10: '21-45',
11: '45-60',
12: '21-45',
13: '21-45',
14: '21-45'},
'DAYS_EMPLOYED': {5: -1588.0,
6: -3130.0,
7: -449.0,
8: nan,
9: -2019.0,
10: -679.0,
11: nan,
12: -2717.0,
13: -3028.0,
14: -203.0},
'DAYS_EMPLOYED_ANOM': {5: False,
6: False,
7: False,
8: True,
9: False,
10: False,
11: True,
12: False,
13: False,
14: False},
'DAYS_LAST_PHONE_CHANGE': {5: -2536.0,
6: -1562.0,
7: -1070.0,
8: 0.0,
9: -1673.0,
10: -844.0,
11: -2396.0,
12: -2370.0,
13: -4.0,
14: -188.0}}
I have a few NaN in the column DAYS_EMPLOYED. They are flagged as "True" in the column DAYS_EMPLOYED_ANOM.
I want to impute these NaN using the median of DAYS_EMPLOYED within groups defined by the following columns: NAME_EDUCATION_TYPE, OCCUPATION_TYPE and AGE_GROUP.
I believe this can be done in a few lines in pandas, but I could not figure it out. I tried to adapt the following code, which I found in an example of mean imputation for a Series, but the NaN values do not get imputed.
fill_median = lambda g: g.fillna(g.median())
train.loc[train['DAYS_EMPLOYED_ANOM'] == True, 'DAYS_EMPLOYED'] = train.groupby(['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_GROUP'])['DAYS_EMPLOYED'].apply(fill_median)
I also tried to apply the code from this post without success:
How can I impute values to outlier cells based on groups?

You could do:
train['DAYS_EMPLOYED'] = (train.groupby(['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_GROUP'],
                                        dropna=False)['DAYS_EMPLOYED']
                               .apply(lambda x: x.fillna(x.median()))
                          )
However, note that this will not work on your particular dataset, as you need at least one non-NaN value per group to be able to calculate the median.
You could use the population median instead:
train['DAYS_EMPLOYED'] = (train.groupby(['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_GROUP'],
                                        dropna=False)['DAYS_EMPLOYED']
                               .apply(lambda x: x.fillna(train['DAYS_EMPLOYED'].median()))
                          )
Here is a hybrid approach that tries the group median and falls back to the population median when the group is all NaN:
import numpy as np

def median(s):
    m = s.median()
    if np.isnan(m):  # the whole group is NaN: fall back to the population median
        m = train['DAYS_EMPLOYED'].median()
    return m

train['DAYS_EMPLOYED'] = (train.groupby(['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_GROUP'],
                                        dropna=False)['DAYS_EMPLOYED']
                               .apply(lambda x: x.fillna(median(x)))
                          )
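A more compact variant of the same idea (a minimal sketch, assuming train is the DataFrame built from the dict above, i.e. train = pd.DataFrame(train)) uses transform, which returns a result aligned to the original index, and chains a second fillna for the all-NaN groups:
import pandas as pd

# transform('median') broadcasts each group's median back to the original
# rows, so it aligns row-by-row with train['DAYS_EMPLOYED'].
group_median = train.groupby(
    ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_GROUP'], dropna=False
)['DAYS_EMPLOYED'].transform('median')

train['DAYS_EMPLOYED'] = (
    train['DAYS_EMPLOYED']
    .fillna(group_median)                     # group median where available
    .fillna(train['DAYS_EMPLOYED'].median())  # population median otherwise
)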

Related

How to keep the number and names of columns in training and test dataset equal after one hot encoding?

Shape of the original dataset is 82580×30 with multiple string columns. Example dataset:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
df = pd.DataFrame({'Nationality': {0: 'DEU', 1: 'PRT', 2: 'PRT', 3: 'PRT', 4: 'FRA', 5: 'DEU', 6: 'CHE', 7: 'DEU', 8: 'GBR', 9: 'AUT', 10: 'PRT', 11: 'FRA', 12: 'OTR', 13: 'GBR', 14: 'ESP', 15: 'PRT', 16: 'OTR', 17: 'PRT', 18: 'ESP', 19: 'AUT'},
'Age': {0: 27.0, 1: 45.46, 2: 45.46, 3: 58.0, 4: 57.0, 5: 27.0, 6: 49.0, 7: 62.0, 8: 44.0, 9: 61.0, 10: 54.0, 11: 53.0, 12: 50.0, 13: 30.0, 14: 51.0, 15: 45.46, 16: 40.0, 17: 49.0, 18: 49.0, 19: 14.0},
'DaysSinceCreation': {0: 370, 1: 213, 2: 206, 3: 1018, 4: 835, 5: 52, 6: 597, 7: 217, 8: 999, 9: 1004, 10: 402, 11: 879, 12: 393, 13: 923, 14: 249, 15: 52, 16: 159, 17: 929, 18: 49, 19: 131},
'BookingsCheckedIn': {0: 1, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 2, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 0, 16: 0, 17: 1, 18: 1, 19: 0}})
# Encoding Variables
transformer = make_column_transformer((OneHotEncoder(sparse=False), ['Nationality']), remainder='passthrough')
transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
transformed_df.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df = pd.concat([transformed_df, df], axis=1)
# Remove old columns
df.drop(['Nationality'], axis = 1, inplace = True)
print('The shape after encoding: {}'.format(df.shape))
print(df.columns.unique())
The shape after encoding: (20, 14)
Index(['onehotencoder__Nationality_AUT', 'onehotencoder__Nationality_CHE',
'onehotencoder__Nationality_DEU', 'onehotencoder__Nationality_ESP',
'onehotencoder__Nationality_FRA', 'onehotencoder__Nationality_GBR',
'onehotencoder__Nationality_OTR', 'onehotencoder__Nationality_PRT',
'remainder__Age', 'remainder__DaysSinceCreation',
'remainder__BookingsCheckedIn', 'Age', 'DaysSinceCreation',
'BookingsCheckedIn'],
dtype='object')
After modeling, trying to test on a completely different test set:
df = pd.DataFrame({'Nationality': {0: 'CAN', 1: 'DEU', 2: 'PRT', 3: 'PRT', 4: 'FRA'},
'Age': {0: 27.0, 1: 29.0, 2: 24.0, 3: 24.0, 4: 46.0},
'DaysSinceCreation': {0: 222, 1: 988, 2: 212, 3: 685, 4: 1052},
'BookingsCheckedIn': {0: 0, 1: 1, 2: 1, 3: 1, 4: 0}})
# Encoding Variables
transformer = make_column_transformer(
(OneHotEncoder(sparse=False), ['Nationality']),
remainder='passthrough')
transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
transformed_df.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df = pd.concat([transformed_df, df], axis=1)
# Remove old columns
df.drop(['Nationality'], axis = 1, inplace = True)
print('The shape after encoding: {}'.format(df.shape))
print(df.columns.unique())
The shape after encoding: (5, 10)
Index(['onehotencoder__Nationality_CAN', 'onehotencoder__Nationality_DEU',
'onehotencoder__Nationality_FRA', 'onehotencoder__Nationality_PRT',
'remainder__Age', 'remainder__DaysSinceCreation',
'remainder__BookingsCheckedIn', 'Age', 'DaysSinceCreation',
'BookingsCheckedIn'],
dtype='object')
As can be seen, the test set has some features ('CAN') that were not present in the original training set, and many features of the training set are not present in the test set. If I only use .values of X_train, y_train, X_test, y_test, I can run everything from logistic regression to a neural net with >99% accuracy, but that feels like cheating and is not working out with decision trees. How do we deal with this?
I would like to contribute 2 inputs:
(1) The test set's categories should be a subset of the training set's, so the unknown Nationality 'CAN' is not allowed. Either try to include the new 'CAN' in the training data, or try to replace it with 'GBR' in the test data.
(2) You should not call fit_transform() separately on the training and test sets. The right way is to fit on the training set, then transform the training set and transform the test set. To illustrate:
# Encoding Variables
transformer = make_column_transformer((OneHotEncoder(sparse=False), ['Nationality']), remainder='passthrough')
####transformed = transformer.fit_transform(df) #delete this
transformer.fit(df) #use this instead
transformed = transformer.transform(df) #use this instead
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
<truncated>
print('The shape after encoding: {}'.format(df.shape))
The shape after encoding: (20, 14)
Second part: note that I have replaced 'CAN' with 'GBR', and I only use the previously fitted transformer to transform the test set:
df = pd.DataFrame({'Nationality': {0: 'GBR', 1: 'DEU', 2: 'PRT', 3: 'PRT', 4: 'FRA'},
'Age': {0: 27.0, 1: 29.0, 2: 24.0, 3: 24.0, 4: 46.0},
'DaysSinceCreation': {0: 222, 1: 988, 2: 212, 3: 685, 4: 1052},
'BookingsCheckedIn': {0: 0, 1: 1, 2: 1, 3: 1, 4: 0}})
# Encoding Variables
####transformer = make_column_transformer((OneHotEncoder(sparse=False), ['Nationality']), remainder='passthrough') #do not repeat, use the previous fitted model
####transformed = transformer.fit_transform(df) #delete this, NO fitting on test set
transformed = transformer.transform(df) #only do transform on test set
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
<truncated>
print('The shape after encoding: {}'.format(df.shape))
The shape after encoding: (5, 14)
So the number of columns (14) is the same for both the training set and the test set.
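Putting both inputs together, here is a minimal end-to-end sketch (train_df and test_df are hypothetical names for the two DataFrames from the question). As an alternative to manually replacing 'CAN', OneHotEncoder's handle_unknown='ignore' option encodes categories unseen during fitting as all-zero rows instead of raising an error:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# note: sparse= was renamed sparse_output= in newer scikit-learn versions
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Nationality']),
    remainder='passthrough')

X_train = transformer.fit_transform(train_df)  # fit on the training set only
X_test = transformer.transform(test_df)        # reuse the fitted transformer

# both now have the same columns, in the same order
assert X_train.shape[1] == X_test.shape[1]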

Using shift function along with max function Pandas

I am attempting to create a technical indicator ('Supertrend') using Pandas. The formula for this column is recursive.
(For people familiar with Pinescript, this column will replicate the result of this Pinescript function):
df['st_trendup'] = np.select(df['Close'].shift() > df['st_trendup'].shift(),df[['st_up','st_trendup'.shift()]].max(axis=1),df['st_up'])
The problem occurs in the true part of the np.select() because I cannot call .shift() on a string.
Normally, I would make a new column that uses .shift() beforehand, but since this is recursive, I have to do it all in one line.
If possible, I'd like to avoid loops for speed; I prefer solutions using native pandas or numpy functions.
What I am looking for: a way to take the row-wise max that can accommodate a .shift() call.
Columns that are used:
def tr(high, low, close1):
    return max(high - low, abs(high - close1), abs(low - close1))

df['st_closeprev'] = df['Close'].shift()
df['st_hl2'] = (df['High'] + df['Low']) / 2
df['st_tr'] = df.apply(lambda row: tr(row['High'], row['Low'], row['st_closeprev']), axis=1)
# 'pd' and 'factor' here are the Supertrend period and multiplier (not defined in the snippet)
df['st_atr'] = df['st_tr'].ewm(alpha=1/pd, adjust=False, min_periods=pd).mean()
df['st_up'] = df['st_hl2'] - factor * df['st_atr']
df['st_dn'] = df['st_hl2'] + factor * df['st_atr']
df['st_trendup'] = np.select(df['Close'].shift() > df['st_trendup'].shift(),df[['st_up','st_trendup'.shift()]].max(axis=1),df['st_up'])
Sample data obtained from df.to_dict():
{'Date': {0: Timestamp('2021-01-01 09:15:00'),
1: Timestamp('2021-01-01 09:30:00'),
2: Timestamp('2021-01-01 09:45:00'),
3: Timestamp('2021-01-01 10:00:00'),
4: Timestamp('2021-01-01 10:15:00'),
5: Timestamp('2021-01-01 10:30:00'),
6: Timestamp('2021-01-01 10:45:00'),
7: Timestamp('2021-01-01 11:00:00'),
8: Timestamp('2021-01-01 11:15:00'),
9: Timestamp('2021-01-01 11:30:00'),
10: Timestamp('2021-01-01 11:45:00'),
11: Timestamp('2021-01-01 12:00:00'),
12: Timestamp('2021-01-01 12:15:00'),
13: Timestamp('2021-01-01 12:30:00'),
14: Timestamp('2021-01-01 12:45:00'),
15: Timestamp('2021-01-01 13:00:00'),
16: Timestamp('2021-01-01 13:15:00'),
17: Timestamp('2021-01-01 13:30:00'),
18: Timestamp('2021-01-01 13:45:00'),
19: Timestamp('2021-01-01 14:00:00'),
20: Timestamp('2021-01-01 14:15:00'),
21: Timestamp('2021-01-01 14:30:00'),
22: Timestamp('2021-01-01 14:45:00'),
23: Timestamp('2021-01-01 15:00:00'),
24: Timestamp('2021-01-01 15:15:00'),
25: Timestamp('2021-01-04 09:15:00')},
'Open': {0: 31250.0,
1: 31376.0,
2: 31405.0,
3: 31389.4,
4: 31377.5,
5: 31347.8,
6: 31310.8,
7: 31343.4,
8: 31349.5,
9: 31349.9,
10: 31325.1,
11: 31310.9,
12: 31329.0,
13: 31376.0,
14: 31375.5,
15: 31357.4,
16: 31325.0,
17: 31341.1,
18: 31300.0,
19: 31324.5,
20: 31353.3,
21: 31350.0,
22: 31346.9,
23: 31330.0,
24: 31314.3,
25: 31450.2},
'High': {0: 31407.0,
1: 31425.0,
2: 31411.95,
3: 31389.45,
4: 31382.0,
5: 31350.0,
6: 31354.6,
7: 31359.0,
8: 31370.0,
9: 31364.7,
10: 31350.0,
11: 31337.9,
12: 31378.9,
13: 31419.5,
14: 31377.75,
15: 31360.0,
16: 31367.15,
17: 31345.2,
18: 31340.0,
19: 31367.0,
20: 31375.0,
21: 31370.0,
22: 31350.0,
23: 31334.6,
24: 31329.6,
25: 31599.0},
'Low': {0: 31250.0,
1: 31367.95,
2: 31352.5,
3: 31331.65,
4: 31301.4,
5: 31303.05,
6: 31310.0,
7: 31325.05,
8: 31335.35,
9: 31315.35,
10: 31281.9,
11: 31292.0,
12: 31316.25,
13: 31352.05,
14: 31335.0,
15: 31322.0,
16: 31318.25,
17: 31261.55,
18: 31283.3,
19: 31324.5,
20: 31322.0,
21: 31332.15,
22: 31324.1,
23: 31300.15,
24: 31280.0,
25: 31430.0},
'Close': {0: 31375.0,
1: 31398.3,
2: 31386.0,
3: 31377.0,
4: 31342.3,
5: 31311.7,
6: 31345.0,
7: 31349.0,
8: 31344.2,
9: 31327.6,
10: 31311.3,
11: 31325.6,
12: 31373.0,
13: 31375.0,
14: 31357.4,
15: 31326.0,
16: 31345.9,
17: 31300.6,
18: 31324.4,
19: 31353.8,
20: 31345.6,
21: 31341.6,
22: 31332.5,
23: 31311.0,
24: 31285.0,
25: 31558.4},
'Volume': {0: 259952,
1: 163775,
2: 105900,
3: 99725,
4: 115175,
5: 78625,
6: 67675,
7: 46575,
8: 53350,
9: 54175,
10: 96975,
11: 80925,
12: 79475,
13: 147775,
14: 38900,
15: 64925,
16: 52425,
17: 142175,
18: 81800,
19: 74950,
20: 68550,
21: 40350,
22: 47150,
23: 119200,
24: 222875,
25: 524625}}
Change:
df[['st_up','st_trendup'.shift()]].max(axis=1)
to:
df[['st_up','st_trendup']].assign(st_trendup = df['st_trendup'].shift()).max(axis=1)
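Putting it together, a hedged sketch of the full line (this still assumes df['st_trendup'] already exists from a previous pass, per the recursive definition in the question). Since there is only a single condition, np.where is a closer fit than np.select, which expects lists of conditions and choices:
import numpy as np

df['st_trendup'] = np.where(
    df['Close'].shift() > df['st_trendup'].shift(),  # condition
    df[['st_up', 'st_trendup']]
      .assign(st_trendup=df['st_trendup'].shift())   # shift inside assign
      .max(axis=1),                                  # row-wise max
    df['st_up'])                                     # fallback value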

how to plot the loss from the log file

Below are the loss values written to the file 'log' (the actual file contains more iterations than are listed here). How can I plot Iteration (x-axis) vs Loss (y-axis) from the contents of this 'log' file?
0: combined_hm_loss: 0.17613089
1: combined_hm_loss: 0.20243575
2: combined_hm_loss: 0.07203530
3: combined_hm_loss: 0.03444689
4: combined_hm_loss: 0.02623464
5: combined_hm_loss: 0.02061908
6: combined_hm_loss: 0.01562270
7: combined_hm_loss: 0.01253260
8: combined_hm_loss: 0.01102418
9: combined_hm_loss: 0.00958306
10: combined_hm_loss: 0.00824807
11: combined_hm_loss: 0.00694697
12: combined_hm_loss: 0.00640630
13: combined_hm_loss: 0.00593691
14: combined_hm_loss: 0.00521284
15: combined_hm_loss: 0.00445185
16: combined_hm_loss: 0.00408901
17: combined_hm_loss: 0.00377806
18: combined_hm_loss: 0.00314004
19: combined_hm_loss: 0.00287649
try this:
import pandas as pd
import numpy as np
import io
data = '''
index combined_hm_loss
0: 0.17613089
1: 0.20243575
2: 0.07203530
3: 0.03444689
4: 0.02623464
5: 0.02061908
6: 0.01562270
7: 0.01253260
8: 0.01102418
9: 0.00958306
10: 0.00824807
11: 0.00694697
12: 0.00640630
13: 0.00593691
14: 0.00521284
15: 0.00445185
16: 0.00408901
17: 0.00377806
18: 0.00314004
19: 0.00287649
'''
df = pd.read_csv(io.StringIO(data), delim_whitespace=True)
ax = df.plot.area(y='combined_hm_loss')
ax.invert_yaxis()
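Alternatively, here is a sketch that parses the log lines ("<iteration>: combined_hm_loss: <value>") straight from the file, assuming it is named 'log' in the working directory:
import re
import matplotlib.pyplot as plt

iterations, losses = [], []
with open('log') as f:
    for line in f:
        # matches e.g. "0: combined_hm_loss: 0.17613089"
        m = re.match(r'(\d+):\s*combined_hm_loss:\s*([\d.]+)', line)
        if m:
            iterations.append(int(m.group(1)))
            losses.append(float(m.group(2)))

plt.plot(iterations, losses)
plt.xlabel('Iteration')
plt.ylabel('combined_hm_loss')
plt.show()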

How to create multiple line graph using seaborn and find rate?

I need help creating a multiple-line graph from the DataFrame below:
num user_id first_result second_result result date point1 point2 point3 point4
0 0 1480R clear clear pass 9/19/2016 clear consider clear consider
1 1 419M consider consider fail 5/18/2016 consider consider clear clear
2 2 416N consider consider fail 11/15/2016 consider consider consider consider
3 3 1913I consider consider fail 11/25/2016 consider consider consider clear
4 4 1938T clear clear pass 8/1/2016 clear consider clear clear
5 5 1530C clear clear pass 6/22/2016 clear clear consider clear
6 6 1075L consider consider fail 9/13/2016 consider consider clear consider
7 7 1466N consider clear fail 6/21/2016 consider clear clear consider
8 8 662V consider consider fail 11/1/2016 consider consider clear consider
9 9 1187Y consider consider fail 9/13/2016 consider consider clear clear
10 10 138T consider consider fail 9/19/2016 consider clear consider consider
11 11 1461Z consider clear fail 7/18/2016 consider consider clear consider
12 12 807N consider clear fail 8/16/2016 consider consider clear clear
13 13 416Y consider consider fail 10/2/2016 consider clear clear clear
14 14 638A consider clear fail 6/21/2016 consider clear consider clear
Data file link: data.xlsx, or data as a dict:
data = {'num': {0: 0,
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
10: 10,
11: 11,
12: 12,
13: 13,
14: 14},
'user_id': {0: '1480R',
1: '419M',
2: '416N',
3: '1913I',
4: '1938T',
5: '1530C',
6: '1075L',
7: '1466N',
8: '662V',
9: '1187Y',
10: '138T',
11: '1461Z',
12: '807N',
13: '416Y',
14: '638A'},
'first_result': {0: 'clear',
1: 'consider',
2: 'consider',
3: 'consider',
4: 'clear',
5: 'clear',
6: 'consider',
7: 'consider',
8: 'consider',
9: 'consider',
10: 'consider',
11: 'consider',
12: 'consider',
13: 'consider',
14: 'consider'},
'second_result': {0: 'clear',
1: 'consider',
2: 'consider',
3: 'consider',
4: 'clear',
5: 'clear',
6: 'consider',
7: 'clear',
8: 'consider',
9: 'consider',
10: 'consider',
11: 'clear',
12: 'clear',
13: 'consider',
14: 'clear'},
'result': {0: 'pass',
1: 'fail',
2: 'fail',
3: 'fail',
4: 'pass',
5: 'pass',
6: 'fail',
7: 'fail',
8: 'fail',
9: 'fail',
10: 'fail',
11: 'fail',
12: 'fail',
13: 'fail',
14: 'fail'},
'date': {0: '9/19/2016',
1: '5/18/2016',
2: '11/15/2016',
3: '11/25/2016',
4: '8/1/2016',
5: '6/22/2016',
6: '9/13/2016',
7: '6/21/2016',
8: '11/1/2016',
9: '9/13/2016',
10: '9/19/2016',
11: '7/18/2016',
12: '8/16/2016',
13: '10/2/2016',
14: '6/21/2016'},
'point1': {0: 'clear',
1: 'consider',
2: 'consider',
3: 'consider',
4: 'clear',
5: 'clear',
6: 'consider',
7: 'consider',
8: 'consider',
9: 'consider',
10: 'consider',
11: 'consider',
12: 'consider',
13: 'consider',
14: 'consider'},
'point2': {0: 'consider',
1: 'consider',
2: 'consider',
3: 'consider',
4: 'consider',
5: 'clear',
6: 'consider',
7: 'clear',
8: 'consider',
9: 'consider',
10: 'clear',
11: 'consider',
12: 'consider',
13: 'clear',
14: 'clear'},
'point3': {0: 'clear',
1: 'clear',
2: 'consider',
3: 'consider',
4: 'clear',
5: 'consider',
6: 'clear',
7: 'clear',
8: 'clear',
9: 'clear',
10: 'consider',
11: 'clear',
12: 'clear',
13: 'clear',
14: 'consider'},
'point4': {0: 'consider',
1: 'clear',
2: 'consider',
3: 'clear',
4: 'clear',
5: 'clear',
6: 'consider',
7: 'consider',
8: 'consider',
9: 'clear',
10: 'consider',
11: 'consider',
12: 'clear',
13: 'clear',
14: 'clear'}
}
I need to create a bar graph and a line graph. I have created the bar graph using point1, where x = consider, clear and y = the count of consider and clear,
but I have no idea how to create a line graph for this scenario:
x = date
y = pass rate (%)
Pass rate is the number of clear / (consider + clear).
Graph the rate for first_result, second_result and result, all on the same graph,
and the graph should look like the example below.
Please comment or answer how I can do it. Even just an idea of how to group the dates and compute the ratio would be great.
Here's my idea of how to do it:
# first convert all `clear`, `consider` to 1,0
tmp_df = df[['first_result', 'second_result']].apply(lambda x: x.eq('clear').astype(int))
# convert `pass`, `fail` to 1,0
tmp_df['result'] = df.result.eq('pass').astype(int)
# copy the date
tmp_df['date'] = df['date']
# groupby and compute mean, i.e. number_pass/total_count
tmp_df = tmp_df.groupby('date').mean()
tmp_df.plot()
Output for this dataset
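One caveat (a hedged tweak, not in the original answer): date holds strings like '9/19/2016', so groupby sorts them lexicographically. Parsing the dates first keeps the x-axis in chronological order:
# replace the groupby step above with:
tmp_df['date'] = pd.to_datetime(df['date'])  # parses m/d/Y strings as dates
tmp_df = tmp_df.groupby('date').mean()
tmp_df.plot()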

Creating a Dropdown menu in Plotly from Pandas

I've had a look at the following link, but it's not very clear: https://plot.ly/pandas/dropdowns/.
I have the following figure generated in plotly, but I would like a dropdown menu (of A, B and C) to select and display only the respective line.
import pandas as pd
import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as py
from plotly.graph_objs import *
df = pd.DataFrame({'freq': {0: 0.01, 1: 0.02, 2: 0.029999999999999999, 3: 0.040000000000000001, 4: 0.050000000000000003, 5: 0.059999999999999998, 6: 0.070000000000000007, 7: 0.080000000000000002, 8: 0.089999999999999997, 9: 0.10000000000000001, 10: 0.01, 11: 0.02, 12: 0.029999999999999999, 13: 0.040000000000000001, 14: 0.050000000000000003, 15: 0.059999999999999998, 16: 0.070000000000000007, 17: 0.080000000000000002, 18: 0.089999999999999997, 19: 0.10000000000000001, 20: 0.01, 21: 0.02, 22: 0.029999999999999999, 23: 0.040000000000000001, 24: 0.050000000000000003, 25: 0.059999999999999998, 26: 0.070000000000000007, 27: 0.080000000000000002, 28: 0.089999999999999997, 29: 0.10000000000000001}, 'kit': {0: 'B', 1: 'B', 2: 'B', 3: 'B', 4: 'B', 5: 'B', 6: 'B', 7: 'B', 8: 'B', 9: 'B', 10: 'A', 11: 'A', 12: 'A', 13: 'A', 14: 'A', 15: 'A', 16: 'A', 17: 'A', 18: 'A', 19: 'A', 20: 'C', 21: 'C', 22: 'C', 23: 'C', 24: 'C', 25: 'C', 26: 'C', 27: 'C', 28: 'C', 29: 'C'}, 'SNS': {0: 91.198979591799997, 1: 90.263605442199989, 2: 88.818027210899999, 3: 85.671768707499993, 4: 76.23299319729999, 5: 61.0969387755, 6: 45.1530612245, 7: 36.267006802700003, 8: 33.0782312925, 9: 30.739795918400002, 10: 90.646258503400006, 11: 90.306122449, 12: 90.178571428600009, 13: 89.498299319699996, 14: 88.435374149599994, 15: 83.588435374200003, 16: 75.212585034, 17: 60.969387755100001, 18: 47.278911564600001, 19: 37.627551020399999, 20: 90.986394557800011, 21: 90.136054421799997, 22: 89.540816326499993, 23: 88.690476190499993, 24: 86.479591836799997, 25: 82.397959183699996, 26: 73.809523809499993, 27: 63.180272108800004, 28: 50.935374149700003, 29: 41.241496598699996}, 'FPR': {0: 1.0953616823100001, 1: 0.24489252678500001, 2: 0.15106142277199999, 3: 0.104478605177, 4: 0.089172822253300005, 5: 0.079856258734300009, 6: 0.065881413455800009, 7: 0.059892194050699996, 8: 0.059892194050699996, 9: 0.0578957875824, 10: 0.94097291541899997, 11: 0.208291741532, 12: 0.14773407865800001, 13: 0.107805949291, 14: 0.093165635189999998, 15: 0.082518134025399995, 16: 0.074532508152000007, 17: 0.065881413455800009, 18: 0.062554069341799995, 19: 0.061888600519100001, 20: 0.85313103081100006, 21: 0.18899314567100001, 22: 0.14107939043000001, 23: 0.110467824582, 24: 0.099820323417899995, 25: 0.085180009316599997, 26: 0.078525321088700001, 27: 0.073201570506399985, 28: 0.071870632860800004, 29: 0.0705396952153}})
fig = {
    'data': [
        {
            'x': df[df['kit'] == kit]['FPR'],
            'y': df[df['kit'] == kit]['SNS'],
            'name': kit,
        } for kit in ['A', 'B', 'C']
    ],
}
py.iplot(fig)
I'm not sure how to do this directly from plotly; however, you can use the interact function from the ipywidgets library.
In your case it will be the following:
from ipywidgets import interact
df = pd.DataFrame({'freq': {0: 0.01, 1: 0.02, 2: 0.029999999999999999, 3: 0.040000000000000001, 4: 0.050000000000000003, 5: 0.059999999999999998, 6: 0.070000000000000007, 7: 0.080000000000000002, 8: 0.089999999999999997, 9: 0.10000000000000001, 10: 0.01, 11: 0.02, 12: 0.029999999999999999, 13: 0.040000000000000001, 14: 0.050000000000000003, 15: 0.059999999999999998, 16: 0.070000000000000007, 17: 0.080000000000000002, 18: 0.089999999999999997, 19: 0.10000000000000001, 20: 0.01, 21: 0.02, 22: 0.029999999999999999, 23: 0.040000000000000001, 24: 0.050000000000000003, 25: 0.059999999999999998, 26: 0.070000000000000007, 27: 0.080000000000000002, 28: 0.089999999999999997, 29: 0.10000000000000001}, 'kit': {0: 'B', 1: 'B', 2: 'B', 3: 'B', 4: 'B', 5: 'B', 6: 'B', 7: 'B', 8: 'B', 9: 'B', 10: 'A', 11: 'A', 12: 'A', 13: 'A', 14: 'A', 15: 'A', 16: 'A', 17: 'A', 18: 'A', 19: 'A', 20: 'C', 21: 'C', 22: 'C', 23: 'C', 24: 'C', 25: 'C', 26: 'C', 27: 'C', 28: 'C', 29: 'C'}, 'SNS': {0: 91.198979591799997, 1: 90.263605442199989, 2: 88.818027210899999, 3: 85.671768707499993, 4: 76.23299319729999, 5: 61.0969387755, 6: 45.1530612245, 7: 36.267006802700003, 8: 33.0782312925, 9: 30.739795918400002, 10: 90.646258503400006, 11: 90.306122449, 12: 90.178571428600009, 13: 89.498299319699996, 14: 88.435374149599994, 15: 83.588435374200003, 16: 75.212585034, 17: 60.969387755100001, 18: 47.278911564600001, 19: 37.627551020399999, 20: 90.986394557800011, 21: 90.136054421799997, 22: 89.540816326499993, 23: 88.690476190499993, 24: 86.479591836799997, 25: 82.397959183699996, 26: 73.809523809499993, 27: 63.180272108800004, 28: 50.935374149700003, 29: 41.241496598699996}, 'FPR': {0: 1.0953616823100001, 1: 0.24489252678500001, 2: 0.15106142277199999, 3: 0.104478605177, 4: 0.089172822253300005, 5: 0.079856258734300009, 6: 0.065881413455800009, 7: 0.059892194050699996, 8: 0.059892194050699996, 9: 0.0578957875824, 10: 0.94097291541899997, 11: 0.208291741532, 12: 0.14773407865800001, 13: 0.107805949291, 14: 0.093165635189999998, 15: 0.082518134025399995, 16: 0.074532508152000007, 17: 0.065881413455800009, 18: 0.062554069341799995, 19: 0.061888600519100001, 20: 0.85313103081100006, 21: 0.18899314567100001, 22: 0.14107939043000001, 23: 0.110467824582, 24: 0.099820323417899995, 25: 0.085180009316599997, 26: 0.078525321088700001, 27: 0.073201570506399985, 28: 0.071870632860800004, 29: 0.0705396952153}})
def plot_it(kit):
    fig = {
        'data': [
            {
                'x': df[df['kit'] == kit]['FPR'],
                'y': df[df['kit'] == kit]['SNS'],
                'name': kit
            }
        ]
    }
    py.iplot(fig)
interact(plot_it, kit=('A', 'B', 'C'))
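For completeness, newer plotly versions do support native dropdowns via updatemenus in the layout. A hedged sketch (assuming the same df as above and plotly.graph_objects) that toggles trace visibility per kit:
import plotly.graph_objects as go

kits = ['A', 'B', 'C']
fig = go.Figure()
for kit in kits:
    sub = df[df['kit'] == kit]
    # only the first kit is visible initially; the dropdown toggles the rest
    fig.add_trace(go.Scatter(x=sub['FPR'], y=sub['SNS'],
                             name=kit, visible=(kit == kits[0])))

buttons = [dict(label=kit,
                method='update',
                args=[{'visible': [k == kit for k in kits]}])
           for kit in kits]
fig.update_layout(updatemenus=[dict(buttons=buttons)])
fig.show()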