Create a drop down menu in plotly - pandas

I have the following dataframe:
# Create DataFrame
df = pd.DataFrame({"Col_A_date":[2018-09-04,2018-09-05,2018-09-04,2018-09-05],
"Col_B_hour":[7,7,8,8],
"Col_C":[1,1,2,2],
"Col_value":[1.9,2.2,2.6,3.8]
})
I want to create a graph where col_A is shown as drop drown menus (2018-09-04 and 2018-09-05), Col_B is x-axis, Col_value is y-axis and Col_C as traces. So I can see the data for different dates in the same graph. Is it possible to do using plotly?

Yep, it is possible. Updated with your explanations.
If I understand correctly what you need, this code does what you want:
# import libraries
import pandas as pd
import plotly
import plotly.graph_objs as go

# Create the sample DataFrame: a date, an hour, a trace id (Col_C) and a value.
df = pd.DataFrame({"Col_A_date": ["2018-09-04", "2018-09-05", "2018-09-04", "2018-09-05"],
                   "Col_B_hour": [7, 7, 8, 8],
                   "Col_C": [1, 1, 2, 2],
                   "Col_value": [1.9, 2.2, 2.6, 3.8]})

# One sub-frame per (date, Col_C) combination -- each becomes one bar trace.
df1 = df.loc[(df["Col_A_date"] == "2018-09-04") & (df["Col_C"] == 1)]
df2 = df.loc[(df["Col_A_date"] == "2018-09-04") & (df["Col_C"] == 2)]
df3 = df.loc[(df["Col_A_date"] == "2018-09-05") & (df["Col_C"] == 1)]
df4 = df.loc[(df["Col_A_date"] == "2018-09-05") & (df["Col_C"] == 2)]
print(df1, df2, df3, df4)

def _bar_trace(sub_df, name, color):
    """Build one bar trace: hours on x, values on y, value rendered as bar text."""
    return go.Bar(x=list(sub_df["Col_B_hour"]),
                  y=list(sub_df["Col_value"]),
                  name=name,
                  text=list(sub_df["Col_value"]),
                  textposition="auto",
                  hoverinfo="name",
                  marker=dict(color=color))

# Pull traces to data. Order matters: the dropdown visibility masks below
# refer to traces by position in this list.
data = [_bar_trace(df1, "1", "rgb(0,102,204)"),
        _bar_trace(df2, "2", "rgb(255,128,0)"),
        _bar_trace(df3, "3", "rgb(255,178,102)"),
        _bar_trace(df4, "4", "rgb(255,255,153)")]

# Specify dropdown parameters. Bug fix: the dates are in September
# (2018-09-04 / 2018-09-05), so the labels read "Sep", not "Aug".
_choices = [("4 Sep 1", [True, False, False, False]),
            ("4 Sep 2", [False, True, False, False]),
            ("5 Sep 1", [False, False, True, False]),
            ("5 Sep 2", [False, False, False, True]),
            ("All", [True, True, True, True]),
            ("Reset", [False, False, False, False])]
updatemenus = [dict(active=-1,
                    buttons=[dict(label=label,
                                  method="update",
                                  args=[{"visible": visible}, {"title": label}])
                             for label, visible in _choices])]

# Set layout: axis titles plus the dropdown menu defined above.
layout = dict(title="Dropdown",
              showlegend=False,
              xaxis=dict(title="Hours"),
              yaxis=dict(title="Number"),
              updatemenus=updatemenus)

# Create fig and write the interactive plot to a standalone HTML file.
fig = dict(data=data, layout=layout)
plotly.offline.plot(fig, filename="update_dropdown.html")
Here is how the "All" choice looks:
And first trace:
Here some useful links from docs: about bar charts; hover text; dropdown menu. Do not be afraid to look at the plotly documentation - there are excellent examples of how to use this package correctly.

Related

Groupby value_counts giving keyerror

I am trying to plot countries whose scale has changes over time.
this is the dataset i am using :'https://www.kaggle.com/datasets/whenamancodes/the-global-hunger-index'
# Load the Global Hunger Index "wasting" dataset (one row per country/year).
wasting = pd.read_csv('/kaggle/input/the-global-hunger-index/share-of-children-with-a-weight-too-low-for-their-height-wasting.csv')
# rename the long measure column to a short name
wasting.rename(columns={'Prevalence of wasting, weight for height (% of children under 5)':'Wasting'},inplace=True)
# create new categorical scale column with pd.cut (right=False -> left-closed bins)
bins = [0,9.9,19.99,34.99,49.99,np.inf]
labels = ['Low','Moderate','Serious','Alarming','Extremely Alarming']
wasting['W_Scale'] = pd.cut(wasting['Wasting'],bins=bins,labels=labels,right=False).astype('category')
wasting.head()
wasting.isna().sum()
# selecting countries whose scale changed over time (more than one distinct W_Scale)
wasting_entity_scale = wasting.groupby('Entity').filter(lambda x: x['W_Scale'].nunique()>1)
# NOTE(review): with reset_index(name='count') the scale column keeps the name
# 'W_Scale' -- there is no 'level_2' column in current pandas. 'level_2' was the
# auto-generated name older pandas gave this column; presumably color should be
# "W_Scale" here, and the KeyError: 'Serious' (a W_Scale category) stems from
# this mismatch -- TODO confirm against the installed pandas/plotly versions.
wasting_entity_scale = wasting_entity_scale.groupby(['Year','Entity'])['W_Scale'].value_counts().reset_index(name='count')
wasting_entity_scale = wasting_entity_scale[wasting_entity_scale['count']>0]
wasting_entity_scale = wasting_entity_scale.reset_index(drop=True)
#until this point everything is fine.
traces = {}
# One px.histogram per country; meta tags every trace with its country name.
for i, (loc, d) in enumerate(wasting_entity_scale.groupby("Entity")):
# use meta so that we know which country a trace belongs to
fig = px.histogram(
d, x="Year", y="Entity", color="level_2"
).update_traces(meta=loc, visible=(i == 0))
traces[loc] = fig.data
l = fig.layout
# integrate all the traces into a single figure, reusing the last layout
fig = go.Figure([t for a in traces.values() for t in a]).update_layout(l)
# now build menu using meta to know which traces should be visible per country
fig.update_layout(
updatemenus=[
{
"active": 0,
"buttons": [
{
"label": c,
"method": "update",
"args": [
{"visible": [t.meta == c for t in fig.data]},
{"title": c},
],
}
for c in traces.keys()
],
}
]
)
when i try to plot it, it shows this error:
KeyError: 'Serious'
Can someone please explain what I am doing wrong?
Thank you.

youtube_dl video descriptions

I have a df containing a set of videoIDs from YT:
import pandas as pd
# Toy frame: one YouTube video ID per row ('Order' values are strings).
data = {'Order': ['1', '2', '3'],
'VideoID': ['jxwHmAoKte4', 'LsXM502SpiU','1I3f27iQ4pM']
}
df = pd.DataFrame (data, columns = ['Order','VideoID'])
print (df)
and want to download the video descriptions and save them in the same df in an extra column.
I tried to use youtube_dl in Jupyter this way:
import youtube_dl
# NOTE(review): this is the problematic version the question is about.
def all_descriptions(URL):
# NOTE(review): the frame above uses column 'VideoID', not 'VideoId' --
# this lookup presumably raises KeyError; verify against the real df.
videoID=df['VideoId']
# Rebinds the URL parameter to a whole Series of watch URLs.
URL = 'https://www.youtube.com/watch?v=' + videoID
ydl_opts = {
'forcedescription':True,
'skip_download': True,
'youtube-skip-dash-manifest': True,
'no_warnings': True,
'ignoreerrors': True
}
try:
# forcedescription only PRINTS the description; download() returns no text.
youtube_dl.YoutubeDL(ydl_opts).download(URL)
# `webpage` is never defined, so this line raises NameError ...
return webpage
except:
# ... which the bare except swallows; the function therefore returns None,
# explaining the all-None 'descriptions' column below.
pass
df['descriptions']=all_descriptions(URL)
I see the output of the code as text, but in df only "None" as text of the column.
Obviously I can't transport the output of the function to df in the proper way.
Can you suggest how to get it right?
Thank you in advance for help.
#perl
I modify the df to include two URLs that are causing two types of error:
import pandas as pd
# Extended sample: two extra IDs chosen to trigger the failure modes
# discussed below ('MGQOX2rK5s' is presumably an invalid ID -- verify).
data = {'Order': ['1', '2', '3', '4', '5'],
'VideoId': ['jxwHmAoKte4', 'LsXM502SpiU','1I3f27iQ4pM', 'MGQOX2rK5s', 'wNayw_E7lIA']
}
df = pd.DataFrame (data, columns = ['Order','VideoId'])
print (df)
Then I test it in the way you suggested, including my definition of ydl_opts:
# Build one watch URL per video ID (URL is a Series of strings).
videoID=df['VideoId']
URL = 'https://www.youtube.com/watch?v=' + videoID
ydl_opts = {
'forcedescription':True,
'skip_download': True,
'youtube-skip-dash-manifest': True,
'no_warnings': True,
'ignoreerrors': True
}
# With ignoreerrors=True, extract_info returns None for a failed video,
# and None['description'] raises the TypeError reported just below.
df['description'] = [
youtube_dl.YoutubeDL(ydl_opts).extract_info(
u, download=False)['description'] for u in URL]
df
Reaching to the first error I get the output:
TypeError: 'NoneType' object is not subscriptable
After that I replace 'forcedescription' in my code with 'extract_info':
# NOTE(review): despite the text above saying forcedescription was replaced
# with extract_info, this copy still calls download() with forcedescription.
def all_descriptions(URL):
videoID=df['VideoId']
URL = 'https://www.youtube.com/watch?v=' + videoID
ydl_opts = {
'forcedescription':True,
'skip_download': True,
'youtube-skip-dash-manifest': True,
'no_warnings': True,
'ignoreerrors': True
}
try:
youtube_dl.YoutubeDL(ydl_opts).download(URL)
# `webpage` is undefined -> NameError, silently swallowed below,
# so the function returns None and the column stays empty.
return webpage
except:
pass
It skips all errors, but as the result there is nothing in the 'description'-column.
Any suggestions?
You can use extract_info method:
# Fetch the metadata for each URL (no download) and keep its description.
df['description'] = [
youtube_dl.YoutubeDL().extract_info(
u, download=False)['description'] for u in URL]
df
Output:
Order VideoID description
0 1 jxwHmAoKte4 Bundesweit gelten sie nun ab heute, die schärf...
1 2 LsXM502SpiU Wie sicher ist der Impfstoff? Wäre eine Impfpf...
2 3 1I3f27iQ4pM Impfen ja oder nein, diese Frage stellen sich ...
P.S. The forcedescription parameter only prints the description to standard output, it doesn't return it
Update: extract_info returns None if it fails, so in case we have videos that may fail before getting the description from the info we can check that the info is not None:
# Reuse a single YoutubeDL instance; extract the metadata once per URL.
ydl = youtube_dl.YoutubeDL(ydl_opts)
infos = [ydl.extract_info(u, download=False) for u in URL]
# Guard against None (failed extraction) so one bad ID doesn't break the column.
df['description'] = [
info['description'] if info is not None else ''
for info in infos]

Fill forward a DataFrame with matching values

I have a DataFrame of booleans. I would like to replace the 2 False values that are directly positioned after a True value. I thought the .replace() method would do it since the 5th example seems to be what I am looking for.
Here is what I do:
dataIn = pd.DataFrame([False, False, False, True, False, False, False, False])
dataOut = dataIn.replace(to_replace=False, method='ffill', limit=2)
>>> TypeError: No matching signature found
Here is the output I am looking for:
dataOut = pd.DataFrame([False, False, False, True, True, True, False, False])
# Work on a Series; for a DataFrame column use e.g. s = df['bool_col'].
s = pd.Series([False, True, False, True, False, False, False, False])

# A slot must flip to True when it is currently False and sits one or two
# positions after a True value (with only False values in between).
one_after = (s.shift(1) == True) & (s.shift(-1) == False)
two_after = (s.shift(2) == True) & (s.shift(1) == False)
mask = (s == False) & (one_after | two_after)

# numpy.where builds the corrected array without mutating s.
np.where(mask, True, s)
# array([False, True, False, True, True, True, False, False])
# to store it back: df['new_col'] = np.where(mask, True, s)
Define a function which conditionally replaces 2 first elements with True:
def condRepl(grp):
    """Return a copy of grp whose first two elements are forced to True
    when the group is entirely False and holds at least two elements;
    any other group comes back unchanged."""
    out = grp.copy()
    entirely_false = grp.eq(False).all()
    if entirely_false and grp.size >= 2:
        out.iloc[0:2] = [True, True]
    return out
The condition triggering this replace is:
group has 2 elements or more,
the group is composed solely of False values.
Then, using this function, transform each group of "new" values
(each change in the value starts a new group):
# Group consecutive runs of equal values (each value change starts a new
# group id) and apply condRepl to every run.  Fixed: the original grouped
# by an undefined name `s`; the run-length key must come from dataIn[0].
# NOTE(review): a leading all-False run (not preceded by True) is flipped
# too -- confirm that matches the desired output.
dataIn[0] = dataIn[0].groupby(dataIn[0].ne(dataIn[0].shift()).cumsum()).transform(condRepl)
Thanks for both answers above. But actually, it seems the .replace() can be used, but it does not entirely handle booleans.
By replacing them temporarily by int, it is possible to use it:
dataIn = pd.DataFrame([False, False, False, True, False, False, False, False])
# Cast to int first: .replace() rejects method='ffill' on boolean frames
# (False == 0 after the cast), then cast the filled result back to bool.
# NOTE(review): the `method` argument of replace() is deprecated in
# pandas 2.x -- confirm against the installed version before relying on it.
dataOut = dataIn.astype(int).replace(to_replace=False, method='ffill', limit=2).astype(bool)

All() is printing every time else statement

Basically pandas object is applying to entire data frame not individually
that is why it is going to else condition. we need to apply on each rows
I got proper output while applying on one row frame. While applying entire data frame I got the error No keys on each rows, Basically some rows of res have None only those rows are expected to be No keys
sample dataframe
res,url1,url2
{'bool': True, 'val':False},{'bool': False, 'val':False},{'bool': True, 'val':False}
None,{'bool': True, 'val':False},{'bool': False, 'val':False}
{'bool': False, 'val':False},},{'bool': True, 'val':False},{'bool': True, 'val':False}
Code
# NOTE(review): problematic version from the question. test_func reads the
# module-level df and reduces each whole column with .all(), so it computes
# ONE answer for the entire frame instead of one per row -- that is why every
# row shows "No Keys". It needs to be applied row-wise (axis=1).
def func1():
return ('url1')
def func2():
return ('url2')
def test_func():
# .str['bool'] pulls the 'bool' entry out of each dict; None rows become NaN.
if df['res'].str['bool'].all() and df['url1'].str['bool'].all():
return func1()
elif df['res'].str['bool'].all() and df['url2'].str['bool'].all():
return func2()
else:
return ("No Keys")
Expected Out
output
url1
No Keys
url2
MY out
No keys
No Keys
No Kyes
I need to apply on the below code more than 5000 urls
df['output'] = df.apply(test_func)
While applying I got the error No keys on each rows
if i do any its passing False because first row of the url1 bools is False
What is the issue is if all() its checking all the rows since None is present in the second rows its printing No Keys
Recreating DataFrame
res url1 \
0 {'bool': True, 'val': False} {'bool': False, 'val': False}
1 None {'bool': True, 'val': False}
2 {'bool': False, 'val': False} {'bool': True, 'val': False}
url2
0 {'bool': True, 'val': False}
1 {'bool': False, 'val': False}
2 {'bool': True, 'val': False}
use pd.apply
# Row-wise version: the trailing 1 is axis=1, so each x is one row.
# .get('bool') tolerates missing keys; the explicit None check handles
# rows where 'res' is None.  (Style nit: prefer `is not None` over `!= None`.)
df.apply(lambda x: 'url1' if (x['res'] != None and x['res'].get('bool') and x['url1'].get('bool'))\
else 'url2' if (x['res'] != None and x['res'].get('bool') and x['url2'].get('bool'))
else 'No Keys',1)
Output
0 url2
1 No Keys
2 No Keys
dtype: object
Note - for third row, res bool value is False, so doing and will give false and hence No Keys
You can also use a nested np.where:
import pandas as pd
import numpy as np
#Recreate dataframe
df = pd.DataFrame(data = {
'res': [{'bool': True, 'val':False}, None, {'bool': False, 'val':False}],
'url1':[{'bool': False, 'val':False}, {'bool': True, 'val':False}, {'bool': True, 'val':False}],
'url2':[{'bool': True, 'val':False},{'bool': False, 'val':False},{'bool': True, 'val':False}]})
# Define logic
df['Output'] = np.where(df['res'].str['bool'] & df['url1'].str['bool'], 'url1',
np.where(df['res'].str['bool'] & df['url2'].str['bool'], 'url2',
'No Keys'))
# Check Result
df
res ... Output
0 {'bool': True, 'val': False} ... url2
1 None ... No Keys
2 {'bool': False, 'val': False} ... No Keys

Exporting Tokenized SpaCy result into Excel or SQL tables

I'm using SpaCy with Pandas to get a sentence tokenised with Part of Speech (POS)export to excel. The code is as follow:
import spacy
import xlsxwriter
import pandas as pd
# Load the small English pipeline and tokenise one sentence.
nlp = spacy.load('en_core_web_sm')
text ="""He is a good boy."""
doc = nlp(text)
# One row per token: text, lemma, coarse POS, fine tag, dependency label,
# orthographic shape, and the is_alpha / is_stop flags (printed, not kept).
for token in doc:
x=[token.text, token.lemma_, token.pos_, token.tag_,token.dep_,token.shape_, token.is_alpha, token.is_stop]
print(x)
When I print(x) I get the following:
['He', '-PRON-', 'PRON', 'PRP', 'nsubj', 'Xx', True, False]
['is', 'be', 'VERB', 'VBZ', 'ROOT', 'xx', True, True]
['a', 'a', 'DET', 'DT', 'det', 'x', True, True]
['good', 'good', 'ADJ', 'JJ', 'amod', 'xxxx', True, False]
['boy', 'boy', 'NOUN', 'NN', 'attr', 'xxx', True, False]
['.', '.', 'PUNCT', '.', 'punct', '.', False, False]
To the token loop, I added the DataFrame as follow:
# NOTE(review): problematic version from the question. The loop over doc is
# duplicated (nested), x is rebound on every iteration so only the last
# token's row survives, and pd.Dataframe is a typo for pd.DataFrame
# (would raise AttributeError).
for token in doc:
for token in doc:
x=[token.text, token.lemma_, token.pos_, token.tag_,token.dep_,token.shape_, token.is_alpha, token.is_stop]
df=pd.Dataframe(x)
print(df)
Now, I start to get the following format:
0
0 He
1 -PRON-
2 PRON
3 PRP
4 nsubj
5 Xx
6 True
7 False
........
........
However, when I try exporting the output (df) to excel using Pandas as the following code, it only shows me the last iteration of x in the column
# x still holds only the LAST token's row at this point, which is why the
# sheet shows a single token -- collect all rows before building the frame.
df=pd.DataFrame(x)
writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')
# NOTE(review): the writer is never saved/closed here -- presumably done
# elsewhere (or via a context manager); verify the file actually flushes.
df.to_excel(writer,sheet_name='Sheet1')
Output (in Excel Sheet):
0
0 .
1 .
2 PUNCT
3 .
4 punct
5 .
6 False
7 False
How I can have all the iterations one after the other in the new column in this scenario as follow?
0 He is ….
1 -PRON- be ….
2 PRON VERB ….
3 PRP VBZ ….
4 nsubj ROOT ….
5 Xx xx ….
6 True True ….
7 False True ….
Some shorter code:
import spacy
import pandas as pd
nlp = spacy.load('en_core_web_sm')
text ="""He is a good boy."""
# Build the whole token table in one comprehension: one row per token.
param = [[token.text, token.lemma_, token.pos_,
token.tag_,token.dep_,token.shape_,
token.is_alpha, token.is_stop] for token in nlp(text)]
df=pd.DataFrame(param)
# Readable column names matching the spaCy token attributes above.
headers = ['text', 'lemma', 'pos', 'tag', 'dep',
'shape', 'is_alpha', 'is_stop']
df.columns = headers
In case you don't have your version yet:
import pandas as pd

# Token rows as produced by spaCy: text, lemma, POS, tag, dep, shape,
# is_alpha, is_stop.
rows =[
['He', '-PRON-', 'PRON', 'PRP', 'nsubj', 'Xx', True, False],
['is', 'be', 'VERB', 'VBZ', 'ROOT', 'xx', True, True],
['a', 'a', 'DET', 'DT', 'det', 'x', True, True],
['good', 'good', 'ADJ', 'JJ', 'amod', 'xxxx', True, False],
['boy', 'boy', 'NOUN', 'NN', 'attr', 'xxx', True, False],
['.', '.', 'PUNCT', '.', 'punct', '.', False, False],
]
headers = ['text', 'lemma', 'pos', 'tag', 'dep',
'shape', 'is_alpha', 'is_stop']

# example 1: list of dicts built with an explicit loop
# following https://stackoverflow.com/a/28058264/1758363
d = []
for row in rows:
    dict_ = {k: v for k, v in zip(headers, row)}
    d.append(dict_)
df = pd.DataFrame(d)[headers]

# example 2: growing a frame one row at a time.
# Fixed: DataFrame.append was deprecated in pandas 1.4 and REMOVED in 2.0;
# concatenating a one-row frame is the supported equivalent.
df2 = pd.DataFrame(columns=headers)
for row in rows:
    dict_ = {k: v for k, v in zip(headers, row)}
    df2 = pd.concat([df2, pd.DataFrame([dict_])], ignore_index=True)

# example 3: list of dicts created with the map() function
def as_dict(row):
    return {k: v for k, v in zip(headers, row)}
df3 = pd.DataFrame(list(map(as_dict, rows)))[headers]

def is_equal(df_a, df_b):
    """Substitute for pd.DataFrame.equals() (ignores dtype differences)."""
    return (df_a == df_b).all().all()

# All three construction styles must yield the same table.
assert is_equal(df, df2)
assert is_equal(df2, df3)