I'm trying to do something like below.
Group by the date column, for example year or year/month.
And visualize a stacked bar chart, based on the True/False value.
What's the best way to go about this in Altair?
# Sample data: day-first date strings paired with a boolean flag.
# Assumes `pd` (pandas) has already been imported; this snippet does not import it.
df = pd.DataFrame({
'date': ['20-03-2017', '20-03-2017', '20-03-2018', '21-03-2018', '20-10-2019', '20-03-2019', '1-02-2019', '10-03-2020', '20-06-2020'],
'value': [True, True, True, False, True, False, False, True, False]
})
import altair as alt
# Attempted stacked bar chart: row counts on y, bar segments coloured by `value`.
alt.Chart(df).mark_bar().encode(
# NOTE(review): 'groupbyyear(date)' is the asker's stand-in for "group by year";
# the answer that follows replaces it with a `year` time unit.
x = 'groupbyyear(date)',
y = 'count(value)',
color = 'value'
)
You're very close: instead of groupbyyear you can use a year time unit transform, and since your dates are non-standard format, you can first use pandas to_datetime to convert them to standard date values. The result looks like this:
import altair as alt
import pandas as pd
# Sample data: the dates are day-first strings (DD-MM-YYYY), some without zero padding.
df = pd.DataFrame({
    'date': ['20-03-2017', '20-03-2017', '20-03-2018', '21-03-2018', '20-10-2019', '20-03-2019', '1-02-2019', '10-03-2020', '20-06-2020'],
    'value': [True, True, True, False, True, False, False, True, False],
})
# Parse with an explicit day-first format. Without it, ambiguous strings such as
# '1-02-2019' or '10-03-2020' can be read month-first, which puts rows in the
# wrong month as soon as the chart is grouped by year/month.
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
# Stacked bars: one bar per year, split and coloured by the boolean `value`.
# Use 'yearmonth(date):O' on x to group by year/month instead.
alt.Chart(df).mark_bar().encode(
    x='year(date):O',
    y='count(value)',
    color='value',
)
Related
I'm looking for a way to check, row by row, whether the string in one pandas Series is contained in the list of strings held in the same row of another Series.
Preferably a one-liner - I'm aware that I can solve this by looping over the rows and building up a new series.
Example:
import pandas as pd
# Two example rows, built column by column: a candidate string and the list
# of strings it should be checked against.
df = pd.DataFrame(
    {
        'value': ['foo', 'bar'],
        'accepted_values': [['foo', 'bar'], ['foo']],
    }
)
Desired output would be
pd.Series([True, False])
because 'foo' is in ['foo', 'bar'], but 'bar' is not in ['foo']
What I've tried:
df['value'].isin(df['accepted_values']), but that gives me [False, False]
Thanks!
You can use apply with in:
# Row-wise membership test: True where the row's 'value' occurs in that row's
# 'accepted_values' list. Columns are read with explicit [] lookups because
# attribute access (r.value) resolves to a Series attribute or method instead
# whenever a column name collides with one (e.g. 'values', 'name', 'index').
df.apply(lambda r: r['value'] in r['accepted_values'], axis=1)
0 True
1 False
When using plotly I do get this picture with some straight lines on the graph. I do not have the same when using matplotlib.
import pandas as pd
import numpy as np
import cufflinks as cf
from plotly.offline import plot
from datetime import datetime
import io
import requests
# This first assignment to df is overwritten by the CSV load below.
df = cf.datagen.lines()
# Endpoint queried for the RTSI candle history in CSV form.
# NOTE(review): the string literal ends with a stray "'" just before the closing quote.
src = "https://iss.moex.com/iss/engines/stock/markets/index/securities/RTSI/candles.csv?iss.only=history&interval=31&iss.reverse=true&from=1995-09-01&till=2019-12-28&iss.json=extended&callback=JSON_CALLBACK&lang=en&limit=100&start=0&sort_order=TRADEDATE&sort_order_desc=desc&_=1563185736134'"
r = requests.get(src)
# Parse the ';'-separated response body with explicit column names, then skip the first two rows.
# NOTE(review): presumably those two rows are header/metadata lines — confirm against the raw response.
df = pd.read_csv(io.StringIO(r.content.decode('utf-8')),
sep=';',
names=[
'Open', 'Close', 'High', 'Low', 'Value',
'Volume', 'Date', 'End'
]).iloc[2:]
# Rebuild the frame with typed columns: Date as datetime64, the rest as float64.
# 'End' is not carried over.
frame = {
'Date': df['Date'].astype(np.datetime64),
'Open': df['Open'].astype('float64'),
'Close': df['Close'].astype('float64'),
'High': df['High'].astype('float64'),
'Low': df['Low'].astype('float64'),
'Value': df['Value'].astype('float64'),
'Volume': df['Volume'].astype('float64'),
}
df = pd.DataFrame(frame)
# One trace per column after 'Date', all plotted against Date.
# NOTE(review): rows are plotted in the order received, with no sort on Date;
# the answers that follow identify this as the cause of the stray straight lines.
plot([{
'x': df['Date'],
'y': df[col],
'name': col
} for col in df.columns[1:]])
df.iplot()
Is this a bug with plotly or am I doing something wrong?
The principal problem is that the date values don't appear in sorted order. It is necessary to sort them explicitly.
Another problem is that the numbers in the 'Value' column have a completely different range than the others. To display them in the same plot, one could add a secondary y-axis.
As the 'Volume' column isn't filled in (containing only zeros), it can be left out.
Here is some sample code, skipping conversion steps which probably are unnecessary for the latest plotly versions:
import plotly.graph_objects as go
import pandas as pd
# Placeholder: substitute the full CSV endpoint URL here.
src = "https://...."
# Column names are supplied explicitly; the first two rows of the file are skipped.
df = pd.read_csv(src,
                 sep=';',
                 names=[
                     'Open', 'Close', 'High', 'Low', 'Value',
                     'Volume', 'Date', 'End'
                 ]).iloc[2:]
# The rows do not arrive in ascending date order; unsorted x values make the
# line traces jump back across the chart, so sort before plotting.
df = df.sort_values(by='Date')
# Print each column's range; this is what shows 'Value' sits on a different scale.
for col in df.columns:
    print(col, df[col].min(), df[col].max())
fig = go.Figure()
# 'Volume' is deliberately left out of the plot.
for col in ['Open', 'Close', 'High', 'Low', 'Value']:
    fig.add_trace(
        go.Scatter(
            x=df['Date'],
            y=df[col],
            name=col,
            # 'Value' goes on the primary axis; the four price series share the secondary one.
            yaxis='y1' if col == 'Value' else 'y2'
        ))
# Axis titles must describe the traces assigned above: the primary axis carries
# 'Value', the secondary axis (right side, overlaid on the primary) carries the
# Open/Close/High/Low prices.
fig.update_layout(
    yaxis=dict(
        title="Value",
    ),
    yaxis2=dict(
        title="Price",
        anchor="x",
        overlaying="y",
        side="right"
    ))
fig.show()
I see several small problems, plus the data is not sorted, as stated by @JohanC. You should also really use two y-axes, as suggested by @JohanC. Here is my full code with comments:
import pandas as pd
import io
import requests
import plotly.graph_objects as go
# get Data
# (the stray "'" that trailed the last query parameter has been removed)
src = "https://iss.moex.com/iss/engines/stock/markets/index/securities/RTSI/candles.csv?iss.only=history&interval=31&iss.reverse=true&from=1995-09-01&till=2019-12-28&iss.json=extended&callback=JSON_CALLBACK&lang=en&limit=100&start=0&sort_order=TRADEDATE&sort_order_desc=desc&_=1563185736134"
r = requests.get(src)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
r.raise_for_status()
# Column names are supplied explicitly; the first two rows of the response are skipped.
df = pd.read_csv(io.StringIO(r.content.decode('utf-8')),
                 sep=';',
                 names=[
                     'Open', 'Close', 'High', 'Low', 'Value',
                     'Volume', 'Date', 'End'
                 ]).iloc[2:]
# set Date as first column and drop End
df = (
    df.set_index('Date')
    .drop("End", axis=1)
    .reset_index()
)
# change dtypes: Date to datetime, every other column to float
df["Date"] = df["Date"].astype("M8[us]")
for col in df.columns[1:]:
    df[col] = df[col].astype(float)
# sort by Date so the line traces run left to right
df = df.sort_values("Date").reset_index(drop=True)
fig = go.Figure()
# one line per column after Date
for col in df.columns[1:]:
    fig.add_trace(
        go.Scatter(x=df["Date"],
                   y=df[col],
                   name=col))
fig.show()
I have a column in my Dataframe that has data in the below format
id,value
101,[{'self': 'https://www.web.com/rest/api/101', 'value': 'Yes', 'id': '546'}]
The type of the column (value) is of type pandas.core.series.Series.
I am trying to extract text corresponding to value in the above dataframe.
Expected output:
id, output
101,Yes
See if this works for you:
# Each cell of 'value' holds a list wrapping one dict: take the list's first
# element, then read that dict's 'value' entry. Chaining the .str accessors
# does this element-wise without expanding every dict into a row of a
# temporary DataFrame, which is what .apply(pd.Series) does.
df['value'] = df['value'].str[0].str.get('value')
print(df)
Output
id value
0 101 Yes
import pandas as pd
import numpy as np
# Single sample row: an id plus a one-element list wrapping a dict.
cols = ['id', 'value']
data = [
    [101, [{'self': 'https://www.web.com/rest/api/101', 'value': 'Yes', 'id': '546'}]],
]
df = pd.DataFrame(data, columns=cols)

# Replace each wrapped dict with the text stored under its 'value' key.
df['value'] = [wrapped[0]['value'] for wrapped in df['value']]
print(df)
Result
id value
0 101 Yes
I am trying to plot a matplotlib graph based on the value chosen from a dropdown. I have made the dropdown, and the plots for the values are also ready, but I don't know how to connect the two.
Following is the code of the dropdown:
# Page layout: a single dropdown wrapped in a Div.
# Each option pairs a display label with a short value code.
app.layout = html.Div([
dcc.Dropdown(
id='first-dropdown',
options = [
{'label': 'Chest Pain', 'value': 'cp'},
{'label': 'Resting Blood Pressure', 'value': 'trestbps'},
{'label': 'Serum Cholestrol in mg/dl', 'value': 'chol'},
{'label': 'Fasting Blood Pressure', 'value': 'fbs'},
{'label': 'Resting electrocardiographic results', 'value': 'restecg'},
{'label': 'Maximum heart rate achieved', 'value': 'thalach'},
{'label': 'Exercise induced angina', 'value': 'exang'},
{'label': 'Old Peak', 'value': 'oldpeak'},
{'label': 'Slope of the peak exercise ST segment', 'value': 'slope'},
{'label': 'Number of major vessels (0-3) colored by flourosopy', 'value': 'ca'},
{'label': 'Thalassemia', 'value': 'thal'}
],
# Default selection.
value= 'thalach'
)
])
For each value in the dropdown I have a separate function which returns a plot.
For example, if the label 'Maximum heart rate achieved' (whose value is 'thalach') is selected from the dropdown, I want to call my function plotThalach, which returns a plot like this:
def plotThalach(df):
    """Draw 'thalach' against 'age' as two offset bar series, one per sex.

    Rows with ``sex == 1.0`` are drawn in blue and labelled 'Men'; rows with
    ``sex == 0.0`` are drawn in red, shifted 0.25 to the right, and labelled
    'Women'.

    Args:
        df: Frame providing the 'sex', 'age' and 'thalach' columns.

    Returns:
        The ``plt`` object the chart was drawn with (presumably
        ``matplotlib.pyplot``, imported elsewhere).
    """
    bar_width = 0.25
    men = df[df['sex'] == 1.0]
    women = df[df['sex'] == 0.0]
    series = (
        (men, 0.00, 'b', 'Men'),
        (women, 0.25, 'r', 'Women'),
    )

    plt.figure(figsize=(20, 8))
    for rows, shift, colour, legend_name in series:
        plt.bar(rows['age'] + shift, rows['thalach'],
                color=colour, width=bar_width, label=legend_name)

    plt.legend(loc='upper right')
    plt.xlabel("Age")
    plt.ylabel("Max Heart Rate")
    plt.title("Age vs Max Heart Rate")
    return plt
Now how do i connect both of these in such a way that when a value is selected from the dropdown my function gets called and plot gets displayed on the screen.
It's not so clear why you want to mix plotly-dash and matplotlib, you can easily do it using just plotly-dash
Here is a sample code,
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objs as go
# Sample dataset, read directly from the plotly datasets repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/plotly/'
    'datasets/master/gapminderDataFiveYear.csv')

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

# Dropdown offering every distinct year in the data, defaulting to the earliest.
year_dropdown = dcc.Dropdown(
    id='xaxis-column',
    options=[{'label': str(year), 'value': year} for year in df['year'].unique()],
    value=df['year'].min(),
)

# Graph component that is to be updated when the dropdown changes.
year_graph = dcc.Graph(id='graph-with-slider')

# Main div holding the dropdown followed by the graph.
app.layout = html.Div([year_dropdown, year_graph])
#callback which will be spawned when the input changes, in this case the input is the dropdown value
# Register update_figure as the callback wiring the dropdown's `value` (input)
# to the graph's `figure` (output). The decorator must start with '@'; written
# as a '#' comment it registers nothing and the two argument lines below it
# become a syntax error.
@app.callback(
    Output('graph-with-slider', 'figure'),
    [Input('xaxis-column', 'value')])
def update_figure(selected_year):
    """Build the scatter figure for one year of the dataset.

    Args:
        selected_year: Year chosen in the dropdown; only rows of the
            module-level ``df`` whose ``year`` equals it are plotted.

    Returns:
        A dict with two keys: ``'data'``, a list holding one marker trace
        per continent (GDP per capita on x, life expectancy on y, country
        names as point text), and ``'layout'``, which sets a log-scaled
        x-axis and a fixed 20-90 y-axis range.
    """
    filtered_df = df[df.year == selected_year]
    traces = []
    # One trace per continent so each gets its own legend entry.
    for continent in filtered_df.continent.unique():
        df_by_continent = filtered_df[filtered_df['continent'] == continent]
        traces.append(go.Scatter(
            x=df_by_continent['gdpPercap'],
            y=df_by_continent['lifeExp'],
            text=df_by_continent['country'],
            mode='markers',
            opacity=0.7,
            marker={
                'size': 15,
                'line': {'width': 0.5, 'color': 'white'}
            },
            name=continent
        ))
    return {
        'data': traces,
        'layout': go.Layout(
            xaxis={'type': 'log', 'title': 'GDP Per Capita'},
            yaxis={'title': 'Life Expectancy', 'range': [20, 90]},
            margin={'l': 40, 'b': 40, 't': 10, 'r': 10},
            legend={'x': 0, 'y': 1},
            hovermode='closest'
        )
    }
# Start the Dash server (debug mode) only when this file is run as a script.
if __name__ == '__main__':
    app.run_server(debug=True)
But if you want to show the matplotlib graph instead of plotly-dash graph, you can refer the "Incorporating Matplotlib Plots" section here
For example, I have an array with values [1,2,4,3,6,7,33,2]. I want to get all the values which are bigger than 6. As far as I know, numpy.take can only select values by their indices.
Which function should I use?
You can index into the array with a boolean array index:
>>> a = np.array([1,2,4,3,6,7,33,2])
>>> a > 6
array([False, False, False, False, False, True, True, False], dtype=bool)
>>> a[a > 6]
array([ 7, 33])
If you want the indices where this occurs, you can use np.where:
>>> np.where(a>6)
(array([5, 6]),)