How to check the normality of data in a column grouped by an index - pandas

I'm working on a dataset that represents the completion time of some activities performed in some processes. There are just 6 types of activities, which repeat throughout the dataset and are each described by a numerical value. The example dataset is as follows:
name duration
1 10
2 12
3 34
4 89
5 44
6 23
1 15
2 12
3 39
4 67
5 47
6 13
I'm trying to check if the duration of the activity is normally distributed with the following code:
import numpy as np
import pylab
import scipy.stats as stats
import seaborn as sns
from scipy.stats import normaltest

measurements = df['duration']
stats.probplot(measurements, dist='norm', plot=pylab)
pylab.show()
ax = sns.distplot(measurements)
stat, p = normaltest(measurements)
print('stat=%.3f, p=%.3f\n' % (stat, p))
if p > 0.05:
    print('probably gaussian')
else:
    print('probably non gaussian')
But I want to do it for each type of activity, which means applying stats.probplot(), sns.distplot() and normaltest() to each group of activities (e.g. checking whether all the activities called 1 have a normally distributed duration).
Any idea how I can tell these functions to return different plots for each group of activities?

Assuming you have at least 8 samples per activity (normaltest will throw an error if you don't), you can loop through your data based on the unique activity values. You'll have to call pylab.show() at the end of each graph so that the plots are not drawn on top of each other:
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stats
import seaborn as sns
from scipy.stats import normaltest
import random    # Only needed by me to create a mock dataframe
import warnings  # "distplot" is deprecated. Look into using "displot"... in the meantime
warnings.filterwarnings('ignore')  # I got sick of seeing the warning so I muted it

name = [1,2,3,4,5,6]*8
duration = [random.choice(range(0,100)) for _ in range(8*6)]
df = pd.DataFrame({"name":name, "duration":duration})

for name in df.name.unique():
    nameDF = df[df.name.eq(name)]
    measurements = nameDF['duration']
    stats.probplot(measurements, dist='norm', plot=pylab)
    pylab.show()
    ax = sns.distplot(measurements)
    ax.set_title(f'Name: {name}')
    pylab.show()
    stat, p = normaltest(measurements)
    print('stat=%.3f, p=%.3f\n' % (stat, p))
    if p > 0.05:
        print('probably gaussian')
    else:
        print('probably non gaussian')
...and so on: the loop produces one probability plot, one distribution plot and one test printout per activity.
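An alternative to filtering on df.name.unique() is df.groupby('name'), which yields each activity's sub-frame directly. A minimal sketch under the same assumptions (df as built above; seaborn >= 0.11 for histplot, which replaces the deprecated distplot):

import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from scipy.stats import normaltest

for name, group in df.groupby('name'):
    measurements = group['duration']
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    stats.probplot(measurements, dist='norm', plot=ax1)  # Q-Q plot for this activity
    sns.histplot(measurements, kde=True, ax=ax2)         # distribution for this activity
    fig.suptitle(f'Name: {name}')
    plt.show()
    stat, p = normaltest(measurements)
    print(f'name={name}: stat={stat:.3f}, p={p:.3f}')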


Plot a barchart after pandas.value_counts() [duplicate]

I have a dataframe with several columns and I need to plot a graph based on the number of counts in the 'Total' column.
I ran the following code:
df['Total'].value_counts()
The output are as follows:
2 10
20 15
4 8
8 20
This means that the number 2 appears in the Total column 10 times, the number 20 appears 15 times, and so on.
How do I plot a bar chart with the x-axis as the number itself and the y-axis as the occurrences, in ascending order? The x-axis should plot 2 -> 4 -> 8 -> 20.
What are the next steps after:
%matplotlib inline
import matplotlib.pyplot as plt
Consider this as an example:
This denotes your 'Total' column -> [2,2,2,2,2,20,20,20,20,4,4,4,8,8,8,8,8]
import pandas as pd
import matplotlib.pyplot as plt

total = [2,2,2,2,2,20,20,20,20,4,4,4,8,8,8,8,8]
df = pd.DataFrame(total, columns=['total'])
#print(df.value_counts())
fig, ax = plt.subplots()
df['total'].value_counts().plot(ax=ax, kind='bar', ylabel='frequency')
plt.show()
This gives the following output (a bar chart of the counts, ordered by frequency):
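Note that value_counts() orders the bars by frequency. Since the question asks for the x-axis in ascending numeric order (2 -> 4 -> 8 -> 20), sort the counts by their index before plotting; a small variation of the code above:

import pandas as pd
import matplotlib.pyplot as plt

total = [2,2,2,2,2,20,20,20,20,4,4,4,8,8,8,8,8]
counts = pd.Series(total).value_counts().sort_index()  # index ascending: 2, 4, 8, 20
fig, ax = plt.subplots()
counts.plot(ax=ax, kind='bar', ylabel='frequency')
plt.show()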

statsmodels.formula.api.ols().fit().pvalues returns a pandas Series instead of a numpy array

This may be hard to explain because it's a chunk of some really large code - I don't expect it to be reproducible.
But essentially it's a simulation which (using multiple simulated datasets) creates a one-way or two-way regression and calculates the respective t-values and p-values for them.
However, for some of the datasets (with the same information and no missing values), statsmodels.formula.api.ols(...).fit() returns the p-values / t-values as a pandas Series instead of a numpy array (even for one-way studies).
Could someone please explain why, and whether there is a way to specify the output?
An example dataframe looks like this: (x0-x187 is our y, genotype and treatment are the desired factors, staging is a factor used for normalisation)
                         x0        x1        ...  treatment  genotype
200926_ku20_e1_wt_veh    0.075821  0.012796  ...  veh        wt
201210_ku25_e7_wt_veh    0.082307  0.007596  ...  veh        wt
201127_ku55_e6_wt_veh    0.083049  0.008978  ...  veh        wt
201220_ku52_e2_wt_veh    0.078414  0.013488  ...  veh        wt
...                      ...       ...       ...  ...        ...
210913_b6ku_22297_e5_wt  0.067858  0.008081  ...  treat      wt
210821_b6ku_3_e5_wt      0.070417  0.012396  ...  treat      wt
And then the code:
import subprocess as sub
import os
import struct
from pathlib import Path
import tempfile
from typing import Tuple
import shutil
from logzero import logger as logging
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

pvals, tvals = [], []  # initialised earlier in the full code
# data, df and two_way are also defined earlier in the full code

for col in range(data.shape[1]):
    if not df[f'x{col}'].any():
        p = np.nan
        t = np.nan
    else:
        if two_way:
            # two-way model - if it's just the geno or treat comparison,
            # the one-factor col will be ignored
            # for some simulations smf is returning a Series.
            fit = smf.ols(formula=f'x{col} ~ genotype * treatment + staging', data=df, missing='drop').fit()
            # get all pvals except intercept and staging
            p = fit.pvalues[~fit.pvalues.index.isin(['Intercept', 'staging'])]
            t = fit.tvalues[~fit.tvalues.index.isin(['Intercept', 'staging'])]
        else:
            fit = smf.ols(formula=f'x{col} ~ genotype + staging', data=df, missing='drop').fit()
            p = fit.pvalues['genotype[T.wt]']
            t = fit.tvalues['genotype[T.wt]']
    pvals.append(p)
    tvals.append(t)

p_all = np.array(pvals)
print("example", p_all[0])
print(type(p_all[0][0]), p_all[0][0])
And finally some output:
Desired output:
example [1.63688492e-01 6.05907115e-06 7.70710934e-02]
<class 'numpy.float64'> 0.16368849176977607
"Error" output:
example genotype[T.wt]                     0.862423
treatment[T.veh]                   0.000177
genotype[T.wt]:treatment[T.veh]    0.522066
dtype: float64
<class 'numpy.float64'> 0.8624226150886212
I've manually corrected the data but I would rather not have to do dumb fixes in the future.
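For what it's worth: when a model is fitted from a formula on a pandas DataFrame, statsmodels labels the results, so fit.pvalues is a pandas Series. Selecting with a boolean mask (the two-way branch) keeps it a Series of several values, while selecting a single label (the one-way branch) returns a plain scalar, so the elements of pvals can end up with mixed types depending on how each simulated dataset was constructed. A minimal sketch of one way to normalise the two-way branch, reusing the question's variable names:

import numpy as np

# fit = smf.ols(...).fit() as in the question
mask = ~fit.pvalues.index.isin(['Intercept', 'staging'])
p = fit.pvalues[mask].to_numpy()  # pandas Series -> plain float64 ndarray
t = fit.tvalues[mask].to_numpy()

With every element of pvals a plain array (or scalar), np.array(pvals) then behaves consistently across simulations.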

How to construct a temporal network using Python

I have data for different stations on different days:
Station_start Station_end Day Hour
A B 1 14
B C 1 10
C A 1 10
B A 2 15
A C 2 13
D E 2 12
E B 2 14
F C 3 12
I want to construct a dynamic/interactive network where the connections change according to the day.
I found an example of this in the pathpy tutorial.
But how do I load a pandas dataframe into it, with Station_start and Station_end as the nodes?
Here is a way to do what you want. First, load your data into a pandas dataframe using pd.read_fwf (I saved your data in a file called data_net.txt).
Then incrementally add edges to your temporal network with TemporalNetwork.add_edge. Run t in a cell to see the animation.
See code below for more details:
import pandas as pd
import pathpy as pp

df = pd.read_fwf('data_net.txt')
t = pp.TemporalNetwork()
# add one edge per row: (source, target, timestamp)
for i in range(len(df)):
    t.add_edge(df['Station_start'][i], df['Station_end'][i], int(df['Day'][i]))
t  # run t in a cell to start the animation
This code returns an animated temporal network, rendered inline in the notebook. Based on the link you gave, you can also control the speed of the animation by styling the network with pathpy; a sketch follows.
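A sketch of such styling, based on the pathpy 2 tutorial (treat the exact parameter names as assumptions; they may differ between versions):

import pathpy as pp

style = {
    'ms_per_frame': 2000,  # how long each frame is shown, in milliseconds
    'ts_per_frame': 1,     # how many timestamps are compressed into one frame
    'node_size': 15,       # purely cosmetic
}
pp.visualisation.plot(t, **style)  # t is the TemporalNetwork built above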

Multiprocessing the Fuzzy match in pandas

I have two data frames:
Df_Address, which has 347k distinct addresses, and Df_Project, which has 24k records with
Project_Id, Project_Start_Date and Project_Address.
I want to check if there is a fuzzy match of my Project_Address in Df_Address. If there is a match, I want to extract the Project_Id and Project_Start_Date for it. Below is the code of what I am trying:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
Df_Address = pd.read_csv("Cantractor_Addresses.csv")
Df_Project = pd.read_csv("Project_info.csv")
#address = list(Df_Project["Project_Address"])
def fuzzy_match(x, choices, cutoff):
    print(x)
    return process.extractOne(
        x, choices=choices, score_cutoff=cutoff
    )

Matched = Df_Address["Address"].apply(
    fuzzy_match,
    args=(
        Df_Project["Project_Address"],
        80
    )
)
This code does provide output in the form of a tuple
('matched_string', score)
but it also returns strings that are only similar. I also need to extract
Project_Id and Project_Start_Date
for the matches. Can someone help me achieve this using parallel processing, as the data is huge?
You can convert the tuples into a dataframe and then join it to your base data frame.
import pandas as pd

Df_Address = pd.DataFrame({'address': ['abc','cdf'], 'random_stuff': [100,200]})
Matched = (('abc',10),('cdf',20))
dist = pd.DataFrame(Matched)
dist.columns = ['address','distance']
final = Df_Address.merge(dist, how='left', on='address')
print(final)
Output:
address random_stuff distance
0 abc 100 10
1 cdf 200 20
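The merge above doesn't cover the parallel-processing part of the question. A sketch of one way to parallelise the matching with the standard library's multiprocessing module (the file names and the cutoff of 80 come from the question; the worker count and chunksize are assumptions to tune):

import pandas as pd
from functools import partial
from multiprocessing import Pool
from fuzzywuzzy import process

def fuzzy_match(x, choices, cutoff):
    # best match for one address, or None if nothing clears the cutoff
    return process.extractOne(x, choices=choices, score_cutoff=cutoff)

if __name__ == '__main__':
    Df_Address = pd.read_csv("Cantractor_Addresses.csv")
    Df_Project = pd.read_csv("Project_info.csv")
    choices = Df_Project["Project_Address"].tolist()
    with Pool(processes=4) as pool:  # roughly one worker per core
        matched = pool.map(partial(fuzzy_match, choices=choices, cutoff=80),
                           Df_Address["Address"], chunksize=1000)

Turning matched into a dataframe and merging it onto Df_Project on Project_Address then recovers Project_Id and Project_Start_Date, exactly as in the merge example above.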

Integrating SQL and pandas with Bokeh

Here we load packages, write an SQL query to integrate with pandas, and finally use Bokeh to show the plot, but Bokeh is not showing anything.
You can consider the following as the dataset df_new2:
name success_rate failure_rate
A 94.7 5.3
B 94.3 5.7
C 91 9
D 88 13
E 84 16
F 81 19
G 78 22
H 74.6 25.4
The code starts here
import pandas.io.sql
import pandas as pd
import pyodbc
from bokeh import mpl
from bokeh.plotting import output_file,show
server = 'root' #getting the server to work
db = 'y' #assigning database
# Create the connection
conn = pyodbc.connect("DRIVER={MySQL ODBC 3.51 Driver};SERVER=localhost;PORT= 3306;DATABASE=y;UID=root;PWD=123456789;")
cursor=conn.cursor()
# query db- Here we are trying to count the number of success in a table and the name for which the success has been found by joining tables
sql = """
SELECT count(*) AS TOTAL,
COUNT(CASE WHEN status=0 THEN 1 END) AS success,
b.name
FROM a
JOIN b
ON b.id=a.merchant
GROUP BY merchant
LIMIT 10
"""
df = pandas.io.sql.read_sql(sql, conn) #defining df as query result
df.head()
df_new=df.set_index('name') #indexing as the name of a
df_new['success_rate']=df_new['success']*100/df_new['TOTAL']
df_new['failure_rate']=100-df_new['success_rate'] #assigning failure rate
df_new2=pd.DataFrame(df_new,columns=['success_rate','failure_rate'])
p=df_new2.plot(kind='barh',stacked=True)
output_file("pandas_series.html", title="pandas_series.py example") #assigning the name of output screen
show(mpl.to_bokeh) #showing the output of bokeh
I've got something a bit more useful for you now. I had to avoid mpl as I couldn't get it to work; one possible reason is that I don't think horizontal bar charts are available in Bokeh.
import pandas as pd
from bokeh.charts import Bar
from bokeh.plotting import output_file, show
from bokeh.charts.operations import blend
from bokeh.charts.attributes import cat, color
df_new2 = pd.DataFrame({'Success Rate' : [94.7,94.3,91,88,84,81,78,74.6], 'Failure Rate' : [5.3,5.7,9,12,16,19,22,25.4]})
df_new2['inds'] = ['A','B','C','D','E','F','G','H']
p = Bar(df_new2,
values=blend('Failure Rate','Success Rate', name='% Success/Failure', labels_name='stacked'),
label=cat('inds'),
stack=cat(columns='stacked', sort=False),
color=color(columns='stacked', palette=['Red', 'Green'],
sort=False),
legend='top_right',
title="Success Rate vs. Failure Rate")
output_file("pandas_series.html", title="pandas_series.py example") #assigning the name of output screen
show(p) #showing the output of bokeh
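Note that the bokeh.charts API used above has since been removed from Bokeh. A sketch of the same stacked chart with the modern bokeh.plotting API (assuming a reasonably recent Bokeh release; vbar_stack draws a vertical stacked bar chart, since horizontal stacking was the sticking point above):

from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_file, show

names = ['A','B','C','D','E','F','G','H']
source = ColumnDataSource({'names': names,
                           'Success Rate': [94.7,94.3,91,88,84,81,78,74.6],
                           'Failure Rate': [5.3,5.7,9,12,16,19,22,25.4]})
p = figure(x_range=names, title="Success Rate vs. Failure Rate")
p.vbar_stack(['Success Rate','Failure Rate'], x='names', width=0.9,
             color=['green','red'], source=source,
             legend_label=['Success Rate','Failure Rate'])
output_file("pandas_series.html", title="pandas_series.py example")
show(p)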