How can a function be applied to a pandas groupby when it requires parameters from multiple columns of the grouped dataframe and returns two scalar values?
Below is a reproducible example; the last line fetches the f_value.
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
import plotly.express as px
n = 100
df = pd.DataFrame({
    'c': np.random.choice(['CATS', 'DOGS'], n),
    'x': np.random.choice(list('ABCDE'), n),
    'y': np.random.normal(5, 1, n)
})
signal = np.where(df['c'].eq('CATS') & df['x'].eq('A'), 1.1, 0)
df['y'] = df['y'] + signal
def get_ols_fp(df, x, y):
    formula = y + '~' + x
    model = ols(formula, df).fit()
    f_value = model.fvalue
    p_value = model.f_pvalue
    return (f_value, p_value)
# getting f_value and p_value works on a single subset dataframe.
get_ols_fp(df[df['c'].eq('CATS')], 'x', 'y')
The code above works and fetches the f_value and the p_value. However, the following does not work.
# how could we run get_ols_fp with a groupby().agg()?
df.groupby('c').agg(get_ols_fp('x', 'y'))
The desired output would be a dataframe with one row per level of the 'c' variable ('CATS' and 'DOGS' in this case), one column for the p_value and another for the f_value.
This works, because groupby().apply passes each group's whole sub-dataframe to the function, while agg only feeds it one column at a time:
def get_ols_fp(df, x=None, y=None):
    formula = y + '~' + x
    model = ols(formula, df).fit()
    f_value = model.fvalue
    p_value = model.f_pvalue
    return pd.Series([f_value, p_value], index=['f_value', 'p_value'])

df.groupby('c').apply(get_ols_fp, x='x', y='y')
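For reference, the result is a dataframe indexed by the group label, with one column per element of the returned Series, roughly (the actual numbers depend on the random data):

#        f_value   p_value
# c
# CATS       ...       ...
# DOGS       ...       ...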
I'd do it a little differently. I don't know if it's the easiest way, but it works.
Example:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
n = 100
df = pd.DataFrame({
    'c': np.random.choice(['CATS', 'DOGS'], n),
    'x': np.random.choice(list('ABCDE'), n),
    'y': np.random.normal(5, 1, n)
})
signal = np.where(df['c'].eq('CATS') & df['x'].eq('A'), 1.1, 0)
df['y'] = df['y'] + signal
def get_ols_fp(df, x, y):
    formula = y + '~' + x
    model = ols(formula, df).fit()
    f_value = model.fvalue
    p_value = model.f_pvalue
    return (f_value, p_value)
# getting f_value and p_value works on a single subset dataframe.
# get_ols_fp(df[df['c'].eq('CATS')], 'x', 'y')
df_result = pd.DataFrame([], columns=["c", "f_value", "p_value"])
for c, dd in df.groupby('c'):  # a single key keeps the group label c a scalar
    v = get_ols_fp(dd, 'x', 'y')
    df_result.loc[len(df_result)] = [c, *v]
df_result
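If the row-by-row .loc[len(df_result)] appends become slow with many groups, the same idea can be written as a list of row tuples built first and turned into a dataframe once (just a sketch of an alternative, not required for correctness):

rows = [(c, *get_ols_fp(dd, 'x', 'y')) for c, dd in df.groupby('c')]
df_result = pd.DataFrame(rows, columns=['c', 'f_value', 'p_value'])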
My code looks like this:
import pandas as pd
import numpy as np
from skimage.io import imread
df = pd.DataFrame()
for i in range(1000):
    try:
        image = imread(f"Images/{i}.jpg")
        featureMatrix = np.zeros((image.shape[0], image.shape[1]))
        for j in range(0, image.shape[0]):
            for k in range(0, image.shape[1]):
                featureMatrix[j][k] = (int(image[j, k, 0]) + int(image[j, k, 1]) + int(image[j, k, 2])) / 3
        features = pd.Series(np.reshape(featureMatrix, (image.shape[0] * image.shape[1])))
        df[f"{i}"] = features
    except:
        pass
df.to_csv("Features.csv")
When I run it I get: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling frame.insert many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use newframe = frame.copy(). The warning is raised when df[f"{i}"] = features runs.
I have tried using pd.concat but I cannot get it to work. Any ideas on how I should replace that line?
To improve performance, avoid inserting a new Series into the dataframe on each of the 1000 iterations.
Instead, yield all the series (setting their name) from a generator function and concatenate them at once with pd.concat:
def collect_features():
    for i in range(1000):
        try:
            image = imread(f"Images/{i}.jpg")
            featureMatrix = np.zeros((image.shape[0], image.shape[1]))
            for j in range(0, image.shape[0]):
                for k in range(0, image.shape[1]):
                    featureMatrix[j][k] = (int(image[j, k, 0]) + int(image[j, k, 1]) + int(image[j, k, 2])) / 3
            yield pd.Series(np.reshape(featureMatrix, (image.shape[0] * image.shape[1])), name=f"{i}")
        except:
            pass
pd.concat(list(collect_features()), axis=1).to_csv("Features.csv")
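As far as I know, pd.concat also accepts an iterator directly, so the list(...) wrapper should be optional:

pd.concat(collect_features(), axis=1).to_csv("Features.csv")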
What about this alternative approach?
import pandas as pd
import numpy as np
from skimage.io import imread
df_list = []
for i in range(1000):
    try:
        image = imread(f"Images/{i}.jpg")
        featureMatrix = np.zeros((image.shape[0], image.shape[1]))
        for j in range(0, image.shape[0]):
            for k in range(0, image.shape[1]):
                featureMatrix[j][k] = (int(image[j, k, 0]) + int(image[j, k, 1]) + int(image[j, k, 2])) / 3
        features = pd.Series(np.reshape(featureMatrix, (image.shape[0] * image.shape[1])))
        df_list.append(features)
    except:
        pass
df = pd.concat(df_list, axis=1)
df.to_csv("Features.csv")
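As an aside, in both versions the nested per-pixel loops can be replaced by a vectorized mean over the color axis, which is typically much faster. A sketch of the loop body, assuming 3-channel RGB images:

image = imread(f"Images/{i}.jpg")
featureMatrix = image[:, :, :3].astype(float).mean(axis=2)  # per-pixel channel mean
features = pd.Series(featureMatrix.ravel(), name=f"{i}")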
I'm trying to base the gradient shading of one dataframe on values from another dataframe (of the same dimensions).
I have the code below, based on an answer to a similar question. However, I need the shading to have the effect of axis=None, whereas the code below applies a column-wise shade.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors

A = pd.DataFrame(np.random.randn(6, 3), columns=['a', 'b', 'c'])
B = pd.DataFrame(np.random.randn(6, 3), columns=['a', 'b', 'c'])

def b_g(s, cmap='PuBu', low=0, high=0):
    # Pass the columns from Dataframe A
    a = A.loc[:, s.name].copy()
    rng = a.max() - a.min()
    norm = colors.Normalize(a.min() - (rng * low),
                            a.max() + (rng * high))
    normed = norm(a.values)
    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

B.style.apply(b_g, cmap='PuBu')
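One way to get the axis=None effect (a minimal sketch, assuming a recent pandas; not tested against your data): with axis=None, Styler.apply passes the whole dataframe to the function, which must then return a same-shaped dataframe of CSS strings. Normalizing against all of A's values at once gives the global shading:

def b_g_global(data, cmap='PuBu', low=0, high=0):
    a = A.to_numpy()
    rng = a.max() - a.min()
    norm = colors.Normalize(a.min() - (rng * low), a.max() + (rng * high))
    normed = norm(a)  # normalize against ALL of A, not column by column
    cm = plt.cm.get_cmap(cmap)
    css = [['background-color: %s' % colors.rgb2hex(cm(normed[i, j]))
            for j in range(a.shape[1])] for i in range(a.shape[0])]
    return pd.DataFrame(css, index=data.index, columns=data.columns)

B.style.apply(b_g_global, axis=None)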
I read data from a file. There are only two numbers in the file (1.63, -0.21), and the output is:
{'y': array([-0.21]), 'x': array([1.63])}
I need the output like this:
position = {'x': 1.63 , 'y' : -0.21}
This is my code:
import pandas as pd
import numpy as np
def read():
    data = pd.read_csv('distance.csv', skipinitialspace=True, header=None)
    x0, y0 = np.array(data.ix[:, 0]), np.array(data.ix[:, 1])
    position = {'x': x0, 'y': y0}
    print position

if __name__ == '__main__':
    try:
        read()
    except KeyboardInterrupt:
        rospy.loginfo('Shutting down')
Please help me, and thank you in advance.
Change this:
x0, y0 = np.array(data.ix[:,0]), np.array(data.ix[:,1])
to:
x0, y0 = data.ix[:,0], data.ix[:,1]
Essentially you need to remove the np.array wrapping, which is converting your floats to an ndarray.
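Note that .ix has since been removed from pandas; on a current version the positional equivalent is .iloc, and taking the first row gives plain scalars, which matches the desired output (a sketch):

x0, y0 = data.iloc[0, 0], data.iloc[0, 1]  # scalar floats, not Series
position = {'x': x0, 'y': y0}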
Try this:
import numpy as np
data = np.genfromtxt('distance.csv', dtype=list).tolist()
x0,y0 = float(data[0]), float(data[1])
position = {'x': x0 , 'y' : y0}
print position
The output is:
{'y': -0.7, 'x': 1.7}
This way worked, but it is long:
import pandas as pd
import numpy as np
import csv
def read():
    data_path = 'distance.csv'
    with open(data_path, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        # get all the rows as a list
        data = list(reader)
    # transform data into numpy array
    data = np.array(data).astype(float)
    a = data[0]
    x0, y0 = a[0], a[1]
    position = {'x': x0, 'y': y0}
    print position

if __name__ == '__main__':
    try:
        read()
    except KeyboardInterrupt:
        rospy.loginfo('Shutting down')
Output:
{'y': -0.21, 'x': 1.63}
This way also works:
import pandas as pd
import numpy as np
def read():
    data = np.genfromtxt('distance.csv', dtype=str, delimiter=',')
    x0, y0 = data[0], data[1]
    position = {'x': x0, 'y': y0}
    print position

if __name__ == '__main__':
    try:
        read()
    except KeyboardInterrupt:
        rospy.loginfo('Shutting down')
Output:
{'y': '-0.21', 'x': '1.63'}
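If floats rather than strings are wanted, a shorter variant (assuming the file holds a single line like "1.63, -0.21") lets genfromtxt parse the numbers directly:

data = np.genfromtxt('distance.csv', delimiter=',')
position = {'x': float(data[0]), 'y': float(data[1])}
print position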
I have some data (A,B) and have used seaborn to make a contour plot of it.
import pandas as pd
import seaborn as sns
# Dataframe 1
df_1 = pd.DataFrame({'A':[1,2,1,2,3,4,2,1,4], 'B': [2,1,2,1,2,3,4,2,1]})
# Plot A v B
ax = sns.kdeplot(df_1["A"], df_1["B"])
I would like to get the cumulative count (C). I'd like to make a new plot with C on the Y axis, A on the X axis, and contours of B. I think a good start would be a new dataframe of A, B, H, where H is the count (the height of the volcano). The resulting plot might look a bit like this:
I think I've worked it out but this solution is messy:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # needed for plt.ylabel / plt.legend below
from scipy import stats
from itertools import chain
Fruit = 9 # How many were there?
# Dataframe 1
df_1 = pd.DataFrame({'A':[1,2,1,2,3,4,2,1,4], 'B': [2,1,2,1,2,3,4,2,1]})
m1 = df_1["A"]
m2 = df_1["B"]
xmin = 0
xmax = 5
ymin = 0
ymax = 5
# Kernel density estimate:
X, Y = np.mgrid[xmin:xmax:5j, ymin:ymax:5j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
H = np.reshape(kernel(positions).T, X.shape)
# Re-jig it
X = X.reshape((25, 1))
Y = Y.reshape((25, 1))
H = H.reshape((25, 1))
X_L = list(chain.from_iterable(X))
Y_L = list(chain.from_iterable(Y))
H_L = list(chain.from_iterable(H))
df_2 = pd.DataFrame({'A': X_L, 'B': Y_L, 'H': H_L})
# Find the cumulative count C
df_2 = df_2.sort_values('B')
C = np.cumsum(H)
C = C.reshape((25, 1))
C_L = list(chain.from_iterable(C))
df_2['C'] = pd.DataFrame(C_L, index=df_2.index)
# Scale C
Max_C = np.amax(C)
df_2.loc[:,'C'] *= Fruit / Max_C
# Break it down to constant B
df_2_B_0 = df_2[df_2['B'] == 0]
df_2_B_1 = df_2[df_2['B'] == 1]
df_2_B_2 = df_2[df_2['B'] == 2]
df_2_B_3 = df_2[df_2['B'] == 3]
df_2_B_4 = df_2[df_2['B'] == 4]
# Plot A v C
ax = df_2_B_0.plot('A','C', label='0')
df_2_B_1.plot('A','C',ax=ax, label='1')
df_2_B_2.plot('A','C',ax=ax, label='2')
df_2_B_3.plot('A','C',ax=ax, label='3')
df_2_B_4.plot('A','C',ax=ax, label='4')
plt.ylabel('C')
plt.legend(title='B')
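One small cleanup of the constant-B breakdown: instead of five manual slices, grouping by B iterates over whatever levels actually exist (note that np.mgrid[0:5:5j] produces 0, 1.25, 2.5, 3.75, 5, so equality tests against the integers 1-4 may match nothing; grouping avoids hardcoding the levels). A sketch, assuming df_2 as built above:

fig, ax = plt.subplots()
for b, grp in df_2.groupby('B'):
    grp.plot('A', 'C', ax=ax, label=str(b))
plt.ylabel('C')
plt.legend(title='B')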
To keep track of all simulation results in a parametric run, I create a MultiIndex DataFrame named dfParRun in pandas as follows:
import pandas as pd
import numpy as np
import itertools
limOpt = [0.1,1,10]
reimbOpt = ['Cash','Time']
xOpt = [0.1, .02, .03, .04, .05, .06, .07, .08]
zOpt = [1, 5, 10]
arrays = [limOpt, reimbOpt, xOpt, zOpt]
parameters = list(itertools.product(*arrays))
nPar = len(parameters)
variables = ['X', 'Y', 'Z']
nVar = len(variables)
index = pd.MultiIndex.from_tuples(parameters, names=['lim', 'reimb', 'xMax', 'zMax'])
dfParRun = pd.DataFrame(np.random.rand(nPar, nVar), index=index, columns=variables)
To analyse my parametric run, I want to slice this dataframe, but that seems a burden. For example, I want all results with xMax above 0.05 and lim equal to 10. At the moment, the only working method I can find is:
df = dfParRun.reset_index()
df.loc[(df.xMax > 0.05) & (df.lim == 10)]
I wonder if there is a method that avoids resetting the index of the DataFrame?
Option 1
Use pd.IndexSlice.
Caveat: it requires a sorted index.
dfParRun.sort_index().loc[pd.IndexSlice[10, :, .0500001:, :]]  # .0500001 approximates a strict > 0.05
Option 2
Use your df after having reset_index:
df.query('xMax > 0.05 & lim == 10')
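Note that query can also reference MultiIndex level names directly, so option 2 works on dfParRun itself, with no reset_index at all:

dfParRun.query('xMax > 0.05 & lim == 10')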
Setup:
import pandas as pd
import numpy as np
import itertools
limOpt = [0.1,1,10]
reimbOpt = ['Cash','Time']
xOpt = [0.1, .02, .03, .04, .05, .06, .07, .08]
zOpt = [1, 5, 10]
arrays = [limOpt, reimbOpt, xOpt, zOpt]
parameters = list(itertools.product(*arrays))
nPar = len(parameters)
variables = ['X', 'Y', 'Z']
nVar = len(variables)
index = pd.MultiIndex.from_tuples(parameters, names=['lim', 'reimb', 'xMax', 'zMax'])
dfParRun = pd.DataFrame(np.random.rand(nPar, nVar), index=index, columns=variables)
df = dfParRun.reset_index()