Binning data and plotting - pandas

I have a dataframe of essentially random numbers (except for one column), some of which are NaNs. MWE:
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
randomNumberGenerator = np.random.RandomState(1000)
z = 5 * randomNumberGenerator.rand(101)
A = 4 * z - 3 + randomNumberGenerator.randn(101)
B = 4 * z - 2 + randomNumberGenerator.randn(101)
C = 4 * z - 1 + randomNumberGenerator.randn(101)
D = 4 * z - 4 + randomNumberGenerator.randn(101)
A[50] = np.nan
A[:3] = np.nan
B[12:20] = np.nan
sources = pd.DataFrame({'z': z})
sources['A'] = A
sources['B'] = B
sources['C'] = C
sources['D'] = D
#sources= sources.dropna()
x = sources.z
y1 = sources.A
y2 = sources.B
y3 = sources.C
y4 = sources.D
for i in [y1, y2, y3, y4]:
    count = np.count_nonzero(~np.logical_or(np.isnan(x), np.isnan(i)))
    label = 'Points plotted: %d' % count
    plt.scatter(x, i, label=label)
plt.legend()
I need to bin the data according to x and plot different columns in each bin, in 3 side-by-side subplots:
x_1 <= 1 plot A-B | 1 < x_2 < 3 plot B+C | 3 < x_3 plot C-D
I've tried to bin the data with
x1 = sources[sources['z']<1] # z < 1
x2 = sources[sources['z']<3]
x2 = x2[x2['z']>=1] # 1<= z < 3
x3 = sources[sources['z']<max(z)]
x3 = x3[x3['z']>=3] # 3 <= z <= max(z)
x1 = x1['z']
x2 = x2['z']
x3 = x3['z']
but there's got to be a better way to go about it. What's the best way to produce something like this?

For binning in pandas, cut is used, so the solution is:
sources = pd.DataFrame({'z': z})
sources['A'] = A
sources['B'] = B
sources['C'] = C
sources['D'] = D
#sources= sources.dropna()
bins = pd.cut(sources['z'], [-np.inf, 1, 3, max(z)], labels=[1,2,3])
m1 = bins == 1
m2 = bins == 2
m3 = bins == 3
x1 = sources.loc[m1, 'z']
x2 = sources.loc[m2, 'z']
x3 = sources.loc[m3, 'z']
y11 = sources.loc[m1, 'A']
y12 = sources.loc[m1, 'B']
y21 = sources.loc[m2, 'B']
y22 = sources.loc[m2, 'C']
y31 = sources.loc[m3, 'C']
y32 = sources.loc[m3, 'D']
tups = [(x1, x1, y11, y12), (x2, x2, y21, y22), (x3, x3, y31, y32)]
fig, ax = plt.subplots(1, 3)
ax = ax.flatten()
for k, (i1, i2, j1, j2) in enumerate(tups):
    count1 = np.count_nonzero(~np.logical_or(np.isnan(i1), np.isnan(j1)))
    count2 = np.count_nonzero(~np.logical_or(np.isnan(i2), np.isnan(j2)))
    label1 = 'Points plotted: %d' % count1
    label2 = 'Points plotted: %d' % count2
    ax[k].scatter(i1, j1, label=label1)
    ax[k].scatter(i2, j2, label=label2)
    ax[k].legend()
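For intuition, pd.cut assigns each value to the bin whose interval contains it; intervals are right-closed by default, which is why z <= 1 lands in the first bin. A minimal sketch on a few hand-picked values (the series s here is illustrative):
import numpy as np
import pandas as pd

s = pd.Series([0.5, 1.5, 2.5, 3.5])
bins = pd.cut(s, [-np.inf, 1, 3, 4], labels=[1, 2, 3])
# 0.5 -> 1, 1.5 -> 2, 2.5 -> 2, 3.5 -> 3
# (intervals are right-closed: (-inf, 1], (1, 3], (3, 4])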


centre the peak at x=0

Right now the rectangle signal is centred on x = 4; how can I make it centred on x = 0?
import numpy as np
import matplotlib.pyplot as plt

def rect(n, T):
    a = np.zeros(int((n - T) / 2))
    b = np.ones((T,))
    c = np.zeros(int((n - T) / 2))
    a1 = np.append(a, b)
    a2 = np.append(a1, c)
    return a2

x = rect(11, 6)
plt.step(x, 'r')
plt.show()
This is as far as I've got. I'd appreciate it if anyone could give me an idea.
A method to center the rectangle at x = 0 is to provide x values to plt.step. One way to accomplish this is to use numpy's arange and center the x values around 0 using the length of the array returned by the rect function:
# Changed to y because it will be our y values in plt.step
y = rect(11, 6)
# Add 0.5 so it's centered
x = np.arange(-len(y)/2 + 0.5, len(y)/2 + 0.5)
And then plot it using plt.step and setting where to mid (more info in the plt.step docs):
plt.step(x, y, where='mid', color='r')
Hope this helps. Here is the full code:
import numpy as np
import matplotlib.pyplot as plt
def rect(n, T):
    a = np.zeros(int((n - T) / 2))
    b = np.ones((T,))
    c = np.zeros(int((n - T) / 2))
    a1 = np.append(a, b)
    a2 = np.append(a1, c)
    return a2
y = rect(11, 6)
# Add 0.5 so it's centered
x = np.arange(-len(y)/2 + 0.5, len(y)/2 + 0.5)
plt.step(x, y, where='mid', color='r')
plt.show()
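Note that where='mid' draws each horizontal segment centered on its x value, so the pulse edges fall halfway between samples; with the default where='pre' the whole signal would appear shifted by half a sample.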

Offset rotation matrix

I'm working with 2 IMUs. I need to offset all frames by the first frame from the sensor. I have created a fictive scenario where I know the rotation and the wanted result precisely. I need the two sensors to show the same result when their initial (start) orientation is subtracted.
import numpy as np
# Sensor 0,1 and 2 start orientation in degrees
s0_x = 30
s0_y = 0
s0_z = 0
s1_x = 0
s1_y = 40
s1_z = 0
s2_x = 10
s2_y = 40
s2_z = -10
# Change from start frame 1
x1 = 20
y1 = 10
z1 = 0
# Change from start frame 2
x2 = 60
y2 = 30
z2 = 30
GCS = [[1,0,0],[0,1,0],[0,0,1]]
sensor0 = [[s0_x, s0_y, s0_z], [s0_x, s0_y, s0_z], [s0_x, s0_y, s0_z]]
sensor1 = [[s1_x, s1_y, s1_z], [s1_x + x1, s1_y + y1, s1_z + z1],[s1_x + x1 + x2, s1_y + y1+ y2, s1_z + z1+ z2]]
sensor2 = [[s2_x, s2_y, s2_z], [s2_x + x1, s2_y + y1, s2_z + z1], [s2_x + x1+ x2, s2_y + y1+ y2, s2_z + z1+ z2]]
# rotation matrix around the X axis (input in deg)
def Rot_Mat_X(theta):
    r = np.array([[1, 0, 0],
                  [0, np.cos(np.deg2rad(theta)), -np.sin(np.deg2rad(theta))],
                  [0, np.sin(np.deg2rad(theta)), np.cos(np.deg2rad(theta))]])
    return r

# rotation matrix around the Y axis (input in deg)
def Rot_Mat_Y(theta):
    r = np.array([[np.cos(np.deg2rad(theta)), 0, np.sin(np.deg2rad(theta))],
                  [0, 1, 0],
                  [-np.sin(np.deg2rad(theta)), 0, np.cos(np.deg2rad(theta))]])
    return r

# rotation matrix around the Z axis (input in deg)
def Rot_Mat_Z(theta):
    r = np.array([[np.cos(np.deg2rad(theta)), -np.sin(np.deg2rad(theta)), 0],
                  [np.sin(np.deg2rad(theta)), np.cos(np.deg2rad(theta)), 0],
                  [0, 0, 1]])
    return r
# Creating the rotation matrices
r_sensor0 = []
r_sensor1 = []
r_sensor2 = []
for i in range(len(sensor1)):
    r_sensor1_z = np.matmul(Rot_Mat_X(sensor1[i][0]), GCS)
    r_sensor1_zy = np.matmul(Rot_Mat_Y(sensor1[i][1]), r_sensor1_z)
    r_sensor1_zyx = np.matmul(Rot_Mat_Z(sensor1[i][2]), r_sensor1_zy)
    r_sensor1.append(r_sensor1_zyx)
    r_sensor2_z = np.matmul(Rot_Mat_X(sensor2[i][0]), GCS)
    r_sensor2_zy = np.matmul(Rot_Mat_Y(sensor2[i][1]), r_sensor2_z)
    r_sensor2_zyx = np.matmul(Rot_Mat_Z(sensor2[i][2]), r_sensor2_zy)
    r_sensor2.append(r_sensor2_zyx)
r_start_sensor1 = r_sensor1[0]
r_start_sensor2 = r_sensor2[0]
r_offset_sensor1 = []
r_offset_sensor2 = []
for i in range(len(sensor0)):
    r_offset_sensor1.append(np.matmul(np.transpose(r_start_sensor1), r_sensor1[i]))
    r_offset_sensor2.append(np.matmul(np.transpose(r_start_sensor2), r_sensor2[i]))
# result:
r_offset_sensor1[0] = [[1,0,0],[0,1,0],[0,0,1]]
r_offset_sensor1[1] = [[0.984,0.059,0.163],[0,0.939,-0.342],[-0.173,0.336,0.925]]
r_offset_sensor1[2] = [[0.748,0.466,0.471],[0.086,0.635,-0.767],[-0.657,0.615,0.434]]
r_offset_sensor2[0] = [[1,0,0],[0,1,0],[0,0,1]]
r_offset_sensor2[1] = [[0.984,0.086,0.150],[-0.03,0.938,-0.344],[-0.171,0.334,0.926]]
r_offset_sensor2[2] = [[0.748,0.541,0.383],[-0.028,0.603,-0.797],[-0.662,0.585,0.466]]
I expect the results for sensors 1 and 2 to be equal in every frame, but they aren't. They should be:
frame[0] = [1,0,0],[0,1,0],[0,0,1]
frame[1] = [0.984,0,0.173],[0.059,0.939,-0.336],[-0.163,0.342,0.9254]
frame[2] = [0.750,-0.433,0.50],[0.625,0.216,-0.750],[0.216,0.875,0.433]
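For reference, the same offset computation can be written compactly with scipy (an assumption: scipy.spatial.transform is available); extrinsic 'xyz' Euler angles reproduce the Rot_Mat_Z @ Rot_Mat_Y @ Rot_Mat_X composition used above:
from scipy.spatial.transform import Rotation as R

# Sensor 1 Euler angles per frame (deg), as constructed above
frames_s1 = [[0, 40, 0], [20, 50, 0], [80, 80, 30]]
rots = [R.from_euler('xyz', f, degrees=True) for f in frames_s1]
# Offset every frame by the inverse of the start rotation
offsets = [(rots[0].inv() * r).as_matrix() for r in rots]
Note that rotations do not commute, so applying the same Euler-angle increments on top of two different start orientations does not in general produce the same relative rotation; the two offset sequences are therefore not expected to match.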

How to set 'y > 0' formula in set_xlim of matplotlib?

I want to set the x range according to the y values when plotting a graph, e.g. only where y > 0, but I'm not sure how to do this. Could you let me know how to set it?
df = pd.read_csv('file.csv')
x = np.array(df['A'])
y = np.array(df['B'])
z = np.array(df['C'])
x_for_ax1 = np.ma.masked_where((y < 0) | (y > 100), x)
fig, (ax2, ax1) = plt.subplots(ncols=1, nrows=2)
# the x-limits of ax1 and ax2 should be the same.
ax1.set_ylim([-10, 40])
ax2.set_ylim([-5, 5])
ax1.set_xlim([x_for_ax1.min(), x_for_ax1.max()])
ax2.set_xlim([x_for_ax1.min(), x_for_ax1.max()])
If you want to set the x-limits to the range where the y values fall within the y-axis limits, you can use a masked array and take its minimum and maximum.
In the example below, on the left both subplots get the x-limits where either y or z is in range. On the right, each subplot only gets the x-range where its own curve is in range.
For demonstration purposes, the example creates a data frame from some dummy data.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
a = np.linspace(-1, 4, 500)
b = np.sin(a) * 100
c = np.cos(a) * 150
df = pd.DataFrame({'A': a, 'B': b, 'C': c})
x = np.array(df['A'])
y = np.array(df['B'])
z = np.array(df['C'])
fig, ((ax1, ax3),(ax2, ax4)) = plt.subplots(ncols=2, nrows=2)
ax1.set_xlabel('x')
ax2.set_xlabel('x')
ax3.set_xlabel('x')
ax4.set_xlabel('x')
ax1.set_ylabel('y')
ax3.set_ylabel('y')
ax2.set_ylabel('z')
ax4.set_ylabel('z')
ymin = 1
ymax = 100
zmin = 1
zmax = 150
x_for_ax1 = np.ma.masked_where(((y < ymin) | (y > ymax)) & ((z < zmin) | (z > zmax)), x)
x_for_ax3 = np.ma.masked_where((y < ymin) | (y > ymax), x)
x_for_ax4 = np.ma.masked_where((z < zmin) | (z > zmax), x)
ax1.plot(x, y)
ax3.plot(x, y)
ax1.set_ylim([ymin, ymax])
ax3.set_ylim([ymin, ymax])
ax2.plot(x, z)
ax4.plot(x, z)
ax2.set_ylim([zmin, zmax])
ax4.set_ylim([zmin, zmax])
ax1.set_xlim([x_for_ax1.min(), x_for_ax1.max()])
ax2.set_xlim([x_for_ax1.min(), x_for_ax1.max()])
ax1.set_title('x limited to y and z range')
ax2.set_title('x limited to y and z range')
ax3.set_xlim([x_for_ax3.min(), x_for_ax3.max()])
ax3.set_title('x limited to y range')
ax4.set_xlim([x_for_ax4.min(), x_for_ax4.max()])
ax4.set_title('x limited to z range')
plt.tight_layout(w_pad=1)
plt.show()
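If the masked intermediate values aren't needed for anything else, a plain boolean mask gives the same limits without numpy.ma (a sketch reusing the names above, for the y-only case of ax3):
valid = (y >= ymin) & (y <= ymax)
ax3.set_xlim(x[valid].min(), x[valid].max())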

Probabilistic Record Linkage in Pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
import math
import numpy as np
import pandas as pd

X = pd.DataFrame({'A': ["One", "Two", "Three"]})
Y = pd.DataFrame({'A': ["One", "To", "Free"]})
Method A
I have not yet fully understood the theory but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International journal of epidemiology, 45(3), pp.954-964.
Here is my attempt to implement it in pandas:
# Probability that Matches are True Matches
m = 0.95
# Probability that non-Matches are True non-Matches
u = min(len(X), len(Y)) / (len(X) * len(Y))
# Priors
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr # Prior odds of a match
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])

# Max string length
def Max_string_length(X, Y):
    return max(len(X), len(Y))

M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])

# Agreement weight
def Agreement_weight(D, L):
    return 1 - (D / L)

A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])

# Likelihood ratio
def Likelihood_ratio(C):
    return (m / u) - ((m / u) - ((1 - m) / (1 - u))) * (1 - C)

L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])

# Match weight
def Match_weight(G):
    return math.log(G) * math.log(2)

M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])

# Posterior odds
def Posterior_odds(R):
    return math.exp(R / math.log(2)) * O_Pr

P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])

# Probability
def Probability(O):
    return O / (1 + O)

Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. A sensitivity check on m showed that it doesn't make a lot of difference.
Method B
These assumptions won't apply to all applications, but in some cases each row of X should match a row of Y. In that case:
- the probabilities should sum to 1, and
- if there are many credible candidates to match to, that should reduce the probability of getting the right one.
Then:
X["I"] = X.index
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])

# Max string length
def Max_string_length(X, Y):
    return max(len(X), len(Y))

M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])

# Agreement weight
def Agreement_weight(D, L):
    return 1 - (D / L)

A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Normalised Agreement Weight
T = Z.groupby('I').agg({'C': sum})
D = pd.DataFrame(T)
D.columns = ['T']
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
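As a side note, the same normalisation can be written in one step with groupby().transform, which should give an identical P1 column:
Z['P1'] = Z['C'] / Z.groupby('I')['C'].transform('sum')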
Comparing it against Method A:
Method C
This combines method A with method B:
# Normalised Probability
U = Z.groupby('I').agg({'P': sum})
E = pd.DataFrame(U)
E.columns = ['U']
K = Z.set_index('I').join(E)
K['P1'] = J['P1']
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take uncertainty into account, whereas method C (P2) does.

coloring cell while exporting dataframe to excel

I have a dataframe "df" as below:
import pandas as pd

x = [1, 3, 5, 7]
y1 = [3, 2, 2, 2]
y2 = [2, 5, 2, 2]
y3 = [7, 2, 2, 1]
df = pd.DataFrame({'x': x, 'y1': y1, 'y2': y2, 'y3': y3})
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()
I want the Excel output file to color cells in the other columns that share a value with column x, using the same color as in column x:
You can use Styler if the colors are specified by a dictionary:
def color(a):
    d = {1: 'yellow', 3: 'green', 5: 'blue', 7: 'red'}
    d1 = {k: 'background-color: ' + v for k, v in d.items()}
    # Look up each cell's value; cells without a mapped color get no style
    df1 = a.applymap(d1.get).fillna('')
    return df1

df.style.apply(color, axis=None).to_excel('styled.xlsx', engine='openpyxl', index=False)
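Since the mapping is elementwise, an equivalent is Styler.applymap (renamed Styler.map in newer pandas) — a minimal sketch assuming the same color dictionary:
import pandas as pd

df = pd.DataFrame({'x': [1, 3, 5, 7], 'y1': [3, 2, 2, 2],
                   'y2': [2, 5, 2, 2], 'y3': [7, 2, 2, 1]})
d = {1: 'yellow', 3: 'green', 5: 'blue', 7: 'red'}

# Look up each cell value; an empty string means no style for that cell
styled = df.style.applymap(lambda v: 'background-color: ' + d[v] if v in d else '')
styled.to_excel('styled.xlsx', engine='openpyxl', index=False)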