Probabilistic Record Linkage in Pandas - pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
X = pd.DataFrame({'A': ["One", "Two", "Three"]})
Y = pd.DataFrame({'A': ["One", "To", "Free"]})

Method A
I have not yet fully understood the theory but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International journal of epidemiology, 45(3), pp.954-964.
Here is my attempt to implementat it in Pandas:
# Probability that Matches are True Matches
m = 0.95
# Probability that non-Matches are True non-Matches
u = min(len(X), len(Y)) / (len(X) * len(Y))
# Priors
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr # Prior odds of a match
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
return 1 - ( D / L )
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Likelihood ratio
def Likelihood_ratio(C):
return (m/u) - ((m/u) - ((1-m) / (1-u))) * (1-C)
L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])
# Match weight
def Match_weight(G):
return math.log(G) * math.log(2)
M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])
# Posterior odds
def Posterior_odds(R):
return math.exp( R / math.log(2)) * O_Pr
P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])
# Probability
def Probability(O):
return O / (1 + O)
Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. Here is a sensitivity check on m, showing that it doesn't make a lot of difference:
Method B
These assumptions won't apply to all applications but in some cases each row of X should match a row of Y. In that case:
The probabilities should sum to 1
If there are many credible candidates to match to then that should reduce the probability of getting the right one
then:
X["I"] = X.index
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
return 1 - ( D / L )
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Normalised Agreement Weight
T = Z .groupby('I') .agg({'C' : sum})
D = pd.DataFrame(T)
D.columns = ['T']
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
Comparing it against Method A:
Method C
This combines method A with method B:
# Normalised Probability
U = Z .groupby('I') .agg({'P' : sum})
E = pd.DataFrame(U)
E.columns = ['U']
K = Z.set_index('I').join(E)
K['P1'] = J['P1']
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take account of uncertainty whereas method C (P2) does.

Related

Offset rotation matrix

I'm working with 2 imu's. I need to offset all frames with the first frame from the sensor. I have created a fictive scenario, where I precisely know the rotation and the wanted result. I need the two sensors to show the same result when their initial (start) orientation is subtracted.
import numpy as np
# Sensor 0,1 and 2 start orientation in degrees
s0_x = 30
s0_y = 0
s0_z = 0
s1_x = 0
s1_y = 40
s1_z = 0
s2_x = 10
s2_y = 40
s2_z= -10
# Change from start frame 1
x1 = 20
y1 = 10
z1 = 0
# Change from start frame 2
x2 = 60
y2 = 30
z2 = 30
GCS= [[1,0,0],[0,1,0],[0,0,1]]
sensor0 = [[s0_x, s0_y, s0_z], [s0_x, s0_y, s0_z], [s0_x, s0_y, s0_z]]
sensor1 = [[s1_x, s1_y, s1_z], [s1_x + x1, s1_y + y1, s1_z + z1],[s1_x + x1 + x2, s1_y + y1+ y2, s1_z + z1+ z2]]
sensor2 = [[s2_x, s2_y, s2_z], [s2_x + x1, s2_y + y1, s2_z + z1], [s2_x + x1+ x2, s2_y + y1+ y2, s2_z + z1+ z2]]
def Rot_Mat_X(theta):
r = np.array([[1,0,0],[0,np.cos(np.deg2rad(theta)),-np.sin(np.deg2rad(theta))],[0,np.sin(np.deg2rad(theta)),np.cos(np.deg2rad(theta))]])
return r
# rotation the rotation matrix around the Y axis (input in deg)
def Rot_Mat_Y(theta):
r = np.array([[np.cos(np.deg2rad(theta)),0,np.sin(np.deg2rad(theta))],
[0,1,0],
[-np.sin(np.deg2rad(theta)),0,np.cos(np.deg2rad(theta))]])
return r
# rotation the rotation matrix around the Z axis (input in deg)
def Rot_Mat_Z(theta):
r = np.array([[np.cos(np.deg2rad(theta)),-np.sin(np.deg2rad(theta)),0],
[np.sin(np.deg2rad(theta)),np.cos(np.deg2rad(theta)),0],
[0,0,1]])
return r
# Creating the rotation matrices
r_sensor0 = []
r_sensor1= []
r_sensor2= []
for i in range(len(sensor1)):
r_sensor1_z = np.matmul(Rot_Mat_X(sensor1[i][0]),GCS)
r_sensor1_zy = np.matmul(Rot_Mat_Y(sensor1[i][1]),r_sensor1_z)
r_R_Upperarm_medial_zyx = np.matmul(Rot_Mat_Z(sensor1[i][2]),r_sensor1_zy )
r_sensor1.append(r_R_Upperarm_medial_zyx )
r_sensor2_z = np.matmul(Rot_Mat_X(sensor2[i][0]),GCS)
r_sensor2_zy = np.matmul(Rot_Mat_Y(sensor2[i][1]),r_sensor2_z )
r_sensor2_zyx = np.matmul(Rot_Mat_Z(sensor2[i][2]),r_sensor2_zy )
r_sensor2.append(r_sensor2_zyx )
r_start_sensor1 = r_sensor1[0]
r_start_sensor2 = r_sensor2[0]
r_offset_sensor1 = []
r_offset_sensor2 = []
for i in range(len(sensor0)):
r_offset_sensor1.append(np.matmul(np.transpose(r_start_sensor1),r_sensor1[i]))
r_offset_sensor2.append(np.matmul(np.transpose(r_start_sensor2),r_sensor2[i]))
# result:
r_offset_sensor1[0] = [[1,0,0],[0,1,0],[0,0,1]]
r_offset_sensor1[1] = [[0.984,0.059,0.163],[0,0.939,-0.342],[-0.173,0.336,0.925]]
r_offset_sensor1[2] = [[0.748,0.466,0.471],[0.086,0.635,-0.767],[-0.657,0.615,0.434]]
r_offset_sensor2[0] = [[1,0,0],[0,1,0],[0,0,1]]
r_offset_sensor2[1] = [[0.984,0.086,0.150],[-0.03,0.938,-0.344],[-0.171,0.334,0.926]]
r_offset_sensor2[2] = [[0.748,0.541,0.383],[-0.028,0.603,-0.797],[-0.662,0.585,0.466]]
I expect the result of sensors 1 and 2 to be equal for all frames but it doesn't? And they should be:
frame[0] = [1,0,0],[0,1,0],[0,0,1]
frame[1] = [0.984,0,0.173],[0.059,0.939,-0.336],[-0.163,0.342,0.9254]
frame[2] = [0.750,-0.433,0.50],[0.625,0.216,-0.750],[0.216,0.875,0.433]

Sequential sampling from conditional multivariate normal

I'm trying to sequentially sample from a Gaussian Process prior.
The problem is that the samples eventually converge to zero or diverge to infinity.
I'm using the basic conditionals described e.g. here
Note: the kernel(X,X) function returns the squared exponential kernel with isometric noise.
Here is my code:
n = 32
x_grid = np.linspace(-5,5,n)
x_all = []
y_all = []
for x in x_grid:
x_all = [x] + x_all
X = np.array(x_all).reshape(-1, 1)
# Mean and covariance of the prior
mu = np.zeros((X.shape), np.float)
cov = kernel(X, X)
if len(mu)==1: # first sample is not conditional
y = np.random.randn()*cov + mu
else:
# condition on all previous samples
u1 = mu[0]
u2 = mu[1:]
y2 = np.atleast_2d(np.array(y_all)).T
C11 = cov[:1,:1] # dependent sample
C12 = np.atleast_2d(cov[0,1:])
C21 = np.atleast_2d(cov[1:,0]).T
C22 = np.atleast_2d(cov[1:, 1:])
C22_ = la.inv(C22)
u = u1 + np.dot(C12, np.dot(C22_, (y2 - u2)))
C22_xC21 = np.dot(C22_, C21)
C_minus = np.dot(C12, C22_xC21) # this weirdly becomes larger than C!
C = C11 - C_minus
y = u + np.random.randn()*C
y_all = [y.flatten()[0]] + y_all
Here's an example with 32 samples, where it collapses:
enter image description here
Here's an example with 34 samples, where it explodes:
enter image description here
(for this particular kernel, 34 is the number of samples at which (or more) the samples start to diverge.

Tensorflow: How to update only single variable at a time out of many variables based on conditions

k1 = tf.Variable(10.0)
k2 = tf.Variable(10.0)
pred = tf.pow(B, ?) / C
cost = tf.pow(pred_s1 - Y, 2)
optimizer = tf.train.AdamOptimizer(0.01).minimize(cost)
sess.run(optimizer, feed_dict{A:a, B:b, C:c})
Update:
pred = tf.pow(B, k1) / C if A == 0
pred = tf.pow(B, k2) / C if A == 1
Single prediction function which updates only one variable based on the value fed into placeholder 'A'
s1 = tf.Variable(tf.random_normal([1]))
s2 = tf.Variable(tf.random_normal([1]))
s3 = tf.Variable(tf.random_normal([1]))
s4 = tf.Variable(tf.random_normal([1]))
s5 = tf.Variable(tf.random_normal([1]))
D = tf.placeholder("float")
s2_s = tf.where(tf.logical_and(1.9<D,D<2.1),x=s2,y=s1)
s3_s = tf.where(tf.logical_and(2.9<D,D<3.1),x=s3,y=s2_s)
s4_s = tf.where(tf.logical_and(3.9<D,D<4.1),x=s4,y=s3_s)
s5_s = tf.where(tf.logical_and(4.9<D,D<5.1),x=s5,y=s4_s)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
print(sess.run([s1])[0], sess.run([s2])[0], sess.run([s3])[0], sess.run([s4])[0], sess.run([s5])[0])
print(sess.run(s5_s, feed_dict={D:5}))
sess.close()
Just use
pred = tf.pow(B, A*k2 + (1-A)* k1) / C
Which gives the switch. An alternative would be tf.where.

How to deal with many columns in Tensorflow

I am studying Tensorflow, and I have a question.
Original code is that
Columns = ['size' , 'room', 'price']
x1 = tf.Variable(np.array(columns['size']).astype(np.float32))
x2 = tf.Variable(np.array(columns['room']).astype(np.float32))
y = tf.Variable(np.array(columns['price']).astype(np.float32))enter code here
train_X1 = np.asarray([i[1] for i in data.loc[:,['size']].to_records()],dtype="float")
train_X2 = np.asarray([i[1] for i in data.loc[:,['room']].to_records()],dtype="float")
train_X = np.asarray([i[1] for i in data.loc[:,'size':'room'].to_records()],dtype="float")
train_Y = np.asarray([i[1] for i in data.loc[:,['price']].to_records()],dtype="float")
n_samples = train_X.shape[0]
X1 = tf.placeholder("float")
X2 = tf.placeholder("float")
Y = tf.placeholder("float")
W1 = tf.Variable(rng.randn(), name="weight1")
W2 = tf.Variable(rng.randn(), name="weight2")
b = tf.Variable(rng.randn(), name="bias")
sum_list = [tf.multiply(X1,W1),tf.multiply(X2,W2)]
pred_X = tf.add_n(sum_list)
pred = tf.add(pred_X,b)
cost = tf.reduce_sum(tf.pow(pred-Y, 2))/(2*n_samples)
If I have many columns like that
Columns = ['price','lotsize','bedrooms','bathrms', 'stories', 'garagepl', 'driveway', 'recroom', \
'fullbase', 'gashw', 'airco', 'prefarea']
How do i deal with many columns in Tensorflow?
(Independent variable = 'price', dependent variable = else)
Do I have to make each train_set and W with columns?

NameError when running GMRes following FEniCS discretisation

I've discretised a diffusion equation with FEniCS as follows:
def DiscretiseEquation(h):
mesh = UnitSquareMesh(h, h)
V = FunctionSpace(mesh, 'Lagrange', 1)
def on_boundary(x, on_boundary):
return on_boundary
bc_value = Constant(0.0)
boundary_condition = DirichletBC(V, bc_value, on_boundary)
class RandomDiffusionField(Expression):
def __init__(self, m, n, element):
self._rand_field = np.exp(-np.random.randn(m, n))
self._m = m
self._n = n
self._ufl_element = element
def eval(self, value, x):
x_index = np.int(np.floor(self._m * x[0]))
y_index = np.int(np.floor(self._n * x[1]))
i = min(x_index, self._m - 1)
j = min(y_index, self._n - 1)
value[0] = self._rand_field[i, j]
def value_shape(self):
return(1, )
class RandomRhs(Expression):
def __init__(self, m, n, element):
self._rand_field = np.random.randn(m, n)
self._m = m
self._n = n
self._ufl_element = element
def eval(self, value, x):
x_index = np.int(np.floor(self._m * x[0]))
y_index = np.int(np.floor(self._n * x[1]))
i = min(x_index, self._m - 1)
j = min(y_index, self._n - 1)
value[0] = self._rand_field[i, j]
def value_shape(self):
return (1, )
u = TrialFunction(V)
v = TestFunction(V)
random_field = RandomDiffusionField(100, 100, element=V.ufl_element())
zero = Expression("0", element=V.ufl_element())
one = Expression("1", element=V.ufl_element())
diffusion = as_matrix(((random_field, zero), (zero, one)))
a = inner(diffusion * grad(u), grad(v)) * dx
L = RandomRhs(h, h, element=V.ufl_element()) * v * dx
A = assemble(a)
b = assemble(L)
boundary_condition.apply(A, b)
A = as_backend_type(A).mat()
(indptr, indices, data) = A.getValuesCSR()
mat = csr_matrix((data, indices, indptr), shape=A.size)
rhs = b.array()
#Solving
x = spsolve(mat, rhs)
#Conversion to a FEniCS function
u = Function(V)
u.vector()[:] = x
I am running the GMRES solver as normal. The callback argument is a separate iteration counter I've defined.
DiscretiseEquation(100)
A = mat
b = rhs
x, info = gmres(A, b, callback = IterCount())
The routine returns a NameError, stating that 'mat' is not defined:
NameError Traceback (most recent call last)
<ipython-input-18-e096b2eea097> in <module>()
1 DiscretiseEquation(200)
----> 2 A = mat
3 b = rhs
4 x_200, info_200 = gmres(A, b, callback = IterCount())
5 gmres_res = closure_variables["residuals"]
NameError: name 'mat' is not defined
As far as I'm aware, it should be defined when I call the DiscretiseEquation function?