Logistic regression with custom dataset - numpy

Following the deep learning course on Coursera, I've implemented logistic regression:
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of *z*.

    Works on Python scalars and, element-wise, on NumPy arrays.
    """
    return 1 / (1 + np.exp(-z))
def initialize_with_zeros(dim):
    """Return zero-initialised parameters ``(w, b)``.

    ``w`` is a ``(dim, 1)`` array of zeros and ``b`` is the scalar 0.
    """
    w = np.zeros((dim, 1))
    b = 0
    return w, b
def propagate(w, b, X, Y):
    """Compute the logistic-regression cost and its gradients.

    The number of examples is read from ``X.shape[1]`` and the activation is
    ``sigmoid(w.T @ X + b)``, so ``X`` must be laid out as
    (features, examples) and ``w`` as (features, 1).  ``Y`` has to broadcast
    against the resulting (1, examples) activation row.

    Returns:
        ``(grads, cost)`` where ``grads`` is ``{"dw": ..., "db": ...}`` and
        ``cost`` is the cross-entropy averaged over the examples.
    """
    m = X.shape[1]
    A = sigmoid(np.dot(w.T, X) + b)  # activations, one per column of X
    cost = (-1 / m) * np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A))
    error = A - Y
    dw = (1 / m) * np.dot(X, error.T)
    db = (1 / m) * np.sum(error)
    cost = np.squeeze(cost)
    grads = {"dw": dw, "db": db}
    return grads, cost
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """Run plain gradient descent on ``w`` and ``b``.

    Each iteration calls ``propagate`` and moves the parameters against the
    gradient scaled by ``learning_rate``.

    Returns:
        ``(params, grads, costs)``: the final ``{"w", "b"}``, the gradients of
        the last iteration (``None`` entries when ``num_iterations`` is 0) and
        the cost recorded every 100th iteration.
    """
    costs = []
    # Bound up front so the final dict can be built even if the loop never runs.
    dw = db = None
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        dw = grads["dw"]
        db = grads["db"]
        w = w - learning_rate * dw
        b = b - learning_rate * db
        # Record (and optionally report) the cost every 100 iterations.
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print("Cost after iteration %i: %f" % (i, cost))
    params = {"w": w, "b": b}
    grads = {"dw": dw, "db": db}
    return params, grads, costs
def predict(w, b, X):
    """Predict a 0/1 label for every column of ``X``.

    ``w`` is reshaped to ``(X.shape[0], 1)``; a label is 1 where the sigmoid
    activation is strictly greater than 0.5 and 0 otherwise.

    Returns:
        A float array of shape ``(1, X.shape[1])``.
    """
    m = X.shape[1]
    w = w.reshape(X.shape[0], 1)
    A = sigmoid(np.dot(w.T, X) + b)
    # Threshold all probabilities at once instead of looping per example
    # (the old loop also printed the full activation matrix on every pass).
    Y_prediction = (A > 0.5).astype(float)
    assert Y_prediction.shape == (1, m)
    return Y_prediction
# Quick checks of the building blocks on tiny hand-made inputs.
print("sigmoid(0) = " + str(sigmoid(0)))
print("sigmoid(9.2) = " + str(sigmoid(9.2)))

dim = 2
w, b = initialize_with_zeros(dim)
print("w = " + str(w))
print("b = " + str(b))

# Two features, two examples (one example per column of X).
w = np.array([[1], [2]])
b = 2
X = np.array([[-1, -2], [3, 4]])
Y = np.array([[1, 0]])

grads, cost = propagate(w, b, X, Y)
print("dw = " + str(grads["dw"]))
print("db = " + str(grads["db"]))
print("cost = " + str(cost))

params, grads, costs = optimize(
    w, b, X, Y, num_iterations=10000, learning_rate=0.01, print_cost=False
)
print("w = " + str(params["w"]))
print("b = " + str(params["b"]))
print("dw = " + str(grads["dw"]))
print("db = " + str(grads["db"]))

# Uses the hand-made w and b from above, not the optimised params.
print("predictions = " + str(predict(w, b, X)))
def model(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
    """Train logistic regression on the training set and evaluate it.

    Weights are sized from ``X_train.shape[0]``, so the inputs are expected
    as (features, examples), matching ``propagate`` and ``predict``.
    Train and test accuracy are printed as percentages.

    Returns:
        A dict with the recorded costs, the train/test predictions, the
        learned ``w`` and ``b``, and the hyper-parameters used.
    """
    w, b = initialize_with_zeros(X_train.shape[0])
    parameters, grads, costs = optimize(
        w, b, X_train, Y_train, num_iterations, learning_rate, print_cost
    )
    w = parameters["w"]
    b = parameters["b"]

    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # Accuracy is 100 minus the mean absolute 0/1 error in percent.
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    d = {
        "costs": costs,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }
    return d
I'm attempting to use a generic dataset that contains 5 samples, where each sample contains 4 elements:
# NOTE(review): these arrays are shaped (5, 4), i.e. one sample per row, and the
# labels are 1-D with shape (5,).  propagate() reads m = X.shape[1] and computes
# np.dot(w.T, X), so it treats each *column* as a sample; with this layout the
# activation has shape (1, 4) and cannot be combined with the 5 labels.
# Passing the transposed inputs (shape (4, 5)) would presumably match - confirm.
train_set_x = np.array([[1,2,3,4],[4,3,2,1],[1,2,3,4],[4,3,2,1],[1,2,3,4]])
train_set_y = np.array([1,0,1,0,1])
test_set_x = np.array([[1,2,3,4],[4,3,2,1],[1,2,3,4],[4,3,2,1],[1,2,3,4]])
test_set_y = np.array([1,0,1,0,1])
# Bare expression: only has an effect as the last line of a notebook cell.
train_set_x , train_set_y , test_set_x , test_set_y
d = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations = 2000, learning_rate = 0.005, print_cost = True)
But the following error is thrown :
<ipython-input-409-bd4e233a8f4e> in propagate(w, b, X, Y)
18
19 A = sigmoid(np.dot(w.T, X) + b) # compute activation
---> 20 cost = (- 1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A))) # compute cost
21
22 dw = (1 / m) * np.dot(X, (A - Y).T)
ValueError: operands could not be broadcast together with shapes (5,) (1,4)
Do I need to change the weight dimensions in order to compute the cost value?
Update :
Using modification :
A = sigmoid(np.dot(X , w) + b) # compute activation
causes error :
<ipython-input-546-7a7980550834> in propagate(w, b, X, Y)
20 m = X.shape[1]
21
---> 22 A = sigmoid(np.dot(X , w) + b) # compute activation
23 print('w.T' , w.T , 'w' , w, 'X' , X , 'Y' , Y , 'A' , A)
24 cost = (- 1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A))) # compute cost
ValueError: shapes (5,4) and (5,1) not aligned: 4 (dim 1) != 5 (dim 0)

Related

python function as cvxpy parameter for dynamic optimization (optimal control)

import numpy as np
def af(a,b):
    """Return the 2x2 array ``[[a, b], [b**2, b]]`` built from *a* and *b*."""
    return np.array([[a, b], [b**2, b]])
# Finite-horizon control problem: n states, m inputs, T time steps.
np.random.seed(1)
n = 2
m = 2
T = 50
alpha = 0.2
beta = 3
# A = np.eye(n) - alpha * np.random.rand(n, n)
B = np.random.randn(n, m)
x_0 = beta * np.random.randn(n)
import cvxpy as cp
x = cp.Variable((n, T + 1))
u = cp.Variable((m, T))
# This Parameter is replaced by the result of af() inside the loop below.
A = cp.Parameter((2,2))
cost = 0
constr = []
for t in range(T):
    cost += cp.sum_squares(x[:, t + 1]) + cp.sum_squares(u[:, t])
    # NOTE(review): A is rebuilt from the decision variable x itself, so A @ x
    # multiplies expressions that both depend on x; cvxpy presumably rejects
    # this as not DCP-compliant - confirm.
    A = af(*x[:,t])
    # Dynamics constraint uses matrix multiplication (@) plus an input bound.
    constr += [x[:, t + 1] == A @ x[:, t] + B @ u[:, t], cp.norm(u[:, t], "inf") <= 1]
# sums problem objectives and concatenates constraints.
constr += [x[:, T] == 0, x[:, 0] == x_0]
problem = cp.Problem(cp.Minimize(cost), constr)
problem.solve()
I want to use a Python function (a lambdified function) as a cvxpy parameter. I tried the method above; please let me know whether cvxpy supports a Python function as a parameter. Thank you.

Calculating gradients of a custom loss function with GradientTape

I am trying custom training of the network using the GradientTape method.
This training is unsupervised.
The details of the network and cost function are as follows.
My network is:
def CreateNetwork(inplayer, hidlayer, outlayer,seed):
    """Build a two-layer Keras ``Sequential`` model.

    The hidden layer has ``hidlayer`` tanh units on ``inplayer`` inputs, with
    kernel and bias drawn from a normal distribution of standard deviation
    ``1/sqrt(inplayer)``.  The output layer has ``outlayer`` linear units,
    kernel standard deviation ``1/sqrt(hidlayer)`` and zero bias.  Every
    random initializer receives the same ``seed``.
    """
    model = keras.Sequential()
    model.add(Dense(
        hidlayer,
        input_dim=inplayer,
        kernel_initializer=initializers.RandomNormal(mean=0.0,stddev=1/np.sqrt(inplayer),seed=seed),
        bias_initializer=initializers.RandomNormal(mean=0.0,stddev=1/np.sqrt(inplayer),seed=seed),
        activation='tanh',
    ))
    model.add(Dense(
        outlayer,
        kernel_initializer=initializers.RandomNormal(mean=0.0,stddev=1/np.sqrt(hidlayer),seed=seed),
        bias_initializer=initializers.Zeros(),
        activation='linear',
    ))
    return model
and my custom cost function is defined as,
def H_tilda(J,U,nsamples,nsites,configs,out_matrix):
    """Return the sample-averaged cost for the given configurations.

    For every sample ``k`` the function sums ``(hop + ons) * phi(...)`` over
    all samples ``i``, divides that sum by ``phi`` of sample ``k``'s network
    output and finally averages over ``k``; the real part is returned.

    * ``hop``: for each site ``j`` and each of its two ring neighbours ``src``
      with ``n[src] != 0``, adds ``sqrt(n[j] + 1) * sqrt(n[src])`` when moving
      one unit from ``src`` to ``j`` turns ``configs[i]`` into ``configs[k]``.
    * ``ons``: ``np.sum(np.dot(np.square(n), n - 1))`` when ``configs[i]``
      equals ``configs[k]``, otherwise 0.

    NOTE(review): ``J`` and ``U`` are accepted but never used in the body.
    NOTE(review): the network outputs are converted with ``.numpy()`` and all
    arithmetic is NumPy, so the result is presumably not connected to a
    ``tf.GradientTape`` - confirm before differentiating through it.
    """
    EigenValue = 0.0
    for k in range(nsamples):
        config = configs[k,:]
        out_n = out_matrix[k,:]
        local_sum = 0.0
        for i in range(nsamples):
            n = configs[i,:]
            out_nprime = out_matrix[i,:]
            # ---- Hopping term -------------------------------------------
            hop = 0.0
            for j in range(nsites):
                # Left and right neighbour of site j on a periodic chain.
                # (Kept in its own name so the sample index k is not reused.)
                if j == 0:
                    neighbours = (nsites - 1, j + 1)
                elif j == (nsites - 1):
                    neighbours = (j - 1, 0)
                else:
                    neighbours = (j - 1, j + 1)
                annihilate = 0.0
                for src in neighbours:
                    if n[src] != 0:
                        moved = np.copy(n)
                        moved[src] = moved[src] - 1
                        moved[j] = moved[j] + 1
                        # Contributes only if the move produces sample k.
                        if (config == moved).all():
                            annihilate = annihilate + np.sqrt(n[src])
                create = np.sqrt(n[j] + 1)
                hop = hop + create * annihilate
            # ---- Onsite term --------------------------------------------
            if (config == n).all():
                ons = np.sum(np.dot(np.square(n), n - 1))
            else:
                ons = 0.0
            phi_value = phi(out_nprime.numpy())
            local_sum = local_sum + ((hop + ons) * phi_value)
        Phi_value = phi(out_n.numpy())
        EigenValue = EigenValue + local_sum/Phi_value
    return np.real(EigenValue/nsamples)
I want to do custom training using the GradientTape method, for which I used the following lines:
optimizer = optimizers.SGD(learning_rate=1e-3)
with tf.GradientTape(watch_accessed_variables=False) as tape:
    # NOTE(review): automatic watching is switched off and the only tensor
    # watched is a fresh conversion of configs; model.trainable_weights are
    # never watched, which presumably explains the None gradients - confirm.
    tape.watch(tf.convert_to_tensor(configs))
    out_matrix = model(configs)
    print(out_matrix)
    # NOTE(review): H_tilda works on .numpy() values with NumPy arithmetic, so
    # eival is presumably not linked to the recorded operations - confirm.
    eival = H_tilda(J,U,nsamples,nsites,configs,out_matrix)
    print(eival)
gradients = tape.gradient(tf.convert_to_tensor(eival), model.trainable_weights)
print(gradients)
But the gradients I am getting are all None:
output: [None, None, None, None]

SQL syntax query order by

-- Goal stated with the question: per TCID and calendar day, the run with the
-- latest start time.
-- NOTE: START_TIME and RESULT are neither aggregated nor part of GROUP BY, so
-- the values returned for them need not come from the row that holds max_time.
-- NOTE(review): the filter uses TCID 'A330506' while the sample rows shown
-- below use 'A435727' - confirm which id is meant.
SELECT TCID, START_TIME, RESULT,
cast(START_TIME as date) as m_date,
max(cast(START_TIME as time)) as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID = 'A330506'
GROUP BY TCID, m_date;
This is my data:
ID TCID START_DATE RESULT
1545240 A435727 2020-11-08 03:11:43 PASS
1545334 A435727 2020-11-08 03:19:53 PASS
1547439 A435727 2020-11-09 03:11:52 PASS
1547621 A435727 2020-11-09 03:20:05 PASS
1548388 A435727 2020-11-09 07:51:29 PASS
1558801 A435727 2020-11-12 00:11:10 PASS
1561899 A435727 2020-11-12 08:48:59 PASS
For each TCID I want to get the latest result of each date, like this:
ID TCID START_DATE RESULT
1545334 A435727 2020-11-08 03:19:53 PASS
1548388 A435727 2020-11-09 07:51:29 PASS
1561899 A435727 2020-11-12 08:48:59 PASS
But the current result looks like this:
1545240 A435727 2020-11-08 03:11:43 PASS 2020-11-08 03:19:53
1547439 A435727 2020-11-09 03:11:52 PASS 2020-11-09 07:51:29
1558801 A435727 2020-11-12 00:11:10 PASS 2020-11-12 08:48:59
def connect_cli_server(self):
    """Open the SSH connection, retrying and then falling back.

    Creates ``self.ssh_client`` if it is ``None``, then tries up to
    ``self.retry_cnt`` times to connect to ``self.ip`` on port 22, sleeping a
    random interval after each failure.  If no attempt succeeds,
    ``self.connect_cli_server_thru_remote_server()`` is tried once.

    Returns:
        ``True`` when either path succeeded, ``False`` when the fallback
        raised (the error is printed).
    """
    connect_success = False
    if self.ssh_client is None:
        self.ssh_client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts unknown host keys without
        # verification - confirm this is acceptable for the target hosts.
        self.ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy)
    for cnt in range(self.retry_cnt):
        try:
            self.ssh_client.connect(self.ip, 22, self.id, self.pw, timeout=self.time_out,
                                    banner_timeout=self.banner)
        except Exception:
            # Exactly one back-off sleep per failure; the window widens with
            # the attempt number.  (Previously attempts below 10 slept twice.)
            if cnt < 10:
                low, high = 0.1, 0.3
            elif cnt < 20:
                low, high = 0.1, 1
            else:
                low, high = 0.5, 1.5
            time.sleep(random.uniform(low, high))
        else:
            connect_success = True
            break
    if not connect_success:
        try:
            self.connect_cli_server_thru_remote_server()
        except Exception as error:
            print(error)
            return False
    return True
def send_command(self, ssh_client, command):
    """Run *command* on a new session channel and return its output as text.

    A pseudo-terminal is requested on the channel, the command is executed,
    and everything read from the channel's file object is decoded as UTF-8.
    The channel is closed afterwards, also when an error occurs.
    """
    chan = ssh_client.get_transport().open_session()
    try:
        chan.get_pty()
        fileobject = chan.makefile()
        chan.exec_command(command)
        byteoutput = fileobject.read()
    finally:
        chan.close()
    return byteoutput.decode("UTF-8")
I created some sample input by sql command
-- Sample table: one row per test run (id), test case (tc), start time, outcome.
create table tbl_mock (
    id        int,
    tc        int,
    startdate datetime,
    result    varchar(20)
);

-- Three runs of test case 1 on each of two consecutive days.
insert into tbl_mock (id, tc, startdate, result)
values
    (1, 1, '2020/11/12 09:00:00', 'pass'),
    (2, 1, '2020/11/12 10:00:00', 'fail'),
    (3, 1, '2020/11/12 11:00:00', 'pass'),
    (4, 1, '2020/11/13 09:00:00', 'pass'),
    (5, 1, '2020/11/13 10:00:00', 'fail'),
    (6, 1, '2020/11/13 11:00:00', 'fail');
You can try the below sql command to get your result
-- For every test case and calendar day keep only the row whose start time is
-- the latest of that day: compute the per-day maximum time in a derived table
-- and join it back to the full rows.
select tbl_a.*
from tbl_mock as tbl_a
inner join (
    select tc,
           cast(startdate as date) as m_date,
           max(cast(startdate as time)) as m_time
    from tbl_mock
    group by tc, m_date
) as tbl_b
    on tbl_a.tc = tbl_b.tc
   and tbl_a.startdate = timestamp(tbl_b.m_date, tbl_b.m_time)
You can try this:
-- Groups by every selected non-aggregated column.
-- NOTE: because START_TIME itself is part of the GROUP BY, each distinct
-- (TCID, START_TIME, RESULT) forms its own group, so max_time is simply the
-- time-of-day of that row; rows are not reduced to one per day.
SELECT TCID
,START_TIME
,RESULT
,cast(START_TIME as date) as m_date
,max(cast(START_TIME as time)) as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID='A330506'
GROUP BY TCID
,START_TIME
,RESULT
,cast(START_TIME as date)
ORDER BY TCID
,m_date;
which should be the same as this:
-- Window-function variant.
-- NOTE: OVER() has no PARTITION BY, so max_time is the latest time-of-day
-- across all rows that pass the WHERE filter, repeated on every row.
SELECT DISTINCT TCID
,START_TIME
,RESULT
,cast(START_TIME as date) as m_date
,max(cast(START_TIME as time)) OVER() as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID='A330506'
ORDER BY TCID
,m_date;
or if you need to get the MAX value per TCID:
-- Same as the OVER() variant, but the maximum is taken separately for each
-- TCID (PARTITION BY TCID).  The partition does not include the date, so the
-- maximum is per TCID, not per TCID and day.
SELECT DISTINCT TCID
,START_TIME
,RESULT
,cast(START_TIME as date) as m_date
,max(cast(START_TIME as time)) OVER(PARTITION BY TCID) as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID='A330506'
ORDER BY TCID
,m_date;
import pygame
import os
import numpy as np
import queue
pygame.init()
# The board is q x q cells (see the grid and window size below).
q = 8
# Side length of one cell in pixels.
w = 70
def normalize_image(img):
    """Return *img* scaled to ``(w - 10, w - 10)`` pixels.

    The result is 10 pixels smaller than a cell in each dimension; draw_img
    offsets it by 5 pixels, which leaves a margin on every side.
    """
    return pygame.transform.scale(img, (w - 10, w - 10))
# Sprites: each image is loaded from the working directory and scaled to fit
# inside one cell.
agent = normalize_image(pygame.image.load(os.path.join('EmptyAgent.png')))
wumpus = normalize_image(pygame.image.load(os.path.join('Wumpus.png')))
gold = normalize_image(pygame.image.load(os.path.join('Gold.png')))
pit = normalize_image(pygame.image.load(os.path.join('Pit.png')))

# Codes stored in grid cells.
gr_empty, gr_wumpus, gr_pit, gr_gold, gr_agent = 0, 1, 2, 3, 4

clock = pygame.time.Clock()
screen = pygame.display.set_mode((q * w, q * w))

# The agent starts in the last row, first column.
xh, yh = q - 1, 0
grid = [[gr_empty for _ in range(q)] for _ in range(q)]
grid[xh][yh] = gr_agent

bg_color = (255, 255, 255)
border_color = (0, 0, 0)

# Row/column offsets for the four headings 0..3.
dx = [0, 1, 0, -1]
dy = [1, 0, -1, 0]
def bfs():
    """Breadth-first search for a route start -> gold -> start on ``grid``.

    A search state is ``(x, y, state)``.  States 0-3 are the four headings
    (indices into ``dx``/``dy``) before the gold is picked up; states 4-7 are
    the same headings after the pick-up.  Turning left or right, stepping
    forward and picking up the gold each cost one time unit.  Cells holding a
    wumpus or a pit are never expanded.

    Returns:
        ``(score, gx, gy, gd, tracex, tracey, traced)``.  ``score`` is
        ``1000 - (t + 1) * 10`` for the first arrival back at the start cell
        with the gold, or -1 if no such route exists.  ``(gx, gy, gd)`` is the
        state in which the gold was first reached.  The trace arrays hold, for
        every visited state, the predecessor's x, y and state.
    """
    vs = np.zeros((q, q, 8))
    tracex = np.zeros((q, q, 8))
    tracey = np.zeros((q, q, 8))
    traced = np.zeros((q, q, 8))
    # NOTE(review): looks like a debugging leftover; kept so the returned
    # array is unchanged.
    tracex[0][0][0] = 2
    que = queue.Queue()
    que.put((q - 1, 0, 0, 0))
    vs[q - 1][0][0] = 1
    gx, gy, gd, gt = 0, 0, 0, 1000

    def visit(nx, ny, nstate, px, py, pstate, nt):
        """Enqueue an unvisited state and remember its predecessor."""
        if vs[nx][ny][nstate] == 0:
            que.put((nx, ny, nstate, nt))
            vs[nx][ny][nstate] = 1
            tracex[nx][ny][nstate] = px
            tracey[nx][ny][nstate] = py
            traced[nx][ny][nstate] = pstate

    while not que.empty():
        x, y, state, t = que.get()
        if grid[x][y] == gr_gold and state < 4:
            # Standing on the gold without carrying it: pick it up.
            if gt > t:
                gx, gy, gd, gt = x, y, state, t
            visit(x, y, state + 4, x, y, state, t + 1)
            continue
        if x == q - 1 and y == 0 and state > 3:
            return 1000 - (t + 1) * 10, gx, gy, gd, tracex, tracey, traced
        # No bounds check is needed here: only in-bounds cells are enqueued.
        if grid[x][y] == gr_wumpus or grid[x][y] == gr_pit:
            continue
        phase = 4 if state > 3 else 0
        heading = state - phase
        # The predecessor is stored as the full state (heading + phase) so
        # back-pointers recorded after the pick-up stay in that phase.
        visit(x, y, (heading + 1) % 4 + phase, x, y, state, t + 1)
        visit(x, y, (heading - 1) % 4 + phase, x, y, state, t + 1)
        nx, ny = x + dx[heading], y + dy[heading]
        if 0 <= nx < q and 0 <= ny < q:
            visit(nx, ny, state, x, y, state, t + 1)
    return -1, gx, gy, gd, tracex, tracey, traced
def test():
    """Place a fixed wumpus, gold and pit, run ``bfs`` and print the result.

    Prints the score (-1 when there is no route) and, if a route exists, the
    states from the gold back to the start cell, following the trace arrays.
    """
    grid[q - 2][1] = gr_wumpus
    grid[q - 3][2] = gr_gold
    grid[q - 3][0] = gr_pit
    ans, gx, gy, gd, tracex, tracey, traced = bfs()
    if ans < 0:
        ans = -1
    print(ans)
    tx, ty, td = gx, gy, gd
    if ans >= 0:
        while True:
            print(tx, ty, td)
            if tx == q - 1 and ty == 0:
                break
            # The trace arrays are float arrays, hence the int() conversion.
            tx, ty, td = (
                int(tracex[tx][ty][td]),
                int(tracey[tx][ty][td]),
                int(traced[tx][ty][td]),
            )
    # print(tracex[0][0][0])
def draw_img(img, x, y):
    """Blit *img* onto the screen, offset by 5 pixels from ``(x, y)``."""
    screen.blit(img, (x + 5, y + 5))
def draw():
    """Redraw the whole board: background, sprites and cell borders.

    Grid row ``i`` is drawn at ``y = i * w`` and column ``j`` at ``x = j * w``.
    """
    sprites = {gr_agent: agent, gr_wumpus: wumpus, gr_gold: gold, gr_pit: pit}
    screen.fill(bg_color)
    for i in range(q):
        y = i * w
        for j in range(q):
            x = j * w
            sprite = sprites.get(grid[i][j])
            if sprite is not None:
                draw_img(sprite, x, y)
            # Four 1-pixel outlines, each shifted by one more pixel up-left.
            for k in range(4):
                pygame.draw.rect(screen, border_color, (x - k, y - k, w, w), 1)
def main():
    """Run the fixed search once, then show the board until the window closes.

    Clicking an empty cell places a wumpus (left button), a pit (middle
    button) or gold (right button); clicks on occupied cells are ignored.
    """
    test()
    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                return
            if event.type == pygame.MOUSEBUTTONDOWN:
                mx, my = event.pos
                # Screen x selects the column, screen y the row.
                gy = mx // w
                gx = my // w
                if grid[gx][gy] != gr_empty:
                    continue
                if event.button == 1:
                    # Left click
                    grid[gx][gy] = gr_wumpus
                elif event.button == 3:
                    # Right click
                    grid[gx][gy] = gr_gold
                elif event.button == 2:
                    # Middle click
                    grid[gx][gy] = gr_pit
        draw()
        pygame.display.flip()
        clock.tick(600)


if __name__ == "__main__":
    main()
import pygame
import os
import numpy as np
import queue
pygame.init()
# The board is q x q cells (see the grid and window size below).
q = 3
# Side length of one cell in pixels.
w = 70
def normalize_image(img):
    """Return *img* scaled to ``(w - 10, w - 10)`` pixels.

    The result is 10 pixels smaller than a cell in each dimension; draw_img
    offsets it by 5 pixels, which leaves a margin on every side.
    """
    return pygame.transform.scale(img, (w - 10, w - 10))
# Sprites: each image is loaded from the working directory and scaled to fit
# inside one cell.
agent = normalize_image(pygame.image.load(os.path.join('EmptyAgent.png')))
wumpus = normalize_image(pygame.image.load(os.path.join('Wumpus.png')))
gold = normalize_image(pygame.image.load(os.path.join('Gold.png')))
pit = normalize_image(pygame.image.load(os.path.join('Pit.png')))

# Codes stored in grid cells.
gr_empty, gr_wumpus, gr_pit, gr_gold, gr_agent = 0, 1, 2, 3, 4

clock = pygame.time.Clock()
screen = pygame.display.set_mode((q * w, q * w))

# The agent starts in the last row, first column.
xh, yh = q - 1, 0
grid = [[gr_empty for _ in range(q)] for _ in range(q)]
grid[xh][yh] = gr_agent

bg_color = (255, 255, 255)
border_color = (0, 0, 0)

# Row/column offsets for the four headings 0..3.
dx = [0, 1, 0, -1]
dy = [1, 0, -1, 0]
def bfs():
    """Breadth-first search for a route start -> gold -> start on ``grid``.

    A search state is ``(x, y, state)``.  States 0-3 are the four headings
    (indices into ``dx``/``dy``) before the gold is picked up; states 4-7 are
    the same headings after the pick-up.  Turning left or right, stepping
    forward and picking up the gold each cost one time unit.  Cells holding a
    wumpus or a pit are never expanded.

    Returns:
        ``(score, gx, gy, gd, tracex, tracey, traced)``.  On success ``score``
        is ``1000 - (t + 1) * 10`` and ``(gx, gy, gd)`` is the final state
        back at the start cell with the gold.  If no route exists ``score``
        is -1 and ``(gx, gy, gd)`` is the state in which the gold was first
        reached (or zeros).  The trace arrays hold, for every visited state,
        the predecessor's x, y and full state.
    """
    vs = np.zeros((q, q, 8))
    tracex = np.zeros((q, q, 8))
    tracey = np.zeros((q, q, 8))
    traced = np.zeros((q, q, 8))
    # NOTE(review): looks like a debugging leftover; kept so the returned
    # array is unchanged.
    tracex[0][0][0] = 2
    que = queue.Queue()
    que.put((q - 1, 0, 0, 0))
    vs[q - 1][0][0] = 1
    gx, gy, gd, gt = 0, 0, 0, 1000

    def visit(nx, ny, nstate, px, py, pstate, nt):
        """Enqueue an unvisited state and remember its predecessor."""
        if vs[nx][ny][nstate] == 0:
            que.put((nx, ny, nstate, nt))
            vs[nx][ny][nstate] = 1
            tracex[nx][ny][nstate] = px
            tracey[nx][ny][nstate] = py
            traced[nx][ny][nstate] = pstate

    while not que.empty():
        x, y, state, t = que.get()
        if grid[x][y] == gr_gold and state < 4:
            # Standing on the gold without carrying it: pick it up.
            if gt > t:
                gx, gy, gd, gt = x, y, state, t
            visit(x, y, state + 4, x, y, state, t + 1)
            continue
        if x == q - 1 and y == 0 and state > 3:
            # Back home with the gold: report this final state so the caller
            # can walk the whole route backwards.
            gx, gy, gd, gt = x, y, state, t
            return 1000 - (t + 1) * 10, gx, gy, gd, tracex, tracey, traced
        # No bounds check is needed here: only in-bounds cells are enqueued.
        if grid[x][y] == gr_wumpus or grid[x][y] == gr_pit:
            continue
        phase = 4 if state > 3 else 0
        heading = state - phase
        visit(x, y, (heading + 1) % 4 + phase, x, y, state, t + 1)
        visit(x, y, (heading - 1) % 4 + phase, x, y, state, t + 1)
        nx, ny = x + dx[heading], y + dy[heading]
        if 0 <= nx < q and 0 <= ny < q:
            visit(nx, ny, state, x, y, state, t + 1)
    return -1, gx, gy, gd, tracex, tracey, traced
def test():
    """Place a fixed wumpus, gold and pit, run ``bfs`` and print the result.

    Prints the score (-1 when there is no route) and, if a route exists, every
    state from the final one back to the initial state ``(q - 1, 0, 0)``,
    following the trace arrays.
    """
    grid[q - 2][1] = gr_wumpus
    grid[q - 3][2] = gr_gold
    grid[q - 3][0] = gr_pit
    ans, gx, gy, gd, tracex, tracey, traced = bfs()
    if ans < 0:
        ans = -1
    print(ans)
    tx, ty, td = gx, gy, gd
    if ans >= 0:
        while True:
            print(tx, ty, td)
            # td == 0 tells the initial state apart from later visits of the
            # start cell with another heading or with the gold.
            if tx == q - 1 and ty == 0 and td == 0:
                break
            # The trace arrays are float arrays, hence the int() conversion.
            tx, ty, td = (
                int(tracex[tx][ty][td]),
                int(tracey[tx][ty][td]),
                int(traced[tx][ty][td]),
            )
    # print(tracex[0][0][0])
def draw_img(img, x, y):
    """Blit *img* onto the screen, offset by 5 pixels from ``(x, y)``."""
    screen.blit(img, (x + 5, y + 5))
def draw():
    """Redraw the whole board: background, sprites and cell borders.

    Grid row ``i`` is drawn at ``y = i * w`` and column ``j`` at ``x = j * w``.
    """
    sprites = {gr_agent: agent, gr_wumpus: wumpus, gr_gold: gold, gr_pit: pit}
    screen.fill(bg_color)
    for i in range(q):
        y = i * w
        for j in range(q):
            x = j * w
            sprite = sprites.get(grid[i][j])
            if sprite is not None:
                draw_img(sprite, x, y)
            # Four 1-pixel outlines, each shifted by one more pixel up-left.
            for k in range(4):
                pygame.draw.rect(screen, border_color, (x - k, y - k, w, w), 1)
def main():
    """Run the fixed search once, then show the board until the window closes.

    Clicking an empty cell places a wumpus (left button), a pit (middle
    button) or gold (right button); clicks on occupied cells are ignored.
    """
    test()
    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                return
            if event.type == pygame.MOUSEBUTTONDOWN:
                mx, my = event.pos
                # Screen x selects the column, screen y the row.
                gy = mx // w
                gx = my // w
                if grid[gx][gy] != gr_empty:
                    continue
                if event.button == 1:
                    # Left click
                    grid[gx][gy] = gr_wumpus
                elif event.button == 3:
                    # Right click
                    grid[gx][gy] = gr_gold
                elif event.button == 2:
                    # Middle click
                    grid[gx][gy] = gr_pit
        draw()
        pygame.display.flip()
        clock.tick(600)


if __name__ == "__main__":
    main()

Probabilistic Record Linkage in Pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
# Two toy frames to link; both have a single string column 'A'.
X = pd.DataFrame({'A': ["One", "Two", "Three"]})
Y = pd.DataFrame({'A': ["One", "To", "Free"]})
Method A
I have not yet fully understood the theory but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International journal of epidemiology, 45(3), pp.954-964.
Here is my attempt to implement it in Pandas:
# Probability that Matches are True Matches
m = 0.95
# Probability that non-Matches are True non-Matches, computed here as
# min(len(X), len(Y)) divided by the number of X/Y pairs.
u = min(len(X), len(Y)) / (len(X) * len(Y))

# Priors: u is reused as the prior probability of a match.
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr  # Prior odds of a match

# Combine the dataframes: joining on a constant key pairs every row of X with
# every row of Y; the helper key is dropped again and X and Y keep their columns.
Z = pd.merge(X.assign(key=1), Y.assign(key=1), on='key').drop('key', axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between *s1* and *s2*.

    Row-by-row dynamic programme: only the previous row of the distance
    table is kept, and the shorter string is used for the row width.
    """
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s1) + 1))
    for row, c2 in enumerate(s2, start=1):
        current = [row]
        for col, c1 in enumerate(s1):
            if c1 == c2:
                current.append(previous[col])
            else:
                # Cheapest of substitution, deletion and insertion, plus one.
                current.append(1 + min(previous[col], previous[col + 1], current[-1]))
        previous = current
    return previous[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
    """Return the length of the longer of the two strings *X* and *Y*."""
    return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
    """Return the agreement ``1 - D / L`` for distance *D* and length *L*.

    When *L* is 0 (both strings empty) there is nothing to disagree on, so
    1.0 is returned instead of dividing by zero.
    """
    if L == 0:
        return 1.0
    return 1 - (D / L)
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Likelihood ratio
def Likelihood_ratio(C):
    """Return the likelihood ratio for agreement *C*.

    Interpolates linearly between ``(1 - m) / (1 - u)`` at ``C == 0`` and
    ``m / u`` at ``C == 1``, using the module-level ``m`` and ``u``.
    """
    return (m/u) - ((m/u) - ((1-m) / (1-u))) * (1-C)
L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])
# Match weight
def Match_weight(G):
    """Return the match weight: the base-2 logarithm of likelihood ratio *G*.

    (The change of base was previously inverted: ``ln(G) * ln(2)`` instead of
    ``ln(G) / ln(2)``.)
    """
    return math.log2(G)
M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])
# Posterior odds
def Posterior_odds(R):
    """Return the posterior odds for match weight *R*.

    ``2 ** R`` recovers the likelihood ratio from the base-2 match weight; it
    is multiplied by the module-level prior odds ``O_Pr``.
    """
    return 2.0 ** R * O_Pr
P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])
# Probability
def Probability(O):
    """Convert odds *O* into the probability ``O / (1 + O)``."""
    return O / (1 + O)
Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. Here is a sensitivity check on m, showing that it doesn't make a lot of difference:
Method B
These assumptions won't apply to all applications but in some cases each row of X should match a row of Y. In that case:
The probabilities should sum to 1
If there are many credible candidates to match to then that should reduce the probability of getting the right one
then:
# Remember which row of X each candidate pair came from.
X["I"] = X.index

# Combine the dataframes: joining on a constant key pairs every row of X with
# every row of Y; the helper key is dropped again and X and Y keep their columns.
Z = pd.merge(X.assign(key=1), Y.assign(key=1), on='key').drop('key', axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between *s1* and *s2*.

    Row-by-row dynamic programme: only the previous row of the distance
    table is kept, and the shorter string is used for the row width.
    """
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s1) + 1))
    for row, c2 in enumerate(s2, start=1):
        current = [row]
        for col, c1 in enumerate(s1):
            if c1 == c2:
                current.append(previous[col])
            else:
                # Cheapest of substitution, deletion and insertion, plus one.
                current.append(1 + min(previous[col], previous[col + 1], current[-1]))
        previous = current
    return previous[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
    """Return the length of the longer of the two strings *X* and *Y*."""
    return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
    """Return the agreement ``1 - D / L`` for distance *D* and length *L*.

    When *L* is 0 (both strings empty) there is nothing to disagree on, so
    1.0 is returned instead of dividing by zero.
    """
    if L == 0:
        return 1.0
    return 1 - (D / L)
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Normalised Agreement Weight
# Total agreement weight of all candidates for each row of X.  The aggregation
# is named as the string "sum" rather than passing the builtin function.
T = Z.groupby('I').agg({'C': 'sum'})
D = pd.DataFrame(T)
D.columns = ['T']
# Attach the per-row total to every candidate pair and normalise, so the P1
# values of the candidates for one row of X add up to 1.
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
Comparing it against Method A:
Method C
This combines method A with method B:
# Normalised Probability
# NOTE(review): this needs Z to carry both the 'I' column from Method B and
# the 'P' column from Method A - confirm both sets of steps were run on Z.
# Total match probability of all candidates for each row of X.  The
# aggregation is named as the string "sum" rather than passing the builtin.
U = Z.groupby('I').agg({'P': 'sum'})
E = pd.DataFrame(U)
E.columns = ['U']
K = Z.set_index('I').join(E)
# P1 is Method B's normalised agreement weight; P2 is the Method A probability
# normalised within each row of X.
K['P1'] = J['P1']
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take account of uncertainty whereas method C (P2) does.

NameError when running GMRes following FEniCS discretisation

I've discretised a diffusion equation with FEniCS as follows:
def DiscretiseEquation(h):
    """Assemble and solve a random-coefficient diffusion problem.

    Builds an ``h`` x ``h`` unit-square mesh with first-order Lagrange
    elements and a zero Dirichlet condition on the whole boundary, assembles
    the system for a random piecewise-constant diffusion field and a random
    right-hand side, and solves it with ``spsolve``.

    Returns:
        ``(mat, rhs, u)``: the system matrix as a SciPy CSR matrix, the
        right-hand-side array and the solution as a FEniCS ``Function``.
        These names are local to the function, so callers must take them
        from the return value.
    """
    mesh = UnitSquareMesh(h, h)
    V = FunctionSpace(mesh, 'Lagrange', 1)

    def on_boundary(x, on_boundary):
        return on_boundary

    bc_value = Constant(0.0)
    boundary_condition = DirichletBC(V, bc_value, on_boundary)

    class RandomDiffusionField(Expression):
        """Piecewise-constant field exp(-N(0, 1)) on an m x n grid of cells."""

        def __init__(self, m, n, element):
            self._rand_field = np.exp(-np.random.randn(m, n))
            self._m = m
            self._n = n
            self._ufl_element = element

        def eval(self, value, x):
            # Builtin int replaces the removed np.int alias.
            x_index = int(np.floor(self._m * x[0]))
            y_index = int(np.floor(self._n * x[1]))
            # Clamp so points on the upper edges fall into the last cell.
            i = min(x_index, self._m - 1)
            j = min(y_index, self._n - 1)
            value[0] = self._rand_field[i, j]

        def value_shape(self):
            return (1, )

    class RandomRhs(Expression):
        """Piecewise-constant N(0, 1) source term on an m x n grid of cells."""

        def __init__(self, m, n, element):
            self._rand_field = np.random.randn(m, n)
            self._m = m
            self._n = n
            self._ufl_element = element

        def eval(self, value, x):
            x_index = int(np.floor(self._m * x[0]))
            y_index = int(np.floor(self._n * x[1]))
            i = min(x_index, self._m - 1)
            j = min(y_index, self._n - 1)
            value[0] = self._rand_field[i, j]

        def value_shape(self):
            return (1, )

    u = TrialFunction(V)
    v = TestFunction(V)
    random_field = RandomDiffusionField(100, 100, element=V.ufl_element())
    zero = Expression("0", element=V.ufl_element())
    one = Expression("1", element=V.ufl_element())
    # Diagonal diffusion tensor: random in x, constant 1 in y.
    diffusion = as_matrix(((random_field, zero), (zero, one)))
    a = inner(diffusion * grad(u), grad(v)) * dx
    L = RandomRhs(h, h, element=V.ufl_element()) * v * dx
    A = assemble(a)
    b = assemble(L)
    boundary_condition.apply(A, b)

    # Convert the assembled matrix to a SciPy CSR matrix.
    A = as_backend_type(A).mat()
    (indptr, indices, data) = A.getValuesCSR()
    mat = csr_matrix((data, indices, indptr), shape=A.size)
    rhs = b.array()

    # Solving
    x = spsolve(mat, rhs)

    # Conversion to a FEniCS function
    u = Function(V)
    u.vector()[:] = x
    return mat, rhs, u
I am running the GMRES solver as normal. The callback argument is a separate iteration counter I've defined.
# NOTE: mat and rhs are assigned inside DiscretiseEquation, so they are local
# to that call and are not defined at this level; the values have to come
# from the function's return value.
DiscretiseEquation(100)
A = mat
b = rhs
x, info = gmres(A, b, callback = IterCount())
The routine returns a NameError, stating that 'mat' is not defined:
NameError Traceback (most recent call last)
<ipython-input-18-e096b2eea097> in <module>()
1 DiscretiseEquation(200)
----> 2 A = mat
3 b = rhs
4 x_200, info_200 = gmres(A, b, callback = IterCount())
5 gmres_res = closure_variables["residuals"]
NameError: name 'mat' is not defined
As far as I'm aware, it should be defined once I call the DiscretiseEquation function. Why isn't it?