Logistic regression with custom dataset

Logistic regression with custom dataset - numpy

From deeplearning course on Coursera I've implemented logistic regression :
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
def sigmoid(z):
s = 1 / (1 + np.exp(-z))
return s
def initialize_with_zeros(dim):
w = np.zeros(shape=(dim, 1))
b = 0
return w, b
def propagate(w, b, X, Y):
m = X.shape[1]
A = sigmoid(np.dot(w.T, X) + b) # compute activation
cost = (- 1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A))) # compute cost
dw = (1 / m) * np.dot(X, (A - Y).T)
db = (1 / m) * np.sum(A - Y)
cost = np.squeeze(cost)
grads = {"dw": dw,
"db": db}
return grads, cost
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
costs = []
for i in range(num_iterations):
grads, cost = propagate(w, b, X, Y)
dw = grads["dw"]
db = grads["db"]
w = w - learning_rate * dw # need to broadcast
b = b - learning_rate * db
if i % 100 == 0:
costs.append(cost)
# Print the cost every 100 training examples
if print_cost and i % 100 == 0:
print ("Cost after iteration %i: %f" % (i, cost))
params = {"w": w,
"b": b}
grads = {"dw": dw,
"db": db}
return params, grads, costs
def predict(w, b, X):
m = X.shape[1]
Y_prediction = np.zeros((1, m))
w = w.reshape(X.shape[0], 1)
A = sigmoid(np.dot(w.T, X) + b)
for i in range(A.shape[1]):
# Convert probabilities a[0,i] to actual predictions p[0,i]
### START CODE HERE ### (≈ 4 lines of code)
print(A)
Y_prediction[0, i] = 1 if A[0, i] > 0.5 else 0
### END CODE HERE ###
assert(Y_prediction.shape == (1, m))
return Y_prediction
print ("sigmoid(0) = " + str(sigmoid(0)))
print ("sigmoid(9.2) = " + str(sigmoid(9.2)))
dim = 2
w, b = initialize_with_zeros(dim)
print ("w = " + str(w))
print ("b = " + str(b))
w, b, X, Y = np.array([[1], [2]]), 2, np.array([[-1,-2], [3,4]]), np.array([[1, 0]])
grads, cost = propagate(w, b, X, Y)
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print ("cost = " + str(cost))
params, grads, costs = optimize(w, b, X, Y, num_iterations= 10000, learning_rate = 0.01, print_cost = False)
print ("w = " + str(params["w"]))
print ("b = " + str(params["b"]))
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print("predictions = " + str(predict(w, b, X)))
def model(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
w, b = initialize_with_zeros(X_train.shape[0])
parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
w = parameters["w"]
b = parameters["b"]
Y_prediction_test = predict(w, b, X_test)
Y_prediction_train = predict(w, b, X_train)
print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))
d = {"costs": costs,
"Y_prediction_test": Y_prediction_test,
"Y_prediction_train" : Y_prediction_train,
"w" : w,
"b" : b,
"learning_rate" : learning_rate,
"num_iterations": num_iterations}
return d
I'm attempting to use a generic dataset which contains 5 samples where each sample contain 4 elements :
train_set_x = np.array([[1,2,3,4],[4,3,2,1],[1,2,3,4],[4,3,2,1],[1,2,3,4]])
train_set_y = np.array([1,0,1,0,1])
test_set_x = np.array([[1,2,3,4],[4,3,2,1],[1,2,3,4],[4,3,2,1],[1,2,3,4]])
test_set_y = np.array([1,0,1,0,1])
train_set_x , train_set_y , test_set_x , test_set_y
d = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations = 2000, learning_rate = 0.005, print_cost = True)
But the following error is thrown :
<ipython-input-409-bd4e233a8f4e> in propagate(w, b, X, Y)
18
19 A = sigmoid(np.dot(w.T, X) + b) # compute activation
---> 20 cost = (- 1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A))) # compute cost
21
22 dw = (1 / m) * np.dot(X, (A - Y).T)
ValueError: operands could not be broadcast together with shapes (5,) (1,4)
Do I need to change the weight dimensions in order to compute the cost value ?
Update :
Using modification :
A = sigmoid(np.dot(X , w) + b) # compute activation
causes error :
<ipython-input-546-7a7980550834> in propagate(w, b, X, Y)
20 m = X.shape[1]
21
---> 22 A = sigmoid(np.dot(X , w) + b) # compute activation
23 print('w.T' , w.T , 'w' , w, 'X' , X , 'Y' , Y , 'A' , A)
24 cost = (- 1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A))) # compute cost
ValueError: shapes (5,4) and (5,1) not aligned: 4 (dim 1) != 5 (dim 0)

Related

python function as cvxpy parameter for dynamic optimization (optimal control)

import numpy as np
def af(a,b):
return np.array([[a,b],[b**2, b]])
np.random.seed(1)
n = 2
m = 2
T = 50
alpha = 0.2
beta = 3
# A = np.eye(n) - alpha * np.random.rand(n, n)
B = np.random.randn(n, m)
x_0 = beta * np.random.randn(n)
import cvxpy as cp
x = cp.Variable((n, T + 1))
u = cp.Variable((m, T))
A = cp.Parameter((2,2))
cost = 0
constr = []
for t in range(T):
cost += cp.sum_squares(x[:, t + 1]) + cp.sum_squares(u[:, t])
A = af(*x[:,t])
constr += [x[:, t + 1] == A # x[:, t] + B # u[:, t], cp.norm(u[:, t], "inf") <= 1]
# sums problem objectives and concatenates constraints.
constr += [x[:, T] == 0, x[:, 0] == x_0]
problem = cp.Problem(cp.Minimize(cost), constr)
problem.solve()
I want to use python function (lambdify function) as cvxpy parameter. I tried this method, please let me know if cvxpy support python function as parameter. thank you.

Calculating gradients of cusom loss function with Gradient.Tape

I am trying custom traning of the network using Gradient.Tape method.
This traning is unsupervised.
The details of network and cost function is as following,
My Network is,
def CreateNetwork(inplayer, hidlayer, outlayer,seed):
model = keras.Sequential()
model.add(Dense(hidlayer, input_dim=inplayer, kernel_initializer=initializers.RandomNormal(mean=0.0,stddev=1/np.sqrt(inplayer),seed=seed), bias_initializer=initializers.RandomNormal(mean=0.0,stddev=1/np.sqrt(inplayer),seed=seed), activation='tanh'))
model.add(Dense(outlayer, kernel_initializer=initializers.RandomNormal(mean=0.0,stddev=1/np.sqrt(hidlayer),seed=seed), bias_initializer=initializers.Zeros(), activation='linear'))
return model
and my custom cost function is defined as,
def H_tilda(J,U,nsamples,nsites,configs,out_matrix):
EigenValue = 0.0
for k in range(nsamples):
config = configs[k,:]
out_n = out_matrix[k,:]
exp = 0.0
for i in range(nsamples):
n = configs[i,:]
out_nprime = out_matrix[i,:]
#------------------------------------------------------------------------------------------------
# Calculation of Hopping Term
#------------------------------------------------------------------------------------------------
hop = 0.0
for j in range(nsites):
if j == 0:
k = [nsites-1,j+1]
elif j == (nsites - 1):
k = [j-1,0]
else:
k = [j-1,j+1]
if n[k[0]] != 0:
annihiliate1 = np.sqrt(n[k[0]])
n1 = np.copy(n)
n1[k[0]] = n1[k[0]] - 1
n1[j] = n1[j] +1
if (config == n1).all():
delta1 = 1
else:
delta1 = 0
else:
annihiliate1 = 0
n1 = np.zeros(nsites)
delta1 = 0
if n[k[1]] != 0:
annihiliate2 = np.sqrt(n[k[1]])
n2 = np.copy(n)
n2[k[1]] = n2[k[1]] -1
n2[j] = n2[j] + 1
if (config == n2).all():
delta2 = 1
else:
delta2 = 0
else:
annihiliate2 = 0
n2 = np.zeros(nsites)
delta2 = 0
create = np.sqrt(n[j] + 1)
hop = hop + create*(annihiliate1*delta1 + annihiliate2*delta2)
#------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------
# Calculation of Onsite Term
#------------------------------------------------------------------------------------------------
if (config == n).all():
ons = np.sum(np.dot(np.square(n),n - 1))
else:
ons = 0.0
#------------------------------------------------------------------------------------------------
phi_value = phi(out_nprime.numpy())
exp = exp + ((hop + ons) * phi_value)
Phi_value = phi(out_n.numpy())
EigenValue = EigenValue + exp/Phi_value
return np.real(EigenValue/nsamples)
I want to do custom traning using GradientTape method, for which I used following lines ,
optimizer = optimizers.SGD(learning_rate=1e-3)
with tf.GradientTape(watch_accessed_variables=False) as tape:
tape.watch(tf.convert_to_tensor(configs))
out_matrix = model(configs)
print(out_matrix)
eival = H_tilda(J,U,nsamples,nsites,configs,out_matrix)
print(eival)
gradients = tape.gradient(tf.convert_to_tensor(eival), model.trainable_weights)
print(gradients)
But the gradient I am getting is NONE,
output: [None, None, None, None]

SQL syntax query order by

SELECT TCID, START_TIME, RESULT,
cast(START_TIME as date) as m_date,
max(cast(START_TIME as time)) as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID = 'A330506'
GROUP BY TCID, m_date;
This is my data:
ID TCID START_DATE RESULT
1545240 A435727 2020-11-08 03:11:43 PASS
1545334 A435727 2020-11-08 03:19:53 PASS
1547439 A435727 2020-11-09 03:11:52 PASS
1547621 A435727 2020-11-09 03:20:05 PASS
1548388 A435727 2020-11-09 07:51:29 PASS
1558801 A435727 2020-11-12 00:11:10 PASS
1561899 A435727 2020-11-12 08:48:59 PASS
I want to get result of each TCID follow date like this
ID TCID START_DATE RESULT
1545334 A435727 2020-11-08 03:19:53 PASS
1548388 A435727 2020-11-09 07:51:29 PASS
1561899 A435727 2020-11-12 08:48:59 PASS
But the result current like that:
1545240 A435727 2020-11-08 03:11:43 PASS 2020-11-08 03:19:53
1547439 A435727 2020-11-09 03:11:52 PASS 2020-11-09 07:51:29
1558801 A435727 2020-11-12 00:11:10 PASS 2020-11-12 08:48:59
def connect_cli_server(self):
connect_success = 0
if self.ssh_client is None:
self.ssh_client = paramiko.SSHClient()
self.ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy)
for cnt in range(self.retry_cnt):
try:
self.ssh_client.connect(self.ip, 22, self.id, self.pw, timeout=self.time_out,
banner_timeout=self.banner)
connect_success = 1
break
except:
if cnt < 10:
time.sleep(random.uniform(0.1, 0.3))
if 10 <= cnt < 20:
time.sleep(random.uniform(0.1, 1))
else:
time.sleep(random.uniform(0.5, 1.5))
continue
if not connect_success:
try:
self.connect_cli_server_thru_remote_server()
except Exception as error:
print(error)
return False
return True
def send_command(self, ssh_client, command):
chan = ssh_client.get_transport().open_session()
chan.get_pty()
fileobject = chan.makefile()
chan.exec_command(command)
byteoutput = fileobject.read()
convetedstring = byteoutput.decode("UTF-8")
return convetedstring

I created some sample input by sql command
create table tbl_mock (
id int,
tc int,
startdate datetime,
result varchar(20)
);
insert into tbl_mock(id, tc, startdate, result) values (1, 1, '2020/11/12 09:00:00', 'pass');
insert into tbl_mock(id, tc, startdate, result) values (2, 1, '2020/11/12 10:00:00', 'fail');
insert into tbl_mock(id, tc, startdate, result) values (3, 1, '2020/11/12 11:00:00', 'pass');
insert into tbl_mock(id, tc, startdate, result) values (4, 1, '2020/11/13 09:00:00', 'pass');
insert into tbl_mock(id, tc, startdate, result) values (5, 1, '2020/11/13 10:00:00', 'fail');
insert into tbl_mock(id, tc, startdate, result) values (6, 1, '2020/11/13 11:00:00', 'fail');
You can try the below sql command to get your result
select tbl_a.*
from tbl_mock as tbl_a,
(select tc,
cast(startdate as date) as m_date,
max(cast(startdate as time)) as m_time
from tbl_mock
group by tc, m_date) as tbl_b
where tbl_a.tc = tbl_b.tc
and timestamp(tbl_b.m_date, tbl_b.m_time) = tbl_a.startdate

You can try this:
SELECT TCID
,START_TIME
,RESULT
,cast(START_TIME as date) as m_date
,max(cast(START_TIME as time)) as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID='A330506'
GROUP BY TCID
,START_TIME
,RESULT
,cast(START_TIME as date)
ORDER BY TCID
,m_date;
which should be the same as this:
SELECT DISTINCT TCID
,START_TIME
,RESULT
,cast(START_TIME as date) as m_date
,max(cast(START_TIME as time)) OVER() as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID='A330506'
ORDER BY TCID
,m_date;
or if you need to get the MAX value per TCID:
SELECT DISTINCT TCID
,START_TIME
,RESULT
,cast(START_TIME as date) as m_date
,max(cast(START_TIME as time)) OVER(PARTITION BY TCID) as max_time
FROM jenkins_result.JENKINS_RESULT
WHERE TCID='A330506'
ORDER BY TCID
,m_date;

import pygame
import os
import numpy as np
import queue
pygame.init()
q = 8
w = 70
def normalize_image(img):
return pygame.transform.scale(img, (w - 10, w - 10))
agent = pygame.image.load(os.path.join('EmptyAgent.png'))
wumpus = pygame.image.load(os.path.join('Wumpus.png'))
gold = pygame.image.load(os.path.join('Gold.png'))
pit = pygame.image.load(os.path.join('Pit.png'))
agent = normalize_image(agent)
wumpus = normalize_image(wumpus)
gold = normalize_image(gold)
pit = normalize_image(pit)
gr_empty = 0
gr_agent = 4
gr_wumpus = 1
gr_gold = 3
gr_pit = 2
clock = pygame.time.Clock()
screen = pygame.display.set_mode((q * w, q * w))
xh, yh = q - 1, 0
grid = [[gr_empty] * q for _ in range(q)]
grid[xh][yh] = gr_agent
bg_color = (255, 255, 255)
border_color = (0, 0, 0)
dx = [0,1,0,-1]
dy = [1,0,-1,0]
def bfs():
vs = np.zeros((q,q,8))
tracex = np.zeros((q,q,8))
tracey = np.zeros((q,q,8))
traced = np.zeros((q,q,8))
tracex[0][0][0] = 2
que = queue.Queue()
que.put((q-1,0,0,0))
vs[q-1][0][0]=1
gx, gy, gd, gt = 0, 0, 0, 1000
while not que.empty():
(x,y,dir,t) = que.get()
d=0
if grid[x][y]==3 and dir<4:
if gt>t:
gx, gy, gd, gt = x,y,dir,t
if vs[x][y][dir+4]==0:
que.put((x,y,dir+4,t+1))
vs[x][y][dir+4]=1
tracex[x][y][dir+4],tracey[x][y][dir+4],traced[x][y][dir+4] = x,y,dir
continue
if x==q-1 and y==0 and dir>3:
return 1000-(t+1)*10, gx, gy, gd, tracex, tracey, traced
if x<0 or y<0 or x>=q or y>=q:
continue
if grid[x][y]==gr_wumpus or grid[x][y]==gr_pit:
continue
if dir>3:
d+=4
dir-=4
if vs[x][y][(dir + 1 + 4) % 4 + d] == 0:
que.put((x, y, (dir + 1 + 4) % 4 + d, t + 1))
vs[x][y][(dir + 1 + 4) % 4 + d] = 1
tracex[x][y][(dir + 1 + 4) % 4 + d], tracey[x][y][(dir + 1 + 4) % 4 + d], traced[x][y][(dir + 1 + 4) % 4 + d] = x, y, dir
if vs[x][y][(dir - 1 + 4) % 4 + d] == 0:
que.put((x, y, (dir - 1 + 4) % 4 + d, t + 1))
vs[x][y][(dir - 1 + 4) % 4 + d] = 1
tracex[x][y][(dir - 1 + 4) % 4 + d], tracey[x][y][(dir - 1 + 4) % 4 + d], traced[x][y][(dir - 1 + 4) % 4 + d] = x, y, dir
if x+dx[dir]<0 or x+dx[dir]>=q or y+dy[dir]<0 or y+dy[dir]>=q:
continue
if vs[x + dx[dir]][y + dy[dir]][dir + d] == 0:
que.put((x + dx[dir], y + dy[dir], dir + d, t + 1))
vs[x + dx[dir]][y + dy[dir]][dir + d] = 1
tracex[x + dx[dir]][y + dy[dir]][dir + d], tracey[x + dx[dir]][y + dy[dir]][dir + d], traced[x + dx[dir]][y + dy[dir]][dir + d] = x, y, dir
return -1, gx, gy, gd, tracex, tracey, traced
def test():
grid[q-2][1] = gr_wumpus
grid[q-3][2] = gr_gold
grid[q-3][0] = gr_pit
ans, gx, gy, gd, tracex, tracey, traced = bfs()
if ans<0:
ans=-1
print(ans)
tx, ty, td = gx, gy, gd
if ans>=0:
while 1:
print(tx, ty, td)
if tx==q-1 and ty==0:
break
xn, yn, dn = int(tracex[tx][ty][td]), int(tracey[tx][ty][td]), int(traced[tx][ty][td])
tx, ty, td = xn,yn,dn
# print(tracex[0][0][0])
def draw_img(img, x, y):
screen.blit(img, (x + 5, y + 5))
def draw():
x, y = 0, 0
screen.fill(bg_color)
for i in range(q):
for j in range(q):
if grid[i][j] == gr_agent:
draw_img(agent, x, y)
elif grid[i][j] == gr_wumpus:
draw_img(wumpus, x, y)
elif grid[i][j] == gr_gold:
draw_img(gold, x, y)
elif grid[i][j] == gr_pit:
draw_img(pit, x, y)
for k in range(4):
pygame.draw.rect(screen, border_color, (x - k, y - k, w, w), 1)
x = x + w
x = 0
y = y + w
def main():
test()
while True:
events = pygame.event.get()
keys = pygame.key.get_pressed()
for event in events:
if event.type == pygame.QUIT:
return
elif event.type == pygame.MOUSEBUTTONDOWN:
mx, my = event.pos
gy = mx // w
gx = my // w
if not grid[gx][gy] == gr_empty:
continue
elif event.button == 1:
# Left click
grid[gx][gy] = gr_wumpus
elif event.button == 3:
# Scroll slick
grid[gx][gy] = gr_gold
elif event.button == 2:
# Middle click
grid[gx][gy] = gr_pit
draw()
pygame.display.flip()
clock.tick(600)
main()

import pygame
import os
import numpy as np
import queue
pygame.init()
q = 3
w = 70
def normalize_image(img):
return pygame.transform.scale(img, (w - 10, w - 10))
agent = pygame.image.load(os.path.join('EmptyAgent.png'))
wumpus = pygame.image.load(os.path.join('Wumpus.png'))
gold = pygame.image.load(os.path.join('Gold.png'))
pit = pygame.image.load(os.path.join('Pit.png'))
agent = normalize_image(agent)
wumpus = normalize_image(wumpus)
gold = normalize_image(gold)
pit = normalize_image(pit)
gr_empty = 0
gr_agent = 4
gr_wumpus = 1
gr_gold = 3
gr_pit = 2
clock = pygame.time.Clock()
screen = pygame.display.set_mode((q * w, q * w))
xh, yh = q - 1, 0
grid = [[gr_empty] * q for _ in range(q)]
grid[xh][yh] = gr_agent
bg_color = (255, 255, 255)
border_color = (0, 0, 0)
dx = [0,1,0,-1]
dy = [1,0,-1,0]
def bfs():
vs = np.zeros((q,q,8))
tracex = np.zeros((q,q,8))
tracey = np.zeros((q,q,8))
traced = np.zeros((q,q,8))
tracex[0][0][0] = 2
que = queue.Queue()
que.put((q-1,0,0,0))
vs[q-1][0][0]=1
gx, gy, gd, gt = 0, 0, 0, 1000
while not que.empty():
(x,y,dir,t) = que.get()
d=0
if grid[x][y]==3 and dir<4:
if gt>t:
gx, gy, gd, gt = x,y,dir,t
if vs[x][y][dir+4]==0:
que.put((x,y,dir+4,t+1))
vs[x][y][dir+4]=1
tracex[x][y][dir+4],tracey[x][y][dir+4],traced[x][y][dir+4] = x,y,dir
continue
if x==q-1 and y==0 and dir>3:
gx, gy, gd, gt = x, y, dir, t
return 1000-(t+1)*10, gx, gy, gd, tracex, tracey, traced
if x<0 or y<0 or x>=q or y>=q:
continue
if grid[x][y]==gr_wumpus or grid[x][y]==gr_pit:
continue
if dir>3:
d+=4
dir-=4
if vs[x][y][(dir + 1 + 4) % 4 + d] == 0:
que.put((x, y, (dir + 1 + 4) % 4 + d, t + 1))
vs[x][y][(dir + 1 + 4) % 4 + d] = 1
tracex[x][y][(dir + 1 + 4) % 4 + d], tracey[x][y][(dir + 1 + 4) % 4 + d], traced[x][y][(dir + 1 + 4) % 4 + d] = x, y, dir + d
if vs[x][y][(dir - 1 + 4) % 4 + d] == 0:
que.put((x, y, (dir - 1 + 4) % 4 + d, t + 1))
vs[x][y][(dir - 1 + 4) % 4 + d] = 1
tracex[x][y][(dir - 1 + 4) % 4 + d], tracey[x][y][(dir - 1 + 4) % 4 + d], traced[x][y][(dir - 1 + 4) % 4 + d] = x, y, dir + d
if x+dx[dir]<0 or x+dx[dir]>=q or y+dy[dir]<0 or y+dy[dir]>=q:
continue
if vs[x + dx[dir]][y + dy[dir]][dir + d] == 0:
que.put((x + dx[dir], y + dy[dir], dir + d, t + 1))
vs[x + dx[dir]][y + dy[dir]][dir + d] = 1
tracex[x + dx[dir]][y + dy[dir]][dir + d], tracey[x + dx[dir]][y + dy[dir]][dir + d], traced[x + dx[dir]][y + dy[dir]][dir + d] = x, y, dir + d
return -1, gx, gy, gd, tracex, tracey, traced
def test():
grid[q-2][1] = gr_wumpus
grid[q-3][2] = gr_gold
grid[q-3][0] = gr_pit
ans, gx, gy, gd, tracex, tracey, traced = bfs()
if ans<0:
ans=-1
print(ans)
tx, ty, td = gx, gy, gd
if ans>=0:
while 1:
print(tx, ty, td)
if tx==q-1 and ty==0 and td==0:
break
xn, yn, dn = int(tracex[tx][ty][td]), int(tracey[tx][ty][td]), int(traced[tx][ty][td])
tx, ty, td = xn,yn,dn
# print(tracex[0][0][0])
def draw_img(img, x, y):
screen.blit(img, (x + 5, y + 5))
def draw():
x, y = 0, 0
screen.fill(bg_color)
for i in range(q):
for j in range(q):
if grid[i][j] == gr_agent:
draw_img(agent, x, y)
elif grid[i][j] == gr_wumpus:
draw_img(wumpus, x, y)
elif grid[i][j] == gr_gold:
draw_img(gold, x, y)
elif grid[i][j] == gr_pit:
draw_img(pit, x, y)
for k in range(4):
pygame.draw.rect(screen, border_color, (x - k, y - k, w, w), 1)
x = x + w
x = 0
y = y + w
def main():
test()
while True:
events = pygame.event.get()
keys = pygame.key.get_pressed()
for event in events:
if event.type == pygame.QUIT:
return
elif event.type == pygame.MOUSEBUTTONDOWN:
mx, my = event.pos
gy = mx // w
gx = my // w
if not grid[gx][gy] == gr_empty:
continue
elif event.button == 1:
# Left click
grid[gx][gy] = gr_wumpus
elif event.button == 3:
# Scroll slick
grid[gx][gy] = gr_gold
elif event.button == 2:
# Middle click
grid[gx][gy] = gr_pit
draw()
pygame.display.flip()
clock.tick(600)
main()

Probabilistic Record Linkage in Pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
X = pd.DataFrame({'A': ["One", "Two", "Three"]})
Y = pd.DataFrame({'A': ["One", "To", "Free"]})

Method A
I have not yet fully understood the theory but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International journal of epidemiology, 45(3), pp.954-964.
Here is my attempt to implementat it in Pandas:
# Probability that Matches are True Matches
m = 0.95
# Probability that non-Matches are True non-Matches
u = min(len(X), len(Y)) / (len(X) * len(Y))
# Priors
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr # Prior odds of a match
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
return 1 - ( D / L )
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Likelihood ratio
def Likelihood_ratio(C):
return (m/u) - ((m/u) - ((1-m) / (1-u))) * (1-C)
L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])
# Match weight
def Match_weight(G):
return math.log(G) * math.log(2)
M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])
# Posterior odds
def Posterior_odds(R):
return math.exp( R / math.log(2)) * O_Pr
P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])
# Probability
def Probability(O):
return O / (1 + O)
Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. Here is a sensitivity check on m, showing that it doesn't make a lot of difference:
Method B
These assumptions won't apply to all applications but in some cases each row of X should match a row of Y. In that case:
The probabilities should sum to 1
If there are many credible candidates to match to then that should reduce the probability of getting the right one
then:
X["I"] = X.index
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
return 1 - ( D / L )
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Normalised Agreement Weight
T = Z .groupby('I') .agg({'C' : sum})
D = pd.DataFrame(T)
D.columns = ['T']
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
Comparing it against Method A:
Method C
This combines method A with method B:
# Normalised Probability
U = Z .groupby('I') .agg({'P' : sum})
E = pd.DataFrame(U)
E.columns = ['U']
K = Z.set_index('I').join(E)
K['P1'] = J['P1']
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take account of uncertainty whereas method C (P2) does.

NameError when running GMRes following FEniCS discretisation

I've discretised a diffusion equation with FEniCS as follows:
def DiscretiseEquation(h):
mesh = UnitSquareMesh(h, h)
V = FunctionSpace(mesh, 'Lagrange', 1)
def on_boundary(x, on_boundary):
return on_boundary
bc_value = Constant(0.0)
boundary_condition = DirichletBC(V, bc_value, on_boundary)
class RandomDiffusionField(Expression):
def __init__(self, m, n, element):
self._rand_field = np.exp(-np.random.randn(m, n))
self._m = m
self._n = n
self._ufl_element = element
def eval(self, value, x):
x_index = np.int(np.floor(self._m * x[0]))
y_index = np.int(np.floor(self._n * x[1]))
i = min(x_index, self._m - 1)
j = min(y_index, self._n - 1)
value[0] = self._rand_field[i, j]
def value_shape(self):
return(1, )
class RandomRhs(Expression):
def __init__(self, m, n, element):
self._rand_field = np.random.randn(m, n)
self._m = m
self._n = n
self._ufl_element = element
def eval(self, value, x):
x_index = np.int(np.floor(self._m * x[0]))
y_index = np.int(np.floor(self._n * x[1]))
i = min(x_index, self._m - 1)
j = min(y_index, self._n - 1)
value[0] = self._rand_field[i, j]
def value_shape(self):
return (1, )
u = TrialFunction(V)
v = TestFunction(V)
random_field = RandomDiffusionField(100, 100, element=V.ufl_element())
zero = Expression("0", element=V.ufl_element())
one = Expression("1", element=V.ufl_element())
diffusion = as_matrix(((random_field, zero), (zero, one)))
a = inner(diffusion * grad(u), grad(v)) * dx
L = RandomRhs(h, h, element=V.ufl_element()) * v * dx
A = assemble(a)
b = assemble(L)
boundary_condition.apply(A, b)
A = as_backend_type(A).mat()
(indptr, indices, data) = A.getValuesCSR()
mat = csr_matrix((data, indices, indptr), shape=A.size)
rhs = b.array()
#Solving
x = spsolve(mat, rhs)
#Conversion to a FEniCS function
u = Function(V)
u.vector()[:] = x
I am running the GMRES solver as normal. The callback argument is a separate iteration counter I've defined.
DiscretiseEquation(100)
A = mat
b = rhs
x, info = gmres(A, b, callback = IterCount())
The routine returns a NameError, stating that 'mat' is not defined:
NameError Traceback (most recent call last)
<ipython-input-18-e096b2eea097> in <module>()
1 DiscretiseEquation(200)
----> 2 A = mat
3 b = rhs
4 x_200, info_200 = gmres(A, b, callback = IterCount())
5 gmres_res = closure_variables["residuals"]
NameError: name 'mat' is not defined
As far as I'm aware, it should be defined when I call the DiscretiseEquation function?

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Logistic regression with custom dataset - numpy

Related

python function as cvxpy parameter for dynamic optimization (optimal control)

Calculating gradients of cusom loss function with Gradient.Tape

SQL syntax query order by

Probabilistic Record Linkage in Pandas

NameError when running GMRes following FEniCS discretisation

Categories

Resources