Multiplying multidimensional array in python - numpy

I have two arrays:
L, M, N = 6, 31, 500
A = np.random.random((L, M, N))
B = np.random.random((L, L))
I am trying to get an array C such that:
C = B * A
where B is contracted with the first axis of A, so that C has shape [L, M, N].
I tried the answer posted at this link, but it hasn't given me the desired output.
A for-loop version of the above is:
L, M, N = 6, 31, 500
A = np.random.random((L, M, N))
B = np.random.random((L, L))
z1 = []
for j in range(M):
    a = np.squeeze(A[:, j, :])
    z1.append(np.dot(B, a))
z2 = np.stack(z1)

I think you are looking for numpy.tensordot() where you can specify along which axes to sum:
np.tensordot(B,A,axes=(1,0))
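A quick sanity check, comparing np.tensordot against the loop version (stacked along axis=1 so the result keeps the (L, M, N) order):

import numpy as np

L, M, N = 6, 31, 500
A = np.random.random((L, M, N))
B = np.random.random((L, L))

C = np.tensordot(B, A, axes=(1, 0))   # contracts B's axis 1 with A's axis 0 -> shape (L, M, N)

# loop version for comparison; stack along axis=1 to keep the (L, M, N) order
z2 = np.stack([np.dot(B, A[:, j, :]) for j in range(M)], axis=1)

print(C.shape)             # (6, 31, 500)
print(np.allclose(C, z2))  # True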

Probabilistic Record Linkage in Pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
import math
import numpy as np
import pandas as pd

X = pd.DataFrame({'A': ["One", "Two", "Three"]})
Y = pd.DataFrame({'A': ["One", "To", "Free"]})
Method A
I have not yet fully understood the theory, but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International Journal of Epidemiology, 45(3), pp.954-964.
Here is my attempt to implement it in Pandas:
# Probability that Matches are True Matches
m = 0.95

# Probability that non-Matches are True non-Matches
u = min(len(X), len(Y)) / (len(X) * len(Y))

# Priors
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr  # Prior odds of a match

# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key', axis=1)
X = X.drop('key', axis=1)
Y = Y.drop('key', axis=1)

# Levenshtein distance
def Levenshtein_distance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])

# Max string length
def Max_string_length(X, Y):
    return max(len(X), len(Y))

M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])

# Agreement weight
def Agreement_weight(D, L):
    return 1 - (D / L)

A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])

# Likelihood ratio
def Likelihood_ratio(C):
    return (m/u) - ((m/u) - ((1-m) / (1-u))) * (1-C)

L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])

# Match weight
def Match_weight(G):
    return math.log(G) * math.log(2)

M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])

# Posterior odds
def Posterior_odds(R):
    return math.exp(R / math.log(2)) * O_Pr

P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])

# Probability
def Probability(O):
    return O / (1 + O)

Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. A sensitivity check on m shows that it doesn't make a lot of difference.
Method B
These assumptions won't apply to all applications, but in some cases each row of X should match a row of Y. In that case:
- the probabilities should sum to 1, and
- if there are many credible candidates to match to, that should reduce the probability of getting the right one.
Then:
X["I"] = X.index
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
return 1 - ( D / L )
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Normalised Agreement Weight
T = Z .groupby('I') .agg({'C' : sum})
D = pd.DataFrame(T)
D.columns = ['T']
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
Comparing it against Method A:
Method C
This combines method A with method B:
# Normalised Probability
U = Z.groupby('I').agg({'P': sum})
E = pd.DataFrame(U)
E.columns = ['U']
K = Z.set_index('I').join(E)
K['P1'] = J['P1']
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take account of uncertainty whereas method C (P2) does.

How to vectorise an integration function?

I'm a beginner in numpy and I want to vectorise this function:
I don't quite understand what I need to do, but this is what I've come up with:
n = 1000000
h = 1/n
x = np.arange(1, n, 1)

def f(x):
    return x ** 3

def rec(x):
    result = np.zeros_like(x)
    result[x < n] = f((x[x < n]) * h)
    return result

integral = 0.5*h + h*rec(x)
print integral
I end up with an array of 0's. Could someone please point me in the right direction?
Two things go wrong in the posted code: in Python 2 (which the print statement suggests), h = 1/n is integer division and gives 0, and result = np.zeros_like(x) inherits the integer dtype of x, so the fractional values of f(x*h) are truncated to 0 on assignment. A cleaner approach is a vectorised trapezoidal rule:
def trap(f, a, b, n):
    xs = np.linspace(a, b, n + 1)
    ys = f(xs)
    return (0.5 * ys[0] + 0.5 * ys[-1] + np.sum(ys[1:-1])) * (b - a) / n
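A quick usage check (assuming the trap function above and import numpy as np): the exact value of the integral of x**3 over [0, 1] is 0.25, and the trapezoidal approximation with n = 1000000 lands essentially on it:

print(trap(lambda x: x ** 3, 0.0, 1.0, 1000000))   # ~0.25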

How to find the value of integer k efficiently for which q divides b ^ k finitely?

We are given two integers b and q, and we want to find the minimum integer k for which q completely divides b^k, or determine that no such k exists. Can we find k efficiently, rather than just iterating over each value of k (0, 1, 2, 3, ...) and checking whether (b^k) % q == 0, for either b <= q or b >= q?
First of all, k will never equal zero unless q = 1, and k will never equal one unless q divides b.
Next, if you can factorize q and b, then you can reason about them.
If there are any prime factors of q that are not factors of b at all, then k does not exist. Otherwise, k has to be large enough so that every prime factor of q is covered, to at least the same power, by the factors of b^k.
Here's some pseudo-code:
if (q==1) return 0;
if (q==b) return 1;
// qfactors and bfactors are multisets, one element per prime factor
let qfactors = prime_factorization(q);
let bfactors = prime_factorization(b);
let kmin = 0;
foreach (f in qfactors.unique) {
    let qcount = qfactors.count(f);
    let bcount = bfactors.count(f);
    if (bcount == 0) return -1;             // a prime factor of q is missing from b: k does not exist
    let kmin_f = ceiling(qcount / bcount);  // need k * bcount >= qcount for this prime
    if (kmin_f > kmin) let kmin = kmin_f;
}
return kmin;
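A runnable Python sketch of the same factorization idea (the trial-division prime_factorization helper below is only illustrative; any factorization routine would do):

from collections import Counter
from math import ceil

def prime_factorization(n):
    # simple trial division: returns a Counter mapping prime -> exponent
    factors = Counter()
    d = 2
    while d * d <= n:
        while n % d == 0:
            factors[d] += 1
            n //= d
        d += 1
    if n > 1:
        factors[n] += 1
    return factors

def min_k(b, q):
    if q == 1:
        return 0
    qf, bf = prime_factorization(q), prime_factorization(b)
    kmin = 0
    for p, qcount in qf.items():
        bcount = bf.get(p, 0)
        if bcount == 0:
            return -1                        # a prime factor of q does not divide b: no k exists
        kmin = max(kmin, ceil(qcount / bcount))
    return kmin

print(min_k(12, 16))   # 2: 16 divides 12**2 = 144 but not 12**1
print(min_k(6, 5))     # -1: 5 never divides a power of 6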
If q = 1: k = 0.
If q = b: k = 1.
If b > q and q divides b: k = 1.
If q has a prime factor that does not divide b: k does not exist.
We know that
Dividend = Divisor x Quotient + Remainder
=> Dividend = Divisor x Quotient   [here, Remainder = 0]
The lower the Quotient, the lower the value of 'k'. If you take the Quotient to be 1 (the lowest, a special case), your formula for 'k' becomes a lower bound:
k >= log q / log b
I found a solution.
If q divides pow(b, k), then all prime factors of q are prime factors of b. So we can iterate q = q / gcd(b, q) while gcd(q, b) != 1. If q != 1 after the iterations, q has prime factors that are not prime factors of b and k doesn't exist; otherwise k = the number of iterations.
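A short Python sketch of this gcd-based approach (the name min_k_gcd is just for illustration):

from math import gcd

def min_k_gcd(b, q):
    # strip from q the primes it shares with b; each division by gcd(b, q)
    # accounts for one more power of b
    k = 0
    while q != 1:
        g = gcd(b, q)
        if g == 1:
            return -1     # q still has a prime factor that b lacks: no k exists
        q //= g
        k += 1
    return k

print(min_k_gcd(2, 8))     # 3: 8 divides 2**3
print(min_k_gcd(12, 16))   # 2: 16 divides 12**2
print(min_k_gcd(6, 5))     # -1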

Pick random tensors from another one in Tensorflow

I have a Tensor X with shape [B, L, E] (let's say, B batches of L vectors of length E). From this Tensor X, I want to randomly pick N vectors in each batch, and so create Y with shape [B, N, E].
I tried to combine tf.random_uniform and tf.gather, but I really struggle with the dimensions and can't get Y.
You can use something like this:
import tensorflow as tf
import numpy as np

B = 3
L = 5
E = 2
N = 3

input = np.array(range(B * L * E)).reshape([B, L, E])
print(input)
print("#################################")

X = tf.constant(input)
batch_range = tf.tile(tf.reshape(tf.range(B, dtype=tf.int32), shape=[B, 1, 1]), [1, N, 1])
# maxval is exclusive for integer dtypes, so use maxval=L to allow every one of the L positions
random = tf.random_uniform([B, N, 1], minval=0, maxval=L, dtype=tf.int32)
indices = tf.concat([batch_range, random], axis=2)
output = tf.gather_nd(X, indices)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(indices))
    print("#################################")
    print(sess.run(output))

batch process of graph_cnn in tensorflow

I want to use the graph_cnn (Defferrard et al. 2016) for inputs with a varying number of nodes. The author provided example code (see graph_cnn). Below is what I think is the critical part of the code:
def chebyshev5(self, x, L, Fout, K):
    N, M, Fin = x.get_shape()
    N, M, Fin = int(N), int(M), int(Fin)
    # Rescale Laplacian and store as a TF sparse tensor. Copy to not modify the shared L.
    L = scipy.sparse.csr_matrix(L)
    L = graph.rescale_L(L, lmax=2)
    L = L.tocoo()
    indices = np.column_stack((L.row, L.col))
    L = tf.SparseTensor(indices, L.data, L.shape)
    L = tf.sparse_reorder(L)
    # Transform to Chebyshev basis
    x0 = tf.transpose(x, perm=[1, 2, 0])  # M x Fin x N
    x0 = tf.reshape(x0, [M, Fin*N])  # M x Fin*N
    x = tf.expand_dims(x0, 0)  # 1 x M x Fin*N
    def concat(x, x_):
        x_ = tf.expand_dims(x_, 0)  # 1 x M x Fin*N
        return tf.concat([x, x_], axis=0)  # K x M x Fin*N
    if K > 1:
        x1 = tf.sparse_tensor_dense_matmul(L, x0)
        x = concat(x, x1)
    for k in range(2, K):
        x2 = 2 * tf.sparse_tensor_dense_matmul(L, x1) - x0  # M x Fin*N
        x = concat(x, x2)
        x0, x1 = x1, x2
    x = tf.reshape(x, [K, M, Fin, N])  # K x M x Fin x N
    x = tf.transpose(x, perm=[3, 1, 2, 0])  # N x M x Fin x K
    x = tf.reshape(x, [N*M, Fin*K])  # N*M x Fin*K
    # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature pair.
    W = self._weight_variable([Fin*K, Fout], regularization=False)
    x = tf.matmul(x, W)  # N*M x Fout
    return tf.reshape(x, [N, M, Fout])  # N x M x Fout
Essentially, I think what this does can be simplified to something like
output = concat{ (L^k) x, for k = 0 to K-1 } * W
x is the input of size N x M x Fin (variable in any batch);
L is an array of operators on x, each of size M x M, matching the corresponding sample (variable in any batch);
W holds the neural network parameters to be optimized, of size Fin x K x Fout;
N is the number of samples in a batch (fixed for any batch);
M is the number of nodes in the graph (variable in any batch);
Fin is the number of input features (fixed for any batch);
Fout is the number of output features (fixed for any batch);
K is a constant representing the number of steps (hops) in the graph.
For a single example, the above code works. But since both x and L have a variable size for each sample in a batch, I don't know how to make it work for a batch of samples.
tf.matmul currently (v1.4) only supports batch matrix multiplication on the lowest 2 dims, and only for dense tensors. If either of the input tensors is sparse, it will raise a dimension mismatch error. tf.sparse_tensor_dense_matmul cannot be applied to batch inputs either.
Therefore, my current solution is to move all of the L preparation steps to before the function call, pass L as a dense tensor (shape: [N, M, M]), and use tf.matmul to perform the batch matrix multiplication.
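A minimal sketch of the batched dense multiplication this relies on (TF 1.x API; the shapes below are just illustrative):

import numpy as np
import tensorflow as tf

N, M, Fin = 4, 10, 3
L_dense = tf.constant(np.random.rand(N, M, M), dtype=tf.float32)   # [N, M, M] batch of Laplacians
x = tf.constant(np.random.rand(N, M, Fin), dtype=tf.float32)       # [N, M, Fin] batch of inputs

y = tf.matmul(L_dense, x)   # batched matmul over the leading dim -> [N, M, Fin]

with tf.Session() as sess:
    print(sess.run(tf.shape(y)))   # [ 4 10  3]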
Here is my revised code:
'''
chebyshev5_batch
Purpose:
    perform the graph filtering on the given layer
Args:
    x: the batch of inputs for the given layer,
       dense tensor, size: [N, M, Fin]
    L: the batch of sorted Laplacians of the given layer (tf.Tensor),
       if in dense format, size: [N, M, M]
    Fout: the number of output features on the given layer
    K: the filter size or number of hops on the given layer
    lyr_num: the idx of the original Laplacian layer (starting from 0)
Output:
    y: the filtered output from the given layer
'''
def chebyshev5_batch(x, L, Fout, K, lyr_num):
    N, M, Fin = x.get_shape()
    #N, M, Fin = int(N), int(M), int(Fin)
    # # Rescale Laplacian and store as a TF sparse tensor. Copy to not modify the shared L.
    # L = scipy.sparse.csr_matrix(L)
    # L = graph.rescale_L(L, lmax=2)
    # L = L.tocoo()
    # indices = np.column_stack((L.row, L.col))
    # L = tf.SparseTensor(indices, L.data, L.shape)
    # L = tf.sparse_reorder(L)
    # # Transform to Chebyshev basis
    # x0 = tf.transpose(x, perm=[1, 2, 0])  # M x Fin x N
    # x0 = tf.reshape(x0, [M, Fin*N])  # M x Fin*N

    def expand_concat(orig, new):
        new = tf.expand_dims(new, 0)  # 1 x N x M x Fin
        return tf.concat([orig, new], axis=0)  # (shape(x)[0] + 1) x N x M x Fin

    # L:    N x M x M
    # x0:   N x M x Fin
    # L*x0: N x M x Fin
    x0 = x  # N x M x Fin
    stk_x = tf.expand_dims(x0, axis=0)  # 1 x N x M x Fin (eventually K x N x M x Fin, if K>1)

    if K > 1:
        x1 = tf.matmul(L, x0)  # N x M x Fin
        stk_x = expand_concat(stk_x, x1)
    for kk in range(2, K):
        # Chebyshev recurrence, as in the original code: note the factor of 2
        x2 = 2 * tf.matmul(L, x1) - x0  # N x M x Fin
        stk_x = expand_concat(stk_x, x2)
        x0 = x1
        x1 = x2

    # now stk_x has the shape of K x N x M x Fin
    # transpose to the shape of N x M x Fin x K
    ## source positions          1  2  3    0
    stk_x_transp = tf.transpose(stk_x, perm=[1, 2, 3, 0])
    stk_x_forMul = tf.reshape(stk_x_transp, [N*M, Fin*K])

    #W = self._weight_variable([Fin*K, Fout], regularization=False)
    W_initial = tf.truncated_normal_initializer(0, 0.1)
    W = tf.get_variable('weights_L_' + str(lyr_num), [Fin*K, Fout], tf.float32, initializer=W_initial)
    tf.summary.histogram(W.op.name, W)

    y = tf.matmul(stk_x_forMul, W)
    y = tf.reshape(y, [N, M, Fout])
    return y