Outer difference along axis in tensorflow

Outer difference along axis in tensorflow - tensorflow

Given a tensor T_0 of shape (I,J,K), I am trying to think of an efficient way to construct a new tensor T_1 of shape (I,I,J,K), whose elements are T_1[i1, i2, j, k] = T_0[i1, j, k] - T_0[i2, j, k]
Here is a pedestrian approach with 2 loops:
# Pedestrian approach: build T_1 one (i1, i2) pair at a time.
# T_0 has shape (I, J, K); the result T_1 has shape (I, I, J, K).
outer_list = [None] * I
for i1 in range(I):
    inner_list = [None] * I
    for i2 in range(I):
        # Difference of two (J, K) slices.
        inner_list[i2] = T_0[i1] - T_0[i2]
    # Stack the I differences for this i1 into one (I, J, K) tensor.
    # (Index by i1; a single tf.stack is enough.)
    outer_list[i1] = tf.stack(inner_list)
T_1 = tf.stack(outer_list)
Here is a pedestrian approach with 1 loop:
# One-loop approach: broadcasting takes care of the i2 axis.
# T_0[i1:i1 + 1] keeps a leading axis of length 1, so subtracting the full
# T_0 broadcasts to (I, J, K) with entry [i2] = T_0[i1] - T_0[i2].
outer_list = [T_0[i1 : i1 + 1] - T_0 for i1 in range(I)]
T_1 = tf.stack(outer_list)  # (I, I, J, K)

Related

Efficient way to calculate 3D matrix multiplication using numpy

How can I efficiently write and calculate this multiplication using numpy:
# Reference (slow) implementation: for every output slice k, accumulate the
# products of arr1 and arr2 over k_b = 0..k into the (i, j) tile of data[k].
# NOTE(review): w and h are not defined in this snippet; given the example
# shapes (SIZE * B_SIZE = 256) they presumably both equal B_SIZE -- confirm.
for k in range(K):
    for i in range(SIZE):
        for j in range(SIZE):
            for i_b in range(B_SIZE):
                for j_b in range(B_SIZE):
                    for k_b in range(k + 1):
                        data[k, i * w + i_b, j * h + j_b] += arr1[k_b, i_b, j_b] * arr2[k_b, i, j]
For example:
SIZE, B_SIZE = 32, 8
arr1.shape -> (8, 8, 8)
arr2.shape -> (8, 32, 32)
data.shape -> (K, 256, 256)
Thank you.

You can use Numba for this kind of non-trivial case and rework the loops to use the CPU cache efficiently. Here is an example:
import numba as nb
@nb.njit
def compute(data, arr1, arr2):
    """Accumulate the blockwise products of arr1 and arr2 into data, in place.

    Same arithmetic as the naive six-level loop, but with the k_b loop moved
    outwards and arr2[k_b, i, j] hoisted into a local, so the two innermost
    loops only walk over one tile of data and arr1.

    Reads the module-level names K, SIZE, B_SIZE, w and h.
    NOTE(review): w and h are not defined in this snippet; presumably both
    equal B_SIZE -- confirm.

    Args:
        data: output array, indexed as data[k, row, col]; updated in place.
        arr1: array indexed as arr1[k_b, i_b, j_b].
        arr2: array indexed as arr2[k_b, i, j].
    """
    for k in range(K):
        for k_b in range(k + 1):
            for i in range(SIZE):
                for j in range(SIZE):
                    # Invariant over the two innermost loops.
                    tmp = arr2[k_b, i, j]
                    for i_b in range(B_SIZE):
                        for j_b in range(B_SIZE):
                            data[k, i * w + i_b, j * h + j_b] += arr1[k_b, i_b, j_b] * tmp
If you do this operation once, then you can pre-compile the Numba code by providing the types of the arrays. If K is big, then you can parallelize the code using @nb.njit(parallel=True) and use for k in nb.prange(K) rather than for k in range(K). This should be several orders of magnitude faster.

Probabilistic Record Linkage in Pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
# Two small example frames, each with a single string column "A".
X = pd.DataFrame(["One", "Two", "Three"], columns=["A"])
Y = pd.DataFrame(["One", "To", "Free"], columns=["A"])

Method A
I have not yet fully understood the theory but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International journal of epidemiology, 45(3), pp.954-964.
Here is my attempt to implement it in Pandas:
# Probability that Matches are True Matches
m = 0.95
# Probability that non-Matches are True non-Matches
n_x, n_y = len(X), len(Y)
u = min(n_x, n_y) / (n_x * n_y)
# Priors
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr  # Prior odds of a match
# Combine the dataframes: a constant temporary key turns the merge into a
# cross join (every row of X paired with every row of Y). The key is added
# on throwaway copies, so X and Y themselves never carry it.
Z = X.assign(key=1).merge(Y.assign(key=1), on='key').drop('key', axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between s1 and s2.

    Row-by-row dynamic programme: `distances` holds the previous row and
    `distances_` the row being built, so only two rows are kept at a time.

    Args:
        s1: first sequence (anything supporting len() and iteration).
        s2: second sequence.

    Returns:
        The minimum number of single-item insertions, deletions and
        substitutions that turn one sequence into the other.
    """
    # Keep s1 as the shorter sequence so each row has len(shorter) + 1 cells.
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = list(range(len(s1) + 1))
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                # Matching items: carry the diagonal value over unchanged.
                distances_.append(distances[i1])
            else:
                # Cheapest of substitution, deletion and insertion, plus one.
                distances_.append(1 + min(distances[i1], distances[i1 + 1], distances_[-1]))
        distances = distances_
    return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
    """Return the length of the longer of the two strings X and Y.

    Note that the parameters X and Y are local strings here; they shadow
    the module-level data frames of the same names inside this function.
    """
    return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
    """Return the normalised similarity 1 - D / L.

    Args:
        D: edit distance between the two strings.
        L: length of the longer string.

    Returns:
        1 when D is 0, 0 when D equals L. When L is 0 (both strings empty,
        hence identical) the result is 1.0 instead of a division by zero.
    """
    if L == 0:
        return 1.0
    return 1 - (D / L)
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Likelihood ratio
def Likelihood_ratio(C):
    """Interpolate linearly between the agreement and disagreement ratios.

    Reads the module-level m and u. For C == 1 the result is m / u; for
    C == 0 it is (1 - m) / (1 - u); values in between scale linearly.

    Args:
        C: agreement weight of a candidate pair.
    """
    full_agreement = m / u
    full_disagreement = (1 - m) / (1 - u)
    return full_agreement - (full_agreement - full_disagreement) * (1 - C)
L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])
# Match weight
def Match_weight(G):
    """Return ln(G) * ln(2) for a likelihood ratio G.

    NOTE(review): a base-2 log would be ln(G) / ln(2); this multiplies
    instead. Posterior_odds undoes exactly this form (it divides by ln(2)
    before exponentiating), so the two functions must be changed together
    if the weight is ever redefined.
    """
    return math.log(G) * math.log(2)
M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])
# Posterior odds
def Posterior_odds(R):
    """Return exp(R / ln 2) scaled by the module-level prior odds O_Pr.

    Args:
        R: match weight of a candidate pair.
    """
    return math.exp(R / math.log(2)) * O_Pr
P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])
# Probability
def Probability(O):
    """Convert odds O into a probability, O / (1 + O)."""
    return O / (1 + O)
Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. Here is a sensitivity check on m, showing that it doesn't make a lot of difference:
Method B
These assumptions won't apply to all applications but in some cases each row of X should match a row of Y. In that case:
The probabilities should sum to 1
If there are many credible candidates to match to then that should reduce the probability of getting the right one
then:
# Keep each X row's index in a column so it is carried into the merged frame.
X["I"] = X.index
# Combine the dataframes: a constant temporary key turns the merge into a
# cross join; it is added on throwaway copies and dropped from the result.
Z = X.assign(key=1).merge(Y.assign(key=1), on='key').drop('key', axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between s1 and s2.

    Row-by-row dynamic programme: `distances` holds the previous row and
    `distances_` the row being built, so only two rows are kept at a time.

    Args:
        s1: first sequence (anything supporting len() and iteration).
        s2: second sequence.

    Returns:
        The minimum number of single-item insertions, deletions and
        substitutions that turn one sequence into the other.
    """
    # Keep s1 as the shorter sequence so each row has len(shorter) + 1 cells.
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = list(range(len(s1) + 1))
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                # Matching items: carry the diagonal value over unchanged.
                distances_.append(distances[i1])
            else:
                # Cheapest of substitution, deletion and insertion, plus one.
                distances_.append(1 + min(distances[i1], distances[i1 + 1], distances_[-1]))
        distances = distances_
    return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
    """Return the length of the longer of the two strings X and Y.

    Note that the parameters X and Y are local strings here; they shadow
    the module-level data frames of the same names inside this function.
    """
    return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
    """Return the normalised similarity 1 - D / L.

    Args:
        D: edit distance between the two strings.
        L: length of the longer string.

    Returns:
        1 when D is 0, 0 when D equals L. When L is 0 (both strings empty,
        hence identical) the result is 1.0 instead of a division by zero.
    """
    if L == 0:
        return 1.0
    return 1 - (D / L)
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Normalised Agreement Weight
# Total agreement weight per X row (rows of Z that share the same 'I').
# The aggregation is named by string: passing the builtin `sum` relies on a
# callable-to-method mapping that recent pandas versions warn about.
T = Z.groupby('I').agg({'C': 'sum'})
D = T.rename(columns={'C': 'T'})
# Attach each group's total to its candidate pairs, then divide so that the
# P1 values within one 'I' group sum to 1.
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
Comparing it against Method A:
Method C
This combines method A with method B:
# Normalised Probability
# Total Method-A probability per X row (rows of Z that share the same 'I').
# The aggregation is named by string: passing the builtin `sum` relies on a
# callable-to-method mapping that recent pandas versions warn about.
U = Z.groupby('I').agg({'P': 'sum'})
E = U.rename(columns={'P': 'U'})
K = Z.set_index('I').join(E)
# Copy Method B's result alongside for comparison.
# NOTE(review): presumably relies on J and K having the same index -- confirm.
K['P1'] = J['P1']
# Rescale P so that the P2 values within one 'I' group sum to 1.
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take account of uncertainty whereas method C (P2) does.

How to vectorise an integration function?

I'm a beginner in numpy and I want to vectorise this function:
I don't quite understand what I need to do but this is what I've come up with:
# Question's original attempt, kept exactly as posted (it does not work).
n = 1000000
# NOTE(review): `print integral` below is Python 2 syntax; if this runs under
# Python 2 without true division, 1/n is integer division and h is 0 -- confirm.
h = 1/n
# Integer sample indices 1 .. n-1.
x = np.arange(1,n,1)
# Integrand.
def f(x):
return x ** 3
# Evaluate f at the scaled sample points x*h.
def rec(x):
# zeros_like(x) inherits the integer dtype of x, so the fractional values
# assigned on the next line are truncated to 0 -- hence the array of zeros.
result = np.zeros_like(x)
result[x < n] = f((x[x < n])*h)
return result
# Elementwise expression: this yields an array, not a single summed value.
integral = 0.5*h + h*rec(x)
print integral
I end up with an array of 0's. Could someone please point me in the right direction?

Try:
def trap(f, a, b, n):
    """Approximate the integral of f over [a, b] with the trapezoidal rule.

    Args:
        f: vectorised function; it is called once on a NumPy array of sample
            points and must return an array of the same length.
        a: lower integration bound.
        b: upper integration bound.
        n: number of equal-width sub-intervals (n + 1 sample points).

    Returns:
        The trapezoidal estimate: end points weighted by 1/2, interior points
        by 1, all multiplied by the step width (b - a) / n.
    """
    xs = np.linspace(a, b, n + 1)
    ys = f(xs)
    return (0.5 * ys[0] + 0.5 * ys[-1] + np.sum(ys[1:-1])) * (b - a) / n

Multiplying multidimensional array in python

I have two arrays:
L, M, N = 6, 31, 500
A = np.random.random((L, M, N))
B = np.random.random((L, L))
I am trying to get an array C such that:
C = B * A
C has dimension [L, M, N]
I tried answer posted at this link but it hasn't given me the desired output.
A for loop version of above code is:
L, M, N = 6, 31, 500
A = np.random.random((L, M, N))
B = np.random.random((L, L))
# For every j, multiply B (L x L) with the (L x N) slice A[:, j, :].
# The slice is already 2-D, so no squeeze is needed (a squeeze would also
# wrongly drop an axis if L or N were 1).
z1 = [np.dot(B, A[:, j, :]) for j in range(M)]
# Stacking along a new leading axis gives shape (M, L, N).
z2 = np.stack(z1)

I think you are looking for numpy.tensordot() where you can specify along which axes to sum:
np.tensordot(B,A,axes=(1,0))

batch process of graph_cnn in tensorflow

I want to use the graph_cnn (Defferrard et al. 2016) for inputs with a varying number of nodes. The author provided example code (see graph_cnn). Below is what I think is the critical part of the code:
def chebyshev5(self, x, L, Fout, K):
    """Filter x with Chebyshev polynomials of the Laplacian L up to order K - 1.

    Shapes below follow the inline comments of the original code.

    Args:
        x: input tensor, N x M x Fin; its static shape must be fully known
            because the dimensions are converted with int().
        L: a single graph Laplacian shared by all samples; it is converted to
            a scipy CSR matrix and rescaled here, so the caller's copy is not
            modified.
        Fout: number of output features.
        K: number of polynomial terms.

    Returns:
        Tensor of size N x M x Fout.
    """
    N, M, Fin = x.get_shape()
    N, M, Fin = int(N), int(M), int(Fin)
    # Rescale Laplacian and store as a TF sparse tensor. Copy to not modify the shared L.
    L = scipy.sparse.csr_matrix(L)
    L = graph.rescale_L(L, lmax=2)
    L = L.tocoo()
    indices = np.column_stack((L.row, L.col))
    L = tf.SparseTensor(indices, L.data, L.shape)
    L = tf.sparse_reorder(L)
    # Transform to Chebyshev basis
    x0 = tf.transpose(x, perm=[1, 2, 0])  # M x Fin x N
    x0 = tf.reshape(x0, [M, Fin*N])  # M x Fin*N
    x = tf.expand_dims(x0, 0)  # 1 x M x Fin*N

    def concat(x, x_):
        # Append one more polynomial term along the leading axis.
        x_ = tf.expand_dims(x_, 0)  # 1 x M x Fin*N
        return tf.concat([x, x_], axis=0)  # K x M x Fin*N

    if K > 1:
        x1 = tf.sparse_tensor_dense_matmul(L, x0)
        x = concat(x, x1)
    for k in range(2, K):
        # Chebyshev recurrence: T_k = 2 L T_{k-1} - T_{k-2}.
        x2 = 2 * tf.sparse_tensor_dense_matmul(L, x1) - x0  # M x Fin*N
        x = concat(x, x2)
        x0, x1 = x1, x2
    x = tf.reshape(x, [K, M, Fin, N])  # K x M x Fin x N
    x = tf.transpose(x, perm=[3,1,2,0])  # N x M x Fin x K
    x = tf.reshape(x, [N*M, Fin*K])  # N*M x Fin*K
    # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature pair.
    W = self._weight_variable([Fin*K, Fout], regularization=False)
    x = tf.matmul(x, W)  # N*M x Fout
    return tf.reshape(x, [N, M, Fout])  # N x M x Fout
Essentially, I think what this does can be simplified as something like
return = concat{(L*x)^k for (k=0 to K-1)} * W
x is the input of N x M x Fin (size variable in any batch):
L is an array of operators on x each with the size of M x M matching the corresponding sample (size variable in any batch).
W is the neural network parameters to be optimized, its size is Fin x K x Fout
N: number of samples in a batch (size fixed for any batch);
M: the number of nodes in the graph (size variable in any batch);
Fin: the number of input features (size fixed for any batch).
Fout is the number of output features (size fixed for any batch).
K is a constant representing the number of steps (hops) in the graph
For a single example, the above code works. But since both x and L have variable length for each sample in a batch, I don't know how to make it work for a batch of samples.

tf.matmul currently (v1.4) only supports batch matrix multiplication on the lowest 2 dims for dense tensors. If either of the input tensors is sparse, it raises a dimension-mismatch error. tf.sparse_tensor_dense_matmul cannot be applied to batch inputs either.
Therefore, my current solution is to move all L preparation steps before calling the function, pass the L as a dense tensor (shape: [N, M, M]), and use the tf.matmul to perform the batch matrix multiplication.
Here is my revised code:
def chebyshev5_batch(x, L, Fout, K, lyr_num):
    """Perform the graph filtering on the given layer for a batch of graphs.

    Batched variant of chebyshev5: every sample has its own Laplacian, so the
    sparse matmul is replaced by a dense batch tf.matmul. All Laplacian
    preparation that chebyshev5 did internally (sparse conversion, rescaling,
    reordering) must be done by the caller before this function is called.

    Args:
        x: the batch of inputs for the given layer,
            dense tensor, size: [N, M, Fin].
        L: the batch of sorted Laplacians of the given layer (tf.Tensor),
            in dense format, size: [N, M, M].
        Fout: the number of output features on the given layer.
        K: the filter size or number of hops on the given layer.
        lyr_num: the idx of the original Laplacian lyr (starts from 0);
            used here only to name the weight variable.

    Returns:
        y: the filtered output from the given layer, size [N, M, Fout].
    """
    # NOTE(review): the dimensions are kept as returned by get_shape() (not
    # converted with int()); the reshapes below presumably need N and M to be
    # statically known -- confirm for variable-size batches.
    N, M, Fin = x.get_shape()

    def expand_concat(orig, new):
        # Append one more polynomial term along the leading axis.
        new = tf.expand_dims(new, 0)  # 1 x N x M x Fin
        return tf.concat([orig, new], axis=0)  # (shape(x)[0] + 1) x N x M x Fin

    # L:    N x M x M
    # x0:   N x M x Fin
    # L*x0: N x M x Fin
    x0 = x  # N x M x Fin
    stk_x = tf.expand_dims(x0, axis=0)  # 1 x N x M x Fin (eventually K x N x M x Fin, if K>1)

    if K > 1:
        x1 = tf.matmul(L, x0)  # N x M x Fin
        stk_x = expand_concat(stk_x, x1)
    for kk in range(2, K):
        # Chebyshev recurrence T_k = 2 L T_{k-1} - T_{k-2}; the factor 2 is
        # required, exactly as in the single-sample chebyshev5.
        x2 = 2 * tf.matmul(L, x1) - x0  # N x M x Fin
        stk_x = expand_concat(stk_x, x2)
        x0, x1 = x1, x2

    # now stk_x has the shape of K x N x M x Fin
    # transpose to the shape of N x M x Fin x K
    ## source positions 1 2 3 0
    stk_x_transp = tf.transpose(stk_x, perm=[1, 2, 3, 0])
    stk_x_forMul = tf.reshape(stk_x_transp, [N*M, Fin*K])

    # One weight matrix per layer, named after lyr_num.
    W_initial = tf.truncated_normal_initializer(0, 0.1)
    W = tf.get_variable('weights_L_'+str(lyr_num), [Fin*K, Fout], tf.float32, initializer=W_initial)
    tf.summary.histogram(W.op.name, W)

    y = tf.matmul(stk_x_forMul, W)
    y = tf.reshape(y, [N, M, Fout])
    return y

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Outer difference along axis in tensorflow - tensorflow

Related

Efficient way to calculate 3D matrix multiplication using numpy

Probabilistic Record Linkage in Pandas

How to vectorise an integration function?

Multiplying multidimensional array in python

batch process of graph_cnn in tensorflow

Categories

Resources