Defining the correct vectorization axes for JAX vmap with arrays of different shapes and sizes - numpy

Following the answer to this post, the function 'f_switch' below, which dynamically switches between multiple functions based on an index array, is defined (based on 'jax.lax.switch'):
import jax
import jax.numpy as jnp
import jax.random as random
from jax import vmap
def g_0(x, y, z, u):
    """Return the sum of the four operands."""
    return x + y + z + u


def g_1(x, y, z, u):
    """Return the product of the four operands."""
    return x * y * z * u


def g_2(x, y, z, u):
    """Return the alternating sum ``x - y + z - u``."""
    return x - y + z - u


def g_3(x, y, z, u):
    """Return ``x`` divided successively by ``y``, ``z`` and ``u``."""
    return x / y / z / u


# Branch table: position k holds g_k.
g_i = [g_0, g_1, g_2, g_3]
@jax.jit
def f_switch(i, x, y, z, u):
    """Evaluate the branch ``g_i[i[k]]`` on ``(x, y, z, u)`` for every entry of ``i``.

    Only the index array ``i`` is vectorized; ``x``, ``y``, ``z`` and ``u`` are
    captured by the closure and handed to every branch unchanged, so each
    branch sees them with their full shapes.

    Returns an array whose leading axis matches ``i`` and whose trailing axes
    are whatever a single branch call on ``(x, y, z, u)`` produces.
    """
    def select(branch_index):
        return jax.lax.switch(branch_index, g_i, x, y, z, u)

    return jax.vmap(select)(i)
With input arrays i_ar of shape (len_i,), x_ar, y_ar and z_ar of shape (len_xyz,), and u_ar of shape (len_u, len_xyz), out = f_switch(i_ar, x_ar, y_ar, z_ar, u_ar) yields out of shape
(len_i, len_u, len_xyz), because only i_ar is vmapped and the (len_xyz,) arrays broadcast against u_ar inside each branch:
# Demo inputs for the index-only f_switch. The jnp calls below need
# `import jax.numpy as jnp` alongside the other jax imports.
len_i = 50
# One branch index per output row, drawn from [0, len(g_i)).
i_ar = random.randint(random.PRNGKey(5), shape=(len_i,), minval=0, maxval=len(g_i))

len_xyz = 3000
x_ar = random.uniform(random.PRNGKey(0), shape=(len_xyz,))
y_ar = random.uniform(random.PRNGKey(1), shape=(len_xyz,))
z_ar = random.uniform(random.PRNGKey(2), shape=(len_xyz,))

len_u = 1000
u_0 = random.uniform(random.PRNGKey(3), shape=(len_u,))
# Repeat each u_0 value len_xyz times, then fold into rows: row k is constant u_0[k].
u_1 = jnp.repeat(u_0, len_xyz)
u_ar = u_1.reshape(len_u, len_xyz)

out = f_switch(i_ar, x_ar, y_ar, z_ar, u_ar)
print('The shape of out is', out.shape)
This worked. **But how can the f_switch function be defined such that the result of out = f_switch(i_ar, x_ar, y_ar, z_ar, u_ar) has a shape of (j_len, k_len, l_len) when the function is applied along the following axes: i_ar[j], x_ar[j], y_ar[j, k], z_ar[j, k], u_ar[l]?** I am not sure how to set this up. Examples of these input arrays are here:
# Target output shape is (j_len, k_len, l_len).
j_len, k_len, l_len = 82, 20, 100

# i and x vary along j only, y and z along (j, k), u along l only.
i_ar = random.randint(random.PRNGKey(0), shape=(j_len,), minval=0, maxval=len(g_i))
x_ar = random.uniform(random.PRNGKey(1), shape=(j_len,))
y_ar = random.uniform(random.PRNGKey(2), shape=(j_len, k_len))
z_ar = random.uniform(random.PRNGKey(3), shape=(j_len, k_len))
u_ar = random.uniform(random.PRNGKey(4), shape=(l_len,))
I tried to resolve this (i.e. to get an output of shape (j_len, k_len, l_len) from the given input arrays) with a nested vmap:
#jax.jit
def f_switch(i, x, y, z, u):
    """Nested-vmap attempt from the question; the surrounding text reports that it fails.

    Kept as posted so the question still matches its code.
    """
    g = lambda i, x, y, z, u: jax.lax.switch(i, g_i, x, y, z, u)
    # Inner map: the index array is passed whole (None) while x, y, z and u
    # are all mapped over their axis 0.
    g_map = jax.vmap(g, in_axes=(None, 0, 0, 0, 0))
    wrapper = lambda x, y, z, u: g_map(i, x, y, z, u)
    # NOTE(review): in_axes has five entries but wrapper takes four positional
    # arguments, and the inner map hands a whole index array to lax.switch.
    return jax.vmap(wrapper, in_axes=(0, None, None, None, 0))(x, y, z, u)
I also tried to broadcast u_ar, u_ar_broadcast = jnp.broadcast_to(u_ar, (j_len, k_len, l_len)), and then apply it inside of the original f_switch. But both of these attempts failed.

It looks like maybe you want something like this?
@jax.jit
def f_switch(i, x, y, z, u):
    """Apply the branch selected by ``i[j]`` to ``x[j], y[j, k], z[j, k], u[l]``.

    Three nested ``vmap`` calls peel one axis each. The last one applied is
    the outermost, so it consumes the leading axis of the inputs first.

    Returns an array with axes ordered ``(j, k, l)``.
    """
    g = lambda i, x, y, z, u: jax.lax.switch(i, g_i, x, y, z, u)
    g = jax.vmap(g, (None, None, None, None, 0))  # innermost: l axis of u
    g = jax.vmap(g, (None, None, 0, 0, None))     # middle: k axis of y and z
    g = jax.vmap(g, (0, 0, 0, 0, None))           # outermost: j axis of i, x, y, z
    return g(i, x, y, z, u)


out = f_switch(i_ar, x_ar, y_ar, z_ar, u_ar)
print(out.shape)
# (82, 20, 100)
You should read the in_axes from bottom to top (because the bottom vmap is the outer one, and is therefore applied to the inputs first). Schematically, you can think of the effect of the maps on the shapes as something like this:
(i[82], x[82], y[82,20], z[82,20], u[100])
(0, 0, 0, 0, None) -> (i, x, y[20], z[20], u[100])
(None, None, 0, 0, None) -> (i, x, y, z, u[100])
(None, None, None, None, 0) -> (i, x, y, z, u)
That said, often it is easier to rely on numpy-style broadcasting rather than on multiple nested vmaps. For example, you could also do something like this:
@jax.jit
def f_switch(i, x, y, z, u):
    """Map over the shared leading axis only and let broadcasting build the rest.

    ``i``, ``x``, ``y`` and ``z`` are mapped over axis 0; ``u`` is passed whole
    to every call. The caller is expected to add trailing singleton axes to
    ``x``, ``y`` and ``z`` so that they broadcast against ``u`` in each branch.
    """
    g = lambda i, x, y, z, u: jax.lax.switch(i, g_i, x, y, z, u)
    return jax.vmap(g, in_axes=(0, 0, 0, 0, None))(i, x, y, z, u)


# Per mapped row: x is (1, 1), y and z are (k, 1), u is (l,) -> result (k, l).
out = f_switch(i_ar, x_ar[:, None, None], y_ar[:, :, None], z_ar[:, :, None], u_ar)
print(out.shape)
# (82, 20, 100)

Related

Is there a way for dynamic N-times replication of a tensor in Tensorflow custom layer (on TPU)?

I'm trying to solve quite a simple task (I thought it to be), which is replicating a tensor in custom layer on TPU.
My input is 2 tensors of shapes A=(BS, H, n, C) and B = (BS, n, W, C), where n in my case can be (1, 3, 5, 7), but should probably also work with other numbers.
My task is to repeat both tensors A & B to shape (BS, H, W, C) and then sum them for the output. It would be easy if H (or W) were always divisible by n, but they are not. So the number of repeats for each slice (BS, H, 1, C) of A would differ. Thus the output is calculated using the following pseudocode:
for i in range(W):
A1[BS, H, i, C] = A[BS, H, floor(n*i/W), C]
I tried implementing it in multiple ways:
class StripPoolingCombine(tf.keras.layers.Layer):
    """Stretch two strip tensors to a common H x W grid and add them.

    Repeat counts come from ``tf.unique_with_counts`` applied to
    ``floor(range(size) * n / size)``.
    """

    def __init__(self, n=1):
        super().__init__()
        self.n = n

    def call(self, v, h, training=False):
        height = v.shape[1]
        width = h.shape[2]
        # Source-slice index for every target column / row.
        column_bins = tf.math.floor(tf.range(width) * self.n / width)
        row_bins = tf.math.floor(tf.range(height) * self.n / height)
        # The last element of unique_with_counts is the per-bin count.
        v_repeats = tf.unique_with_counts(column_bins)[-1]
        h_repeats = tf.unique_with_counts(row_bins)[-1]
        stretched_v = tf.repeat(v, repeats=v_repeats, axis=2)
        stretched_h = tf.repeat(h, repeats=h_repeats, axis=1)
        return Add()([stretched_v, stretched_h])
Or by replacing unique_with_counts with the following logic:
# Per-bin counts of floor(range(W) * n / W); the closing parenthesis of bincount was missing.
tf.math.bincount(tf.cast(tf.math.floor(tf.range(W) * self.n / W), dtype=tf.int32))
Using improvised formula:
# Closed-form repeat counts built without unique_with_counts / bincount.
# f and s are ceil(W / n) and floor(W / n).
f = tf.cast(tf.math.ceil(W / self.n), dtype=tf.int32)
s = tf.cast(tf.math.floor(W / self.n), dtype=tf.int32)
# b is 1 when W is not an exact multiple of n, else 0.
b = tf.cast(f!=s, dtype=tf.int32)
# Columns left over after one strip of width f and n - 1 strips of width s.
r = W - f - s * (self.n - 1)
# Baseline: n - 1 strips of width s.
x1 = s * tf.ones(self.n-1, dtype=tf.int32)
# Alternating 1, 0, 1, 0, ... of length 2 * r, zeroed when b is 0.
x2 = (1 - tf.range(r*2) % 2) * b
# Right-pad with zeros to length n - 1 so it lines up with x1.
x2 = tf.pad(x2, paddings=[[0, self.n-r*2-1]])
# Final counts: one strip of width f followed by the adjusted n - 1 strips.
x3 = tf.concat([[f], tf.add(x1, x2)], axis=0)
But as can be seen at Available TensorFlow Ops for TPU, the TPU doesn't support dynamic tf.range, tf.unique_with_counts or tf.math.bincount, and my implementations all result in errors when building a model and calling model.fit() or model.predict(). Yet I still hope that TensorFlow provides some way to work with dynamic shapes that would suit my task, and won't make me rewrite the whole Ops module for such a trivial issue. Please help!
Full reproducible example (using Colab TPU):
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Add
# Default to the current (non-TPU) strategy so that `strategy` is always
# defined, even when no TPU is reachable or its initialization fails.
strategy = tf.distribute.get_strategy()

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU: {tpu.master()}')
except ValueError:
    print('Could not connect to TPU')
    tpu = None

if tpu:
    try:
        print('Initializing TPU...')
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.TPUStrategy(tpu)
        print('TPU initialized!')
    except Exception as err:
        # Best effort: report the cause and keep the default strategy.
        print('Failed to initialize TPU')
        print(f'Reason: {err}')
# class StripPoolingCombine(tf.keras.layers.Layer):
# def __init__(self, n=1):
# super(StripPoolingCombine, self).__init__()
# self.n = n
# def call(self, v, h, training=False):
# H, W = v.shape[1], h.shape[2]
# v_repeats = tf.unique_with_counts(tf.math.floor(tf.range(W) * self.n / W))[-1]
# h_repeats = tf.unique_with_counts(tf.math.floor(tf.range(H) * self.n / H))[-1]
# v = tf.repeat(v, repeats=v_repeats, axis=2)
# h = tf.repeat(h, repeats=h_repeats, axis=1)
# return Add()([v, h])
class StripPoolingCombine(tf.keras.layers.Layer):
    """Stretch two strip tensors with closed-form repeat counts and add them.

    The counts are derived from the dynamic width of ``h`` only and are used
    for both inputs.
    """

    def __init__(self, n=1):
        super().__init__()
        self.n = n

    def call(self, v, h, training=False):
        width = tf.shape(h)[2]
        ratio = width / self.n
        wide = tf.cast(tf.math.ceil(ratio), dtype=tf.int32)
        narrow = tf.cast(tf.math.floor(ratio), dtype=tf.int32)
        # 1 when width is not an exact multiple of n, else 0.
        uneven = tf.cast(wide != narrow, dtype=tf.int32)
        leftover = width - wide - narrow * (self.n - 1)
        base = narrow * tf.ones(self.n - 1, dtype=tf.int32)
        # Alternating 1, 0, ... pattern, right-padded with zeros to length n - 1.
        bump = (1 - tf.range(leftover * 2) % 2) * uneven
        bump = tf.pad(bump, paddings=[[0, self.n - leftover * 2 - 1]])
        repeats = tf.concat([[wide], tf.add(base, bump)], axis=0)
        stretched_v = tf.repeat(v, repeats=repeats, axis=2)
        stretched_h = tf.repeat(h, repeats=repeats, axis=1)
        return tf.add(stretched_v, stretched_h)
def build_model(n=7):
    """Build a two-input model that combines strips of width ``n``.

    Args:
        n: Strip count. Sets the size of the strip axis of both inputs and is
            forwarded to ``StripPoolingCombine`` so the layer matches them
            (previously the layer always ran with its default ``n=1``).

    Returns:
        A Keras ``Model`` with inputs of shape (256, n, 3) and (n, 256, 3).
    """
    v = Input(shape=(256, n, 3))
    h = Input(shape=(n, 256, 3))
    outputs = StripPoolingCombine(n)(v, h)
    return Model(inputs=[v, h], outputs=outputs)
# Reset Keras global state before building a fresh model.
tf.keras.backend.clear_session()
# Create and compile the model inside the distribution strategy's scope.
with strategy.scope():
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999)
    model = build_model()
    model.compile(optimizer=optimizer, loss='mean_squared_error')
# One random sample per input, shaped like the Input layers of build_model() with n = 7.
rng_1 = tf.random.uniform([1, 256, 7, 3])
rng_2 = tf.random.uniform([1, 7, 256, 3])
model.predict([rng_1, rng_2])
Use tf.gather:
def call(self, v, h, training=False):
    """Stretch ``v`` along axis 2 and ``h`` along axis 1 with ``tf.gather``, then add.

    Target position ``k`` along a stretched axis of length ``size`` reads the
    source slice ``floor(n * k / size)``.
    """
    def stretch(tensor, size, axis):
        positions = tf.range(size)
        source = tf.cast(tf.floor(self.n * positions / size), tf.int32)
        return tf.gather(tensor, source, axis=axis)

    height = tf.shape(v)[1]
    width = tf.shape(h)[2]
    return tf.add(stretch(v, width, 2), stretch(h, height, 1))

Getting None Error while creating Spatial Transformer Network

Output feature map of a convolution layer is (Batch, Height, Width, Channels). When we initialize the CNN in tensorflow we get None value in place of Batch. I am trying to implement Spatial Transformer Network in custom layer, so to vectorize the layer as Convolution Layer Batch Size is required. When I try to initialize the network the Spatial Transformer Layer is giving the error that operations cant be performed with None value.
My code is shown below
class SpatialTransformer(Layer):
    """Resample a feature map on an affine-transformed sampling grid.

    ``call`` builds a normalized [-1, 1] grid, transforms it with the affine
    matrices in ``theta`` and bilinearly samples ``feature_map`` at the result.
    """

    def __init__(self):
        super(SpatialTransformer, self).__init__()

    def affine_transform(self, input_shape, theta):
        """Return transformed sampling coordinates reshaped to [N, 2, H, W].

        Args:
            input_shape: ``(H, W)`` of the output grid.
            theta: affine parameters that are matrix-multiplied with the
                homogeneous grid (assumed [N, 2, 3] - confirm with caller).
        """
        # Static batch size. The traceback quoted after this listing shows it
        # is None while the model is being built, which tf.stack rejects.
        N = theta.shape[0]
        H, W = input_shape  # output dimensions of grid
        x_t, y_t = tf.meshgrid(tf.linspace(-1, 1, W), tf.linspace(-1, 1, H))
        x_t = tf.cast(tf.reshape(x_t, [-1]), dtype=tf.float32)
        y_t = tf.cast(tf.reshape(y_t, [-1]), dtype=tf.float32)
        ones = tf.ones(x_t.shape, dtype=tf.float32)
        # Homogeneous coordinates, one column (x, y, 1) per grid point.
        sampling_grids = tf.stack([x_t, y_t, ones])
        sampling_grids = tf.expand_dims(sampling_grids, axis=0)
        sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1]))
        batch_grids = tf.matmul(theta, sampling_grids)
        batch_grids = tf.reshape(batch_grids, [N, 2, H, W])
        return batch_grids

    def get_pixel_value(self, feature_map, x_s, y_s):
        """Gather pixels of a 4-D feature map at integer positions ``x_s``, ``y_s``."""
        N, H, W = x_s.shape
        batch_idx = tf.range(0, N)
        batch_idx = tf.reshape(batch_idx, (N, 1, 1))
        b = tf.tile(batch_idx, (1, H, W))
        # Indices of shape (N, H, W, 3): (batch, row, column) per output pixel.
        indices = tf.stack([b, y_s, x_s], 3)
        return tf.gather_nd(feature_map, indices)

    def bilinear_sampler(self, feature_map, x, y):
        """Bilinearly interpolate ``feature_map`` at normalized coordinates ``x``, ``y``."""
        N, H, W, C = feature_map.shape
        max_y = tf.cast(H - 1, dtype=tf.int32)
        max_x = tf.cast(W - 1, dtype=tf.int32)
        zero = tf.zeros([], dtype=tf.int32)
        x = tf.cast(x, dtype=tf.float32)
        y = tf.cast(y, dtype=tf.float32)
        # Rescale the grid from [-1, 1] to [0, W-1] and [0, H-1].
        x = (x + 1.0) * tf.cast(max_x, dtype=tf.float32) / 2.0
        y = (y + 1.0) * tf.cast(max_y, dtype=tf.float32) / 2.0
        # The four nearest integer positions around (x, y).
        x0 = tf.cast(tf.floor(x), dtype=tf.int32)
        x1 = x0 + 1
        y0 = tf.cast(tf.floor(y), dtype=tf.int32)
        y1 = y0 + 1
        # Clip to the valid index range [0, W-1] / [0, H-1].
        x0 = tf.clip_by_value(x0, zero, max_x)
        x1 = tf.clip_by_value(x1, zero, max_x)
        y0 = tf.clip_by_value(y0, zero, max_y)
        y1 = tf.clip_by_value(y1, zero, max_y)
        # Pixel values at the corners (x0, y0), (x0, y1), (x1, y0), (x1, y1).
        Ia = self.get_pixel_value(feature_map, x0, y0)
        Ib = self.get_pixel_value(feature_map, x0, y1)
        Ic = self.get_pixel_value(feature_map, x1, y0)
        Id = self.get_pixel_value(feature_map, x1, y1)
        # Back to float32 for the weight arithmetic.
        x0 = tf.cast(x0, dtype=tf.float32)
        x1 = tf.cast(x1, dtype=tf.float32)
        y0 = tf.cast(y0, dtype=tf.float32)
        y1 = tf.cast(y1, dtype=tf.float32)
        # Area weights for the interpolation, with a trailing channel axis.
        Wa = tf.expand_dims((x1 - x) * (y1 - y), axis=3)
        Wb = tf.expand_dims((x1 - x) * (y - y0), axis=3)
        Wc = tf.expand_dims((x - x0) * (y1 - y), axis=3)
        Wd = tf.expand_dims((x - x0) * (y - y0), axis=3)
        out = tf.add_n([Wa * Ia, Wb * Ib, Wc * Ic, Wd * Id])
        return out

    def call(self, feature_map, theta, out_size=None):
        """Warp ``feature_map`` with ``theta``; ``out_size`` overrides the output (H, W)."""
        N, H, W, _ = feature_map.shape
        if out_size:
            out_H = out_size[0]
            out_W = out_size[1]
            batch_grids = self.affine_transform([out_H, out_W], theta)
        else:
            batch_grids = self.affine_transform([H, W], theta)
        x_s = batch_grids[:, 0, :, :]
        # Channel 1 holds the y coordinates; reading channel 0 twice made
        # every sample lie on the diagonal x == y.
        y_s = batch_grids[:, 1, :, :]
        output_feature_map = self.bilinear_sampler(feature_map, x_s, y_s)
        return output_feature_map
class Localisation_Network(Layer):
    """Small conv + dense regressor whose six outputs are reshaped to (2, 3)."""

    def __init__(self):
        super().__init__()
        he = "he_normal"
        self.conv = Conv2D(4, (3, 3), padding="valid", strides=2,
                           activation="relu", kernel_initializer=he)
        self.flatten = Flatten()
        self.dense_1 = Dense(64, activation="relu", kernel_initializer=he)
        self.dense_2 = Dense(6, activation="linear")
        self.reshape = Reshape((2, 3))

    def call(self, input_tensor):
        features = self.flatten(self.conv(input_tensor))
        hidden = self.dense_1(features)
        return self.reshape(self.dense_2(hidden))
def get_model():
    """Build the 28x28x1 classifier with a spatial transformer after the first two convs."""
    def conv(filters, **extra):
        # Every conv in this model shares kernel size, padding, activation and init.
        return Conv2D(filters, (3, 3), padding="same", activation="relu",
                      kernel_initializer="he_normal", **extra)

    x_input = Input((28, 28, 1))
    u = conv(16)(x_input)
    u = conv(16, strides=2)(u)
    theta = Localisation_Network()(u)
    v = SpatialTransformer()(u, theta)
    v = conv(32)(v)
    x = conv(32, strides=2)(v)
    x = GlobalAveragePooling2D()(x)
    x = Flatten()(x)
    x = Dense(10, activation="softmax")(x)
    return Model(inputs=x_input, outputs=x)
Error of the above code:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-47-d630585afd1d> in <module>()
4 u = Conv2D(16, (3, 3), padding = "same", strides = 2, activation="relu", kernel_initializer="he_normal")(u)
5 theta = Localisation_Network()(u)
----> 6 v = SpatialTransformer()(u, theta)
7 v = Conv2D(32, (3, 3), padding = "same", activation= "relu", kernel_initializer="he_normal")(v)
8 x = Conv2D(32, (3, 3), padding = "same", strides = 2, activation= "relu", kernel_initializer="he_normal")(v)
4 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
668 except Exception as e: # pylint:disable=broad-except
669 if hasattr(e, 'ag_error_metadata'):
--> 670 raise e.ag_error_metadata.to_exception(e)
671 else:
672 raise
ValueError: in user code:
<ipython-input-7-910b0adb6eb7>:83 call *
batch_grids = self.affine_transform([H, W], theta)
<ipython-input-45-eb5ac5f8f722>:14 affine_transform *
sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper **
return target(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py:1405 stack
value_shape = ops.convert_to_tensor(values[0], name=name)._shape_tuple() # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/profiler/trace.py:163 wrapped
return func(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1540 convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:339 _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:265 constant
allow_broadcast=True)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:283 _constant_impl
allow_broadcast=allow_broadcast))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_util.py:445 make_tensor_proto
raise ValueError("None values not supported.")
ValueError: None values not supported.
It is hard to tell from here but based on stacktrace seems like this line is problematic - sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1])) (forwards None where is not expected).
2nd thing I have noticed - not sure if your call method override in SpatialTransformer should actually have 3 params def call(self, feature_map, theta, out_size = None): ?
Seems like since it inherits from Layer it should have input_tensor param only.
Not sure also if you need to override build for your use case and perhaps do the initializations required there.
Other than that you can try to extensively log (add print statements) and see where exactly None value 'enters'.
Finally, you can also upload an excerpt of your code sufficient to reproduce the same error and that could perhaps bring more help.
I have removed the tf.tile layer, as the vectorized output of the localisation network, which has dimension (None, 2, 3), will do the vectorization trick during the tf.matmul operation. I have also replaced the tf.reshape operation with the predefined Keras reshape layer tf.keras.layers.Reshape() for every reshape operation, as it maintains the vectorization.
class SpatialTransformer(Layer):
    """Resample a feature map on an affine-transformed grid of fixed output size.

    Args:
        out_size: ``(H, W)`` of the output grid.
        name: Keras layer name; it is now forwarded to the base ``Layer``
            instead of being silently dropped.
    """

    def __init__(self, out_size, name="spatial_transformer"):
        super(SpatialTransformer, self).__init__(name=name)
        self.out_size = out_size
        # Keras Reshape layers keep the batch axis and reshape only the rest.
        self.reshape_1 = Reshape([2, self.out_size[0], self.out_size[1]])
        self.reshape_2 = Reshape([self.out_size[0], self.out_size[1]])
        # Not used by this class; kept so existing attribute access still works.
        self.reshape_3 = Reshape([1, 1])
        self.reshape_4 = Reshape([])

    def affine_transform(self, input_shape, theta):
        """Return transformed sampling coordinates of shape [batch, 2, H, W]."""
        H, W = input_shape  # output dimensions of grid
        x_t, y_t = tf.meshgrid(tf.linspace(-1, 1, W), tf.linspace(-1, 1, H))
        x_t = tf.cast(tf.reshape(x_t, [-1]), dtype=tf.float32)
        y_t = tf.cast(tf.reshape(y_t, [-1]), dtype=tf.float32)
        ones = tf.ones(x_t.shape, dtype=tf.float32)
        # Homogeneous coordinates, one column (x, y, 1) per grid point. No
        # explicit tiling: matmul broadcasts the grid over the batch of theta.
        sampling_grids = tf.stack([x_t, y_t, ones])
        batch_grids = tf.matmul(theta, sampling_grids)
        batch_grids = self.reshape_1(batch_grids)
        return batch_grids

    def get_pixel_value(self, feature_map, x_s, y_s):
        """Gather pixels of a 4-D feature map at integer positions ``x_s``, ``y_s``."""
        _, H, W = x_s.shape
        # The static batch dimension may be None for symbolic inputs, so the
        # batch index is built from the run-time shape instead.
        batch_size = tf.shape(x_s)[0]
        batch_idx = tf.reshape(tf.range(0, batch_size), (-1, 1, 1))
        b = tf.tile(batch_idx, (1, H, W))
        # Indices of shape (batch, H, W, 3): (batch, row, column) per output pixel.
        indices = tf.stack([b, y_s, x_s], 3)
        return tf.gather_nd(feature_map, indices)

    def bilinear_sampler(self, feature_map, x, y):
        """Bilinearly interpolate ``feature_map`` at normalized coordinates ``x``, ``y``."""
        N, H, W, _ = feature_map.shape
        max_y = tf.cast(H - 1, dtype=tf.int32)
        max_x = tf.cast(W - 1, dtype=tf.int32)
        zero = tf.zeros([], dtype=tf.int32)
        x = tf.cast(x, dtype=tf.float32)
        y = tf.cast(y, dtype=tf.float32)
        # Rescale the grid from [-1, 1] to [0, W-1] and [0, H-1].
        x = (x + 1.0) * tf.cast(max_x, dtype=tf.float32) / 2.0
        y = (y + 1.0) * tf.cast(max_y, dtype=tf.float32) / 2.0
        # The four nearest integer positions around (x, y).
        x0 = tf.cast(tf.floor(x), dtype=tf.int32)
        x1 = x0 + 1
        y0 = tf.cast(tf.floor(y), dtype=tf.int32)
        y1 = y0 + 1
        # Clip to the valid index range [0, W-1] / [0, H-1].
        x0 = tf.clip_by_value(x0, zero, max_x)
        x1 = tf.clip_by_value(x1, zero, max_x)
        y0 = tf.clip_by_value(y0, zero, max_y)
        y1 = tf.clip_by_value(y1, zero, max_y)
        # Pixel values at the corners (x0, y0), (x0, y1), (x1, y0), (x1, y1).
        Ia = self.get_pixel_value(feature_map, x0, y0)
        Ib = self.get_pixel_value(feature_map, x0, y1)
        Ic = self.get_pixel_value(feature_map, x1, y0)
        Id = self.get_pixel_value(feature_map, x1, y1)
        # Back to float32 for the weight arithmetic.
        x0 = tf.cast(x0, dtype=tf.float32)
        x1 = tf.cast(x1, dtype=tf.float32)
        y0 = tf.cast(y0, dtype=tf.float32)
        y1 = tf.cast(y1, dtype=tf.float32)
        # Area weights for the interpolation, with a trailing channel axis.
        Wa = tf.expand_dims((x1 - x) * (y1 - y), axis=3)
        Wb = tf.expand_dims((x1 - x) * (y - y0), axis=3)
        Wc = tf.expand_dims((x - x0) * (y1 - y), axis=3)
        Wd = tf.expand_dims((x - x0) * (y - y0), axis=3)
        out = tf.add_n([Wa * Ia, Wb * Ib, Wc * Ic, Wd * Id])
        return out

    def call(self, input_tensor):
        """Warp the feature map; ``input_tensor`` is the pair ``(feature_map, theta)``."""
        feature_map, theta = input_tensor
        N, H, W, _ = feature_map.shape
        if self.out_size:
            out_H = self.out_size[0]
            out_W = self.out_size[1]
            batch_grids = self.affine_transform([out_H, out_W], theta)
        else:
            batch_grids = self.affine_transform([H, W], theta)
        x_s = self.reshape_2(batch_grids[:, 0, :, :])
        y_s = self.reshape_2(batch_grids[:, 1, :, :])
        output_feature_map = self.bilinear_sampler(feature_map, x_s, y_s)
        return output_feature_map
class Localisation_Network(Layer):
    """Regress a (2, 3) affine matrix, starting from the identity transform.

    The last dense layer has zero kernel weights and an identity-transform
    bias.
    """

    def __init__(self):
        super().__init__()
        he = "he_normal"
        self.conv_1 = Conv2D(16, (3, 3), padding="same", strides=1,
                             activation="relu", kernel_initializer=he)
        self.conv_2 = Conv2D(32, (3, 3), padding="same", strides=1,
                             activation="relu", kernel_initializer=he)
        self.flatten = Flatten()
        self.dense_1 = Dense(32, activation="relu", kernel_initializer=he)

        def bias_init(shape, dtype=None):
            # Flattened identity affine matrix; shape and dtype are ignored.
            identity = tf.Variable([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
            return tf.reshape(identity, -1)

        self.dense_2 = Dense(6, kernel_initializer="zeros", bias_initializer=bias_init)
        self.reshape = Reshape((2, 3))

    def call(self, input_tensor):
        features = self.conv_2(self.conv_1(input_tensor))
        # Global average over the two spatial axes.
        pooled = tf.reduce_mean(features, axis=[1, 2])
        params = self.dense_2(self.dense_1(pooled))
        return self.reshape(params)
def transformer_model_2():
    """Build the classifier that applies the spatial transformer directly to the input image."""
    def conv(filters, **extra):
        # Every conv in this model shares kernel size, padding, activation and init.
        return Conv2D(filters, (3, 3), padding="same", activation="relu",
                      kernel_initializer="he_normal", **extra)

    x_input = Input((28, 28, 1))
    theta = Localisation_Network()(x_input)
    transformer = SpatialTransformer(x_input.shape[1:3], name="transformer_output")
    x = transformer([x_input, theta])
    x = conv(16)(x)
    x = conv(16, strides=2)(x)
    x = conv(32)(x)
    x = conv(32, strides=2)(x)
    x = GlobalAveragePooling2D()(x)
    x = Flatten()(x)
    x = Dense(10, activation="softmax")(x)
    return Model(inputs=x_input, outputs=x)
The only thing I am stuck on is localization network, as it is a regression network so linear activation is placed but the output of this network causes the value to be big and so clipped later in bilinear sampling which ultimately results in zero output and hence gradients are not able to flow through localization network.
I have looked up medium post and github to find the solution, many of them suggested to initialize the weights to zeros and biases to identity: [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]] in the last layer of the localization network but it's not working.

scipy.optimize.minimize: l2 norm constraints in matrix rows

I am interested to apply l2 norm constraint in each row of the parameters matrix in scipy.optimize.minimize. What I have tried so far is
def l2_const(x, shape=None):
    """Equality-constraint residuals forcing every row to have unit L2 norm.

    Args:
        x: flat parameter vector holding a row-major (rows, cols) matrix.
        shape: optional ``(rows, cols)``; falls back to the module-level
            ``r`` and ``c`` when omitted.

    Returns:
        A 1-D array with one residual ``||row|| - 1`` per row. Each constraint
        is returned once: broadcasting it over the columns only duplicated
        every equation ``cols`` times, giving the solver linearly dependent
        constraints.
    """
    n_rows, n_cols = (r, c) if shape is None else shape
    rows = np.reshape(x, (n_rows, n_cols))
    return np.linalg.norm(rows, axis=1) - 1
# minimize expects a 1-D starting point; l2_const reshapes it back to (r, c).
x0 = np.random.random((r, c)).ravel()
const = ({'type': 'eq', 'fun': l2_const},)
f_min = minimize(fun=cost, x0=x0, method='SLSQP', jac=gradient, constraints=const)
but the computed parameters f_min.x are all zeros. Does anyone know how to implement correctly this type of constraints?
EDIT 1: An example to apply this type of constraints can be found in my answer of my previous post.
EDIT 2: Below you can find a complete working example. The results are very low when the constraints are used. Any suggestions are welcome.
Class:
import numpy as np
from scipy.optimize import minimize
from sklearn import preprocessing
class myLR():
    """Softmax (multinomial logistic) regression fitted with ``scipy.optimize.minimize``.

    The weights form an ``(n_classes, n_features)`` matrix that is passed to
    the optimizer as a flat vector and reshaped inside every callback.
    """

    def __init__(self, reltol=1e-8, maxit=1000, opt_method=None, verbose=True, seed=0):
        self.maxit = maxit
        # NOTE(review): reltol is stored but never handed to minimize.
        self.reltol = reltol
        self.seed = seed
        self.verbose = verbose
        self.opt_method = opt_method
        self.lbin = preprocessing.LabelBinarizer()

    def w_2d(self, w, n_classes):
        """Reshape a flat weight vector to ``(n_classes, n_features)``."""
        return np.reshape(w, (n_classes, -1))

    def softmax(self, W, X):
        """Return row-wise class probabilities for samples ``X`` under weights ``W``."""
        logits = X @ W.T
        # Subtracting the row maximum leaves the result unchanged and keeps
        # exp() from overflowing on large logits.
        logits = logits - logits.max(axis=1, keepdims=True)
        a = np.exp(logits)
        return a / np.sum(a, axis=1, keepdims=True)

    def squared_norm(self, x):
        """Return the sum of squares of all elements of ``x``."""
        x = np.ravel(x, order='K')
        return np.dot(x, x)

    def cost(self, W, X, T, n_samples, n_classes):
        """Mean cross-entropy between one-hot targets ``T`` and the softmax output."""
        W = self.w_2d(W, n_classes)
        log_O = np.log(self.softmax(W, X))
        c = -(T * log_O).sum()
        return c / n_samples

    def gradient(self, W, X, T, n_samples, n_classes):
        """Flat gradient of ``cost`` with respect to the weights."""
        W = self.w_2d(W, n_classes)
        O = self.softmax(W, X)
        grad = -(T - O).T.dot(X)
        return grad.ravel() / n_samples

    def l1_constraint(self, x, n_classes, n_features):
        """Residuals ``sum(row) - 1`` per weight row, repeated once per feature.

        NOTE(review): the repetition hands the solver duplicate constraints;
        the EDIT 3 snippet further down returns each residual once instead.
        """
        x = x.reshape(n_classes, -1)
        b = x.sum(axis=1) - 1
        return np.broadcast_to(b[:, None], (n_classes, n_features)).flatten()

    def fit(self, X, y=None):
        """Fit the weights on ``X`` / ``y`` and store them in ``coef_`` and ``W_``."""
        n_classes = len(np.unique(y))
        n_samples, n_features = X.shape
        if n_classes == 2:
            # Build the two-column one-hot target matrix by hand.
            T = np.zeros((n_samples, n_classes), dtype=np.float64)
            for i, cls in enumerate(np.unique(y)):
                T[y == cls, i] = 1
        else:
            T = self.lbin.fit_transform(y)
        np.random.seed(self.seed)
        # minimize expects a 1-D starting point; the callbacks reshape it.
        W_0 = np.random.random((n_classes, n_features)).ravel()
        const = ({'type': 'eq', 'fun': self.l1_constraint, 'args': (n_classes, n_features,)},)
        options = {'disp': self.verbose, 'maxiter': self.maxit}
        f_min = minimize(fun=self.cost, x0=W_0,
                         args=(X, T, n_samples, n_classes),
                         method=self.opt_method,
                         constraints=const,
                         jac=self.gradient,
                         options=options)
        self.coef_ = self.w_2d(f_min.x, n_classes)
        self.W_ = self.coef_
        return self

    def predict_proba(self, X):
        """Return class probabilities for ``X`` using the fitted weights."""
        O = self.softmax(self.W_, X)
        return O

    def predict(self, X):
        """Return the column index of the most probable class for each row of ``X``."""
        sigma = self.predict_proba(X)
        y_pred = np.argmax(sigma, axis=1)
        return y_pred
Main:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from myLR import myLR
# Fit myLR on the first two iris features and plot the class-probability maps.
iris = datasets.load_iris()
X = iris.data[:, 0:2]
y = iris.target

par_dict2 = dict(reltol=1e-6, maxit=20000, verbose=20, seed=0)

# Create different classifiers.
classifiers = {'myLR': myLR(**par_dict2)}
n_classifiers = len(classifiers)

plt.figure(figsize=(3 * 2, n_classifiers * 2))
plt.subplots_adjust(bottom=.2, top=.95)

# Dense 100 x 100 grid over the plotted feature range.
xx, yy = np.meshgrid(np.linspace(3, 9, 100), np.linspace(1, 5, 100).T)
Xfull = np.c_[xx.ravel(), yy.ravel()]

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X, y)
    coef_ = classifier.coef_
    print(np.linalg.norm(coef_, axis=1))

    y_pred = classifier.predict(X)
    accuracy = accuracy_score(y, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))

    # View probabilities:
    probas = classifier.predict_proba(Xfull)
    n_classes = np.unique(y_pred).size
    for k in range(n_classes):
        plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
        plt.title("Class %d" % k)
        if k == 0:
            plt.ylabel(name)
        class_map = probas[:, k].reshape((100, 100))
        imshow_handle = plt.imshow(class_map, extent=(3, 9, 1, 5), origin='lower')
        plt.xticks(())
        plt.yticks(())
        idx = (y_pred == k)
        if idx.any():
            plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='w', edgecolor='k')

# Shared horizontal colour bar below the panels.
ax = plt.axes([0.15, 0.04, 0.7, 0.05])
plt.title("Probability")
plt.colorbar(imshow_handle, cax=ax, orientation='horizontal')
plt.show()
EDIT 3: I replaced the constraints, with
def l1_constraint(self, x, n_classes, n_features):
    """Return one residual ``sum(row) - 1`` per row of the reshaped weight vector."""
    row_sums = x.reshape(n_classes, -1).sum(axis=1)
    return row_sums - 1
It produces better results. However, the computed components x1 and x2 do not sum to 1? Is that fine?

Implementing the Cosine similarity in tensor flow

My Question is for the below equation
The equation above is for a single vector. But if I have batches of vectors, e.g. my X and Y have the dimension (None, 32), then there will be some issues.
Also remember that in the coding environment, each example inside the batch is already in transposed shape. My problem is that when we need to transpose a [None, 32] tensor, the code will not accept the transpose for the None dimension. So I solved it in the following way:
def Cosine_similarity(X, Y, feature_dim):
    """Row-wise cosine similarity of ``X`` and ``Y`` after a shared square projection.

    The projection is a fresh Glorot-normal ``feature_dim`` x ``feature_dim``
    matrix drawn inside the call. Returns one similarity per row.
    """
    L = tf.compat.v1.initializers.glorot_normal()(shape=[feature_dim, feature_dim])
    x_proj = tf.matmul(X, L)
    y_proj = tf.matmul(Y, L)
    dot = tf.reduce_sum(tf.multiply(x_proj, y_proj), axis=1)
    x_len = tf.sqrt(tf.reduce_sum(tf.multiply(x_proj, x_proj), axis=1))
    y_len = tf.sqrt(tf.reduce_sum(tf.multiply(y_proj, y_proj), axis=1))
    return tf.divide(dot, tf.multiply(x_len, y_len))
And this is coming from the following:
<XA.YA> = (XA)^T (YA)
= tf.reduce_sum(tf.multiply((X A) , (Y A)), axis = 1)
So I just want to know whether this implementation is right. Or you can correct me if I am missing something.
Not sure I understand your concern for the (none) dimension.
If I understand correctly the cosine similarity between two identically shaped matrix X and Y ([batch, target_dim]) is just a matrix multiplication of X * Y^T with some L2 normalization. Note X would be your out1 and Y would be your out2.
def Cosine_similarity(x, y, A):
    """Pair-wise cosine similarity of two batches after projecting with ``A``.

    Both inputs are first projected: ``X = x A^T`` and ``Y = y A^T``, each of
    shape [batch, target_dim].

    Args:
      x: shaped [batch, feature_dim].
      y: shaped [batch, feature_dim].
      A: shaped [target_dim, feature_dim]. Projection from `feature_dim` to
        `target_dim`.

    Returns:
      A matrix shaped [batch, batch] whose entry (i, j) is the cosine
      similarity between `X[i, :]` and `Y[j, :]`, i.e. between the i-th
      example of `x` and the j-th example of `y` after projection.
    """
    x_proj = tf.matmul(x, A, transpose_b=True)
    y_proj = tf.matmul(y, A, transpose_b=True)
    x_unit = tf.nn.l2_normalize(x_proj, axis=-1)
    y_unit = tf.nn.l2_normalize(y_proj, axis=-1)
    # Dot products of unit vectors are the cosine similarities.
    return tf.matmul(x_unit, tf.transpose(y_unit, [1, 0]))
import numpy as np
# TF1-style demo: feed random data through Cosine_similarity and print the result.
feature_dim = 8
target_dim = 4
batch_size = 2

# The placeholders previously used an undefined name `dim`; the inputs have
# feature_dim columns.
x = tf.placeholder(tf.float32, shape=(None, feature_dim))
y = tf.placeholder(tf.float32, shape=(None, feature_dim))
A = tf.placeholder(tf.float32, shape=(target_dim, feature_dim))
sim = Cosine_similarity(x, y, A)

with tf.Session() as sess:
    # The placeholder names are rebound to the evaluated arrays here.
    x, y, sim = sess.run([x, y, sim], feed_dict={
        x: np.ones((batch_size, feature_dim)),
        y: np.random.rand(batch_size, feature_dim),
        A: np.random.rand(target_dim, feature_dim)})

print('x=\n', x)
print('y=\n', y)
print('sim=\n', sim)
Result:
x=
[[ 1. 1. 1. 1. 1. 1. 1. 1.]
[ 1. 1. 1. 1. 1. 1. 1. 1.]]
y=
[[ 0.01471654 0.76577073 0.97747731 0.06429122 0.91344446 0.47987637
0.09899797 0.773938 ]
[ 0.8555786 0.43403915 0.92445409 0.03393625 0.30154493 0.60895061
0.1233703 0.58597666]]
sim=
[[ 0.95917791 0.98181278]
[ 0.95917791 0.98181278]]

Emulating boolean masks in Theano

I'm porting a numpy expression to theano. The expression finds the number of true positive predictions for each class, given a one-hot matrix Y of ground truth classes and a one-hot matrix Y_hat of predicted classes. The numpy code is:
import numpy as np
# Ground-truth and predicted class labels.
y = np.array([1, 0, 1, 2, 2])
y_hat = np.array([2, 0, 1, 1, 0])

# One-hot encode both label vectors; there is one column per distinct value in y.
Y = np.zeros(shape=(y.size, np.unique(y).size))
Y_hat = np.zeros_like(Y)
rows = np.arange(y.size)
Y[rows, y] = 1
Y_hat[rows, y_hat] = 1

# Per class: number of rows where truth and prediction both mark that class.
((Y_hat == Y) & (Y == 1)).sum(axis=0)
The last expression yields array([1, 1, 0]). I've tried using theano's nonzero:
from theano import shared
# Wrap both one-hot matrices as Theano shared variables.
Yt, Yt_hat = shared(Y), shared(Y_hat)
# Read the predictions at the positions where the truth matrix is non-zero.
Yt_hat[Yt.nonzero()].eval()
The eval results in array([ 0., 1., 1., 0., 0.]), which is a 0-1 mask of the rows of Yt_hat where the prediction is correct. Any suggestions for how to make this work? For different ways of doing it? Thanks.
Here are three variants demonstrating how to re-implement parts of your numpy code in Theano.
Note that Theano's Unique operation does not support running on the GPU and does not appear to support gradients either. As a result, version 3 may not be of much use. Version 2 provides a workaround: compute the unique values outside Theano and pass them in. Version 1 is a Theano implementation of the final line of your numpy code only.
To address your specific issue: there is no need to use nonzero; in this case the indexing works in Theano just like it works in numpy. Maybe you were getting confused between y and Y? (common Python style is to stick with lower case for all variable and parameter names).
import numpy as np
import theano
import theano.tensor as tt
import theano.tensor.extra_ops
def numpy_ver(y, y_hat):
    """Count true positives per class with plain numpy.

    Returns a tuple ``(counts, Y, Y_hat)`` where ``Y`` and ``Y_hat`` are the
    int64 one-hot encodings of ``y`` and ``y_hat``, with one column per
    distinct value in ``y``.
    """
    n_samples = len(y)
    n_classes = len(np.unique(y))
    Y = np.zeros(shape=(n_samples, n_classes), dtype=np.int64)
    Y_hat = np.zeros_like(Y, dtype=np.int64)
    sample_idx = np.arange(n_samples, dtype=np.int64)
    Y[sample_idx, y] = 1
    Y_hat[sample_idx, y_hat] = 1
    true_positive = (Y_hat == Y) & (Y == 1)
    return true_positive.sum(axis=0), Y, Y_hat
def compile_theano_ver1():
    """Compile a function of two int64 one-hot matrices returning true positives per class."""
    truth = tt.matrix(dtype='int64')
    predicted = tt.matrix(dtype='int64')
    hits = tt.eq(predicted, truth) & tt.eq(truth, 1)
    return theano.function([truth, predicted], outputs=hits.sum(axis=0))
def compile_theano_ver2():
    """Compile a function of label vectors plus precomputed unique labels.

    The unique-label vector is an input and only its length is used, as the
    number of one-hot columns.
    """
    labels = tt.vector(dtype='int64')
    predictions = tt.vector(dtype='int64')
    unique_labels = tt.vector(dtype='int64')

    truth = tt.zeros(shape=(labels.shape[0], unique_labels.shape[0]), dtype='int64')
    predicted = tt.zeros_like(truth, dtype='int64')
    sample_idx = tt.arange(labels.shape[0], dtype='int64')
    # set_subtensor returns a new variable with the selected entries set to 1.
    truth = tt.set_subtensor(truth[sample_idx, labels], 1)
    predicted = tt.set_subtensor(predicted[sample_idx, predictions], 1)

    hits = tt.eq(predicted, truth) & tt.eq(truth, 1)
    return theano.function([labels, predictions, unique_labels], outputs=hits.sum(axis=0))
def compile_theano_ver3():
    """Compile a function of label vectors that finds the unique labels inside the graph."""
    labels = tt.vector(dtype='int64')
    predictions = tt.vector(dtype='int64')
    unique_labels = tt.extra_ops.Unique()(labels)

    truth = tt.zeros(shape=(labels.shape[0], unique_labels.shape[0]), dtype='int64')
    predicted = tt.zeros_like(truth, dtype='int64')
    sample_idx = tt.arange(labels.shape[0], dtype='int64')
    # set_subtensor returns a new variable with the selected entries set to 1.
    truth = tt.set_subtensor(truth[sample_idx, labels], 1)
    predicted = tt.set_subtensor(predicted[sample_idx, predictions], 1)

    hits = tt.eq(predicted, truth) & tt.eq(truth, 1)
    return theano.function([labels, predictions], outputs=hits.sum(axis=0))
def main():
    """Run the numpy reference and the three Theano variants on one example.

    Each result is printed with the function form of ``print``; the Python 2
    print statements used before are syntax errors on Python 3.
    """
    y = np.array([1, 0, 1, 2, 2], dtype=np.int64)
    y_hat = np.array([2, 0, 1, 1, 0], dtype=np.int64)
    y_uniq = np.unique(y)

    result, Y, Y_hat = numpy_ver(y, y_hat)
    print(result)

    theano_ver1 = compile_theano_ver1()
    print(theano_ver1(Y, Y_hat))

    theano_ver2 = compile_theano_ver2()
    print(theano_ver2(y, y_hat, y_uniq))

    theano_ver3 = compile_theano_ver3()
    print(theano_ver3(y, y_hat))


main()