Following the answer here: Tensorflow - matmul of input matrix with batch data
I compared the tf.scan-based results with the tf.matmul-based results. As far as I can see, the results should be identical, but I am consistently getting different ones. I also compared against Keras' K.dot, which has the same functionality, as a reference.
I'd appreciate any explanation of why that is, or what my mistake is.
Attached is the full MWE, with results evaluated on two separate computers (with different GPUs).
import tensorflow as tf
import keras.backend as K
from keras.layers import Lambda, Input
import numpy as np
na = 100
nb = 10000
mb = 10
v = Input( (1,na) )
e = tf.placeholder(dtype=tf.float32, shape=(na,nb))
def dot_a(v):
    res = K.dot(v, e)
    return res

initer = np.zeros((1, nb), dtype=np.float32)

def dot_b(v):
    res = tf.scan(lambda a, x: tf.matmul(x, e), v, initializer=tf.constant(initer))
    return res

def dot_c(v):
    v = tf.reshape(v, [-1, na])
    h = tf.matmul(v, e)
    res = tf.reshape(h, [-1, 1, nb])
    return res
mul1 = Lambda(dot_a)(v)
mul2 = Lambda(dot_b)(v)
mul3 = Lambda(dot_c)(v)
# inputs
v0 = np.random.random((mb,1,na)).astype(np.float32)
e0 = np.random.random((na,nb)).astype(np.float32)
v0 = np.round(v0,decimals=2)
e0 = np.round(e0,decimals=2)
sess = tf.Session()
out1,out2,out3 = sess.run([mul1,mul2,mul3], feed_dict={v:v0,e:e0})
print('norm(out, out-matmul)', np.linalg.norm(out1 - out2))
print('norm(out, out-scan)', np.linalg.norm(out1 - out3))
Output on computer 1:
norm(out, out-matmul) 0.000715436
norm(out, out-scan) 0.0
Output on computer 2:
norm(out, out-matmul) 0.000511952
norm(out, out-scan) 0.0
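For what it's worth, a quick extra check I would add (not part of the original post): compute the reference product directly in NumPy on the same fed values and compare each graph output against it, which helps localize which op deviates.

# NumPy reference on the fed values (v0 is (mb, 1, na), e0 is (na, nb))
ref = np.matmul(v0, e0)  # broadcasts to (mb, 1, nb)
print('norm(ref - K.dot):          %g' % np.linalg.norm(ref - out1))
print('norm(ref - tf.scan):        %g' % np.linalg.norm(ref - out2))
print('norm(ref - reshape+matmul): %g' % np.linalg.norm(ref - out3))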
# coding: utf-8
import numpy as np
import os
from keras import backend as K

def get_slope(output, slope):
    conf = []
    for sample in output:
        conf.append([slope if num < 0 else 1 for num in sample])
    conf = np.array(conf)
    return conf

def get_weight(model, data):
    weights = np.array(model.get_weights())
    get_bn1_layer_output = K.function([model.layers[0].input, K.learning_phase()], [model.layers[2].output])
    # output in test mode = 0
    bn1_output = get_bn1_layer_output([data, 0])[0]
    get_bn2_layer_output = K.function([model.layers[0].input, K.learning_phase()], [model.layers[6].output])
    bn2_output = get_bn2_layer_output([data, 0])[0]
    relu1_slope = get_slope(bn1_output, 0.3)
    relu2_slope = get_slope(bn2_output, 0.3)
    all_W = []
    for i in range(len(relu1_slope)):
        W1 = weights[0].T
        W2 = weights[6].T * relu1_slope[i] * weights[2] / np.sqrt(weights[5] + 0.001)
        W3 = weights[12].T * relu2_slope[i] * weights[8] / np.sqrt(weights[11] + 0.001)
        W_im = np.matmul(W3, W2)
        W = np.matmul(W_im, W1)
        all_W.extend(W)
    all_W = np.array(all_W)
    return all_W
When I run get_weight, I get this error: ValueError: Found unexpected instance while processing input tensors for keras functional model. Expecting KerasTensor which is from tf.keras.Input() or output from keras layer call(). Got: 0
How can I solve it?
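I am not certain this is the cause, but the error looks like it comes from passing K.learning_phase() and the literal 0 into K.function under TF 2. A sketch of one way around it, assuming model is a tf.keras model (as the error message suggests) and that only the intermediate activations are needed; the sub-model names here are mine:

import tensorflow as tf

# small sub-models that expose the intermediate layers' outputs
bn1_model = tf.keras.Model(inputs=model.input, outputs=model.layers[2].output)
bn2_model = tf.keras.Model(inputs=model.input, outputs=model.layers[6].output)

# training=False plays the role of learning_phase = 0 (test mode)
bn1_output = bn1_model(data, training=False).numpy()
bn2_output = bn2_model(data, training=False).numpy()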
I have minibatches that I get from an SQLite database, with features x of integer and float type and a binary label y in {0, 1}. I am looking for something like X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(y, x, test_size=0.1, random_state=1, stratify=True) from scikit-learn, where a keyword stratifies the data (i.e. gives the same number of class-0 and class-1 instances).
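For reference, the scikit-learn call itself would pass the label array (not True) to stratify; as far as I know, stratify keeps the class proportions in each split rather than making the classes equal in size:

from sklearn.model_selection import train_test_split

# x: feature array, y: binary label array (NumPy arrays assumed)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=1, stratify=y)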
In TensorFlow 2, stratification does not seem to be straightforwardly possible. My very complicated solution below works for me, but it takes a lot of time because of all the reshaping and transposing:
def stratify(x, y):
    # number of positive instances (the smaller class)
    pos = np.sum(y).item()  # how many positive bonds there are
    x = np.transpose(x)
    # number of features
    f = np.shape(x)[1]
    # filter only class 1
    y = tf.transpose(y)
    x_pos = tf.boolean_mask(x, y)
    y_pos = tf.boolean_mask(y, y)
    # filter only class 0
    x_neg = tf.boolean_mask(x, tf.bitwise.invert(y)-254)
    x_neg = tf.reshape(x_neg, [f, -1])
    y_neg = tf.boolean_mask(y, tf.bitwise.invert(y)-254)
    # just take randomly as many class-0 as there are class-1
    x_neg = tf.transpose(tf.random.shuffle(tf.transpose(x_neg)))
    x_neg = x_neg[:, 0:pos]
    y_neg = y_neg[0:pos]
    # concat the class-1 and class-0 together, then shuffle, and concat back together
    x = tf.concat([x_pos, tf.transpose(x_neg)], 0)
    y = tf.concat([y_pos, tf.transpose(y_neg)], 0)
    xy = tf.concat([tf.transpose(x), tf.cast(np.reshape(y, [1, -1]), tf.float64)], 0)
    xy = tf.transpose((tf.random.shuffle(tf.transpose(xy))))  # because there is no axis arg in shuffle
    x = xy[0:f, :]
    x = tf.transpose(x)
    y = xy[f, :]
    return x, y
I would be happy to get feedback on or improvements to my own function, or simpler ideas.
Data splitting is best done on the raw data, before you transform it into tensors. If there is a strong requirement to do it in TensorFlow only, then I suggest making use of the tf.data.Dataset class. I have added demo code below with comments explaining the steps.
import tensorflow as tf
import numpy as np
TEST_SIZE = 0.1
DATA_SIZE = 1000
# Create data
X_data = np.random.rand(DATA_SIZE, 28, 28, 1)
y_data = np.random.randint(0, 2, [DATA_SIZE])
samples1 = np.sum(y_data)
print('Percentage of 1 = ', samples1 / len(y_data))
# Create TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((X_data, y_data))
# Gather data with 0 and 1 labels separately
class0_dataset = dataset.filter(lambda x, y: y == 0)
class1_dataset = dataset.filter(lambda x, y: y == 1)
# Shuffle them
class0_dataset = class0_dataset.shuffle(DATA_SIZE)
class1_dataset = class1_dataset.shuffle(DATA_SIZE)
# Split them
class0_test_samples_len = int((DATA_SIZE - samples1) * TEST_SIZE)
class0_test = class0_dataset.take(class0_test_samples_len)
class0_train = class0_dataset.skip(class0_test_samples_len)
class1_test_samples_len = int(samples1 * TEST_SIZE)
class1_test = class1_dataset.take(class1_test_samples_len)
class1_train = class1_dataset.skip(class1_test_samples_len)
print('Train Class 0 = ', len(list(class0_train)), ' Class 1 = ', len(list(class1_train)))
print('Test Class 0 = ', len(list(class0_test)), ' Class 1 = ', len(list(class1_test)))
# Gather datasets
train_dataset = class0_train.concatenate(class1_train).shuffle(DATA_SIZE)
test_dataset = class0_test.concatenate(class1_test).shuffle(DATA_SIZE)
print('Train dataset size = ', len(list(train_dataset)))
print('Test dataset size = ', len(list(test_dataset)))
Sample output:
Percentage of 1 = 0.474
Train Class 0 = 474 Class 1 = 427
Test Class 0 = 52 Class 1 = 47
Train dataset size = 901
Test dataset size = 99
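As a small usage follow-up (not in the original answer): the resulting datasets can be batched before being passed to model.fit, assuming a compiled Keras model named model.

BATCH_SIZE = 32

train_batches = train_dataset.batch(BATCH_SIZE)
test_batches = test_dataset.batch(BATCH_SIZE)
# model.fit(train_batches, validation_data=test_batches, epochs=5)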
I am using TensorFlow 2.0 and trying to implement an actor-critic algorithm to play CartPole. As far as I can tell I have done everything right, but I am getting the following error: ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
Please help me out.
Here is my code:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
MAX_EPISODES = 2000
GAMMA = 0.9
LR_A = 0.001
LR_C = 0.01
env = gym.make("CartPole-v0")
N_ACTIONS = env.action_space.n
N_FEATURES = 4
def make_actor(n_features, n_actions):
    inputs = tf.keras.Input(shape=[n_features])
    hidden = tf.keras.layers.Dense(20, activation=tf.nn.relu)(inputs)
    dist = tf.keras.layers.Dense(n_actions, activation=tf.nn.softmax)(hidden)
    model = tf.keras.Model(inputs=inputs, outputs=dist)
    return model

def make_critic(n_features):
    inputs = tf.keras.Input(shape=[n_features])
    hidden = tf.keras.layers.Dense(20, activation=tf.nn.relu)(inputs)
    value = tf.keras.layers.Dense(1)(hidden)
    model = tf.keras.Model(inputs=inputs, outputs=value)
    return model

actor = make_actor(N_FEATURES, N_ACTIONS)
critic = make_critic(N_FEATURES)
actor.summary()
critic.summary()

actor_optimizer = tf.keras.optimizers.Adam(LR_A)
critic_optimizer = tf.keras.optimizers.Adam(LR_C)

def loss_actor(s, a, td_error):
    dist = actor(s.reshape(1, 4)).numpy()
    log_prob = np.log(dist[0, a])
    exp_v = np.mean(log_prob * td_error)
    return tf.multiply(exp_v, -1)

def loss_critic(s, s_, r, gamma):
    s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
    v = critic(s)
    v_ = critic(s_)
    td_error = r + gamma * v_ - v
    return tf.multiply(td_error, 1)

def train(max_episodes):
    for episode in range(max_episodes):
        s = env.reset().astype(np.float32)
        t = 0
        track_r = []
        while True:
            dist = actor(s.reshape(1, 4)).numpy()
            a = np.random.choice(range(N_ACTIONS), p=dist.ravel())
            s_, r, done, info = env.step(a)
            s_ = s_.astype(np.float32)
            if done: r = -20
            track_r.append(r)
            with tf.GradientTape() as cri_tape, tf.GradientTape() as act_tape:
                td_error = loss_critic(s, s_, r, GAMMA)
            gradient = cri_tape.gradient(td_error, critic.trainable_variables)
            critic_optimizer.apply_gradients(zip(gradient, critic.trainable_variables))
            with tf.GradientTape() as act_tape:
                neg_exp_v = loss_actor(s, a, td_error.numpy())
            gradient = act_tape.gradient(neg_exp_v, critic.trainable_variables)
            actor_optimizer.apply_gradients(zip(gradient, actor.trainable_variables))
            s = s_
            t += 1
            if done:
                print("Episode:{} Steps:{}".format(episode + 1, t))
                break

train(MAX_EPISODES)
The error is on line 69: actor_optimizer.apply_gradients(zip(gradient, actor.trainable_variables)).
When I tried to print out the gradients for the actor, the result was None.
I really can't figure out where the problem is.
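One thing I would check (an assumption on my part, not a confirmed cause): loss_actor converts the actor's output to NumPy with .numpy() and np.log, so the GradientTape has nothing to differentiate back to the actor's weights. A sketch of the same loss written only with TensorFlow ops (the name loss_actor_tf is mine):

def loss_actor_tf(s, a, td_error):
    # keep everything as tensors so the tape can trace it back to the actor
    dist = actor(s.reshape(1, 4))                 # tf.Tensor, no .numpy()
    log_prob = tf.math.log(dist[0, a])
    exp_v = tf.reduce_mean(log_prob * tf.stop_gradient(td_error))
    return -exp_v

It may also be worth double-checking that act_tape.gradient(...) is taken with respect to actor.trainable_variables rather than critic.trainable_variables before applying it to the actor.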
I have 2 scenarios:
scenario 1:
op: sparse_tensor_dense_matmul
A: 1000x1000 sparsity = 90%
B: 1000x1000 sparsity = 0%
scenario 2:
op: matmul
A: 1000x1000 sparsity = 0%
B: 1000x1000 sparsity = 0%
I understand that GPUs do not handle sparse matrix multiplication well, but I would certainly expect them to perform it at least as well as dense matrix multiplication. In my code, sparse_tensor_dense_matmul is about 10x slower!
import tensorflow as tf
import numpy as np
import time
import itertools
rate = 0.1
N = 1000
itrs = 1000
num = int(rate * N * N)
combs = np.array(list(itertools.product(range(N), range(N))))
choices = range(len(combs))
_idxs = np.random.choice(a=choices, size=num, replace=False).tolist()
_idxs = combs[_idxs]
_idxs = _idxs.tolist()
_idxs = sorted(_idxs)
_vals = np.float32(np.random.rand(num))
_y = np.random.uniform(low=-1., high=1., size=(N, N))
_z = np.random.uniform(low=-1., high=1., size=(N, N))
################################################
x = tf.SparseTensor(indices=_idxs, values=_vals, dense_shape=(N, N))
y = tf.Variable(_y, dtype=tf.float32)
z = tf.Variable(_z, dtype=tf.float32)
sparse_dot = tf.sparse_tensor_dense_matmul(x, y)
dot = tf.matmul(z, y)
################################################
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
tf.local_variables_initializer().run()
start = time.time()
for i in range(itrs):
    [_sparse_dot] = sess.run([sparse_dot], feed_dict={})
total = time.time() - start
print(total)

start = time.time()
for i in range(itrs):
    [_dot] = sess.run([dot], feed_dict={})
total = time.time() - start
print(total)
################################################
25.357680797576904
2.7684502601623535
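A control I would try (just a guess at a diagnostic, not a fix): pin both ops to the CPU with tf.device and time them the same way, to see whether the gap is specific to the GPU kernels. This reuses x, y, z, sess and itrs from the code above.

# same ops as above, pinned to the CPU
with tf.device('/cpu:0'):
    sparse_dot_cpu = tf.sparse_tensor_dense_matmul(x, y)
    dot_cpu = tf.matmul(z, y)

start = time.time()
for i in range(itrs):
    sess.run(sparse_dot_cpu)
print('sparse on CPU: %.3f s' % (time.time() - start))

start = time.time()
for i in range(itrs):
    sess.run(dot_cpu)
print('dense on CPU:  %.3f s' % (time.time() - start))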
The following snippet is from a fairly large piece of code but hopefully I can give all the information necessary:
y2 = tf.matmul(y1,ymask)
dist = tf.norm(ystar-y2,axis=0)
y1 and y2 are 128x30 and ymask is 30x30. ystar is 128x30. dist is 1x30. When ymask is the identity matrix, everything works fine. But when I set it to be all zeros, apart from a single 1 along the diagonal (so as to set all columns but one in y2 to be zero), I get nans for the gradient of dist with respect to y2, using tf.gradients(dist, [y2]). The specific value of dist is [0,0,7.9,0,...], with all the ystar-y2 values being around the range (-1,1) in the third column and zero elsewhere.
I'm pretty confused as to why a numerical issue would occur here, given there are no logs or divisions. Is this underflow? Am I missing something in the maths?
For context, I'm doing this to try to train individual dimensions of y, one at a time, using the whole network.
longer version to reproduce:
import tensorflow as tf
import numpy as np
import pandas as pd
batchSize = 128
eta = 0.8
tasks = 30
imageSize = 32**2
groups = 3
tasksPerGroup = 10
trainDatapoints = 10000
w = np.zeros([imageSize, groups * tasksPerGroup])
toyIndex = 0
for toyLoop in range(groups):
    m = np.ones([imageSize]) * np.random.randn(imageSize)
    for taskLoop in range(tasksPerGroup):
        w[:, toyIndex] = m * 0.1 * np.random.randn(1)
        toyIndex += 1
xRand = np.random.normal(0, 0.5, (trainDatapoints, imageSize))
taskLabels = np.matmul(xRand, w) + np.random.normal(0,0.5,(trainDatapoints, groups * tasksPerGroup))
DF = np.concatenate((xRand, taskLabels), axis=1)
trainDF = pd.DataFrame(DF[:trainDatapoints, ])
# define graph variables
x = tf.placeholder(tf.float32, [None, imageSize])
W = tf.Variable(tf.zeros([imageSize, tasks]))
b = tf.Variable(tf.zeros([tasks]))
ystar = tf.placeholder(tf.float32, [None, tasks])
ymask = tf.placeholder(tf.float32, [tasks, tasks])
dataLength = tf.cast(tf.shape(ystar)[0],dtype=tf.float32)
y1 = tf.matmul(x, W) + b
y2 = tf.matmul(y1,ymask)
dist = tf.norm(ystar-y2,axis=0)
mse = tf.reciprocal(dataLength) * tf.reduce_mean(tf.square(dist))
grads = tf.gradients(dist, [y2])
trainStep = tf.train.GradientDescentOptimizer(eta).minimize(mse)
# build graph
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
randTask = np.random.randint(0, 9)
ymaskIn = np.zeros([tasks, tasks])
ymaskIn[randTask, randTask] = 1
batch = trainDF.sample(batchSize)
batch_xs = batch.iloc[:, :imageSize]
batch_ys = np.zeros([batchSize, tasks])
batch_ys[:, randTask] = batch.iloc[:, imageSize + randTask]
gradOut = sess.run(grads, feed_dict={x: batch_xs, ystar: batch_ys, ymask: ymaskIn})
sess.run(trainStep, feed_dict={x: batch_xs, ystar: batch_ys, ymask:ymaskIn})
Here's a very simple reproduction:
import tensorflow as tf
with tf.Graph().as_default():
    y = tf.zeros(shape=[1], dtype=tf.float32)
    dist = tf.norm(y, axis=0)
    (grad,) = tf.gradients(dist, [y])
    with tf.Session():
        print(grad.eval())
Prints:
[ nan]
The issue is that tf.norm computes sum(x**2)**0.5. The gradient is x / sum(x**2) ** 0.5 (see e.g. https://math.stackexchange.com/a/84333), so when sum(x**2) is zero we're dividing by zero.
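Written out (my notation), the gradient of the Euclidean norm is

$$\frac{\partial}{\partial x_i}\left(\sum_j x_j^2\right)^{1/2} = \frac{x_i}{\sqrt{\sum_j x_j^2}},$$

which is the indeterminate 0/0 when x is all zeros, hence the NaN.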
There's not much to be done in terms of a special case: the gradient as x approaches all zeros depends on which direction it's approaching from. For example if x is a single-element vector, the limit as x approaches 0 could either be 1 or -1 depending on which side of zero it's approaching from.
So in terms of solutions, you could just add a small epsilon:
import tensorflow as tf
def safe_norm(x, epsilon=1e-12, axis=None):
    return tf.sqrt(tf.reduce_sum(x ** 2, axis=axis) + epsilon)

with tf.Graph().as_default():
    y = tf.constant([0.])
    dist = safe_norm(y, axis=0)
    (grad,) = tf.gradients(dist, [y])
    with tf.Session():
        print(grad.eval())
Prints:
[ 0.]
Note that this is not actually the Euclidean norm. It's a good approximation as long as the input is much larger than epsilon.
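As a quick sanity check of that approximation (my addition, reusing safe_norm from above): for a clearly nonzero vector the epsilon is negligible, so safe_norm and tf.norm agree to float precision.

with tf.Graph().as_default():
    v = tf.constant([3., 4.])
    with tf.Session():
        print(safe_norm(v).eval())  # ~5.0, epsilon is negligible here
        print(tf.norm(v).eval())    # 5.0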