How to fix Tensor Size for Deep Learning EBM Error - tensorflow

I am attempting to implement DiffusionRecoveryLikelihood EBM, I am getting a tensor size error, but cannot spot my mistake
here is the code:
def l2normalize(v, eps=1e-12):
return v / (v.norm() + eps)
class SpectralNorm(nn.Module):
def __init__(self, module, name='weight', power_iterations=1):
super(SpectralNorm, self).__init__()
self.module = module
self.name = name
self.power_iterations = power_iterations
if not self._made_params():
self._make_params()
def _update_u_v(self):
u = getattr(self.module, self.name + "_u")
v = getattr(self.module, self.name + "_v")
w = getattr(self.module, self.name + "_bar")
height = w.data.shape[0]
for _ in range(self.power_iterations):
v.data = l2normalize(torch.mv(torch.t(w.view(height, -1).data), u.data))
u.data = l2normalize(torch.mv(w.view(height, -1).data, v.data))
# sigma = torch.dot(u.data, torch.mv(w.view(height,-1).data, v.data))
sigma = u.dot(w.view(height, -1).mv(v))
setattr(self.module, self.name, w / sigma.expand_as(w))
def _made_params(self):
try:
u = getattr(self.module, self.name + "_u")
v = getattr(self.module, self.name + "_v")
w = getattr(self.module, self.name + "_bar")
return True
except AttributeError:
return False
def _make_params(self):
w = getattr(self.module, self.name)
height = w.data.shape[0]
width = w.view(height, -1).data.shape[1]
u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
u.data = l2normalize(u.data)
v.data = l2normalize(v.data)
w_bar = Parameter(w.data)
del self.module._parameters[self.name]
self.module.register_parameter(self.name + "_u", u)
self.module.register_parameter(self.name + "_v", v)
self.module.register_parameter(self.name + "_bar", w_bar)
def forward(self, *args):
self._update_u_v()
return self.module.forward(*args)
def get_timestep_embedding(timesteps, embedding_dim: int):
assert len(timesteps.shape) == 1 # and timesteps.dtype == torch.int32
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(0, half_dim) * -emb).to(timesteps.device)
emb = torch.matmul(1.0 * timesteps.reshape(-1, 1), emb.reshape(1, -1))
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
if embedding_dim % 2 == 1: # zero pad
emb = F.pad(emb, [0, 1, 0, 0])
assert list(emb.shape) == [timesteps.shape[0], embedding_dim]
return emb
class Identity(nn.Module):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
def conv3x3(in_planes, out_planes, stride=1):
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True)
class wide_basic(nn.Module):
def __init__(self, in_planes, planes, dropout_rate, stride=1, norm=None, leak=.2):
super(wide_basic, self).__init__()
self.norm = norm
self.lrelu = nn.LeakyReLU(leak)
self.bn1 = Identity()
self.conv1 = SpectralNorm(nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, bias=True))
self.dropout = Identity() if dropout_rate == 0.0 else nn.Dropout(p=dropout_rate)
self.bn2 = Identity()
self.conv2 = SpectralNorm(nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=True))
self.temb_dense = SpectralNorm(nn.Linear(512, planes))
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != planes:
self.shortcut = nn.Sequential(
SpectralNorm(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=True)),
)
def forward(self, x):
x, temb = x
out = self.bn1(x)
out = self.conv1(self.lrelu(out))
if temb is not None:
# add in timestep embedding
temp_o = self.lrelu(self.temb_dense(temb))
b, l = temp_o.shape
out += temp_o.view(b, l, 1, 1)
out = self.dropout(out)
out = self.bn2(out)
out = self.conv2(self.lrelu(out))
out += self.shortcut(x)
return out, temb
class Wide_ResNet(nn.Module):
def __init__(self, depth, widen_factor, num_classes=10, input_channels=3,
sum_pool=False, norm=None, leak=.2, dropout_rate=0.0):
super(Wide_ResNet, self).__init__()
self.leak = leak
self.in_planes = 16
self.sum_pool = sum_pool
self.norm = norm
self.lrelu = nn.LeakyReLU(leak)
self.n_classes = num_classes
assert ((depth - 4) % 6 == 0), 'Wide-reSpectralNormet depth should be 6n+4'
n = (depth - 4) // 6
k = widen_factor
print('| Wide-ReSpectralNormet %dx%d, SpectralNorm time embedding' % (depth, k))
nStages = [16, 16 * k, 32 * k, 64 * k]
self.layer_one_out = None
self.conv1 = SpectralNorm(conv3x3(input_channels, nStages[0]))
self.layer1 = self._wide_layer(wide_basic, nStages[1], n, dropout_rate, stride=1, leak=leak)
self.layer2 = self._wide_layer(wide_basic, nStages[2], n, dropout_rate, stride=2, leak=leak)
self.layer3 = self._wide_layer(wide_basic, nStages[3], n, dropout_rate, stride=2, leak=leak)
self.bn1 = Identity()
self.last_dim = nStages[3]
# self.linear = SpectralNorm(nn.Linear(nStages[3], num_classes))
self.linear = SpectralNorm(nn.Conv2d(nStages[3], num_classes, kernel_size=(10,1), stride=1))
self.temb_dense_0 = SpectralNorm(nn.Linear(128, 512))
self.temb_dense_1 = SpectralNorm(nn.Linear(512, 512))
self.temb_dense_2 = SpectralNorm(nn.Linear(512, nStages[3]))
def _wide_layer(self, block, planes, num_blocks, dropout_rate, stride, leak=0.2):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, dropout_rate, stride, leak=leak, norm=self.norm))
self.in_planes = planes
return nn.Sequential(*layers)
def forward(self, x, t, logits=False, feature=True):
out = self.conv1(x)
assert x.dtype == torch.float32
if isinstance(t, int) or len(t.shape) == 0:
t = torch.ones(x.shape[0], dtype=torch.int64, device=x.device) * t
temb = get_timestep_embedding(t, 128)
temb = self.temb_dense_0(temb)
temb = self.temb_dense_1(self.lrelu(temb))
out, _ = self.layer1([out, temb])
out, _ = self.layer2([out, temb])
out, _ = self.layer3([out, temb])
out = self.lrelu(self.bn1(out))
if self.sum_pool:
out = out.view(out.size(0), out.size(1), -1).sum(2)
else:
if self.n_classes > 100:
out = F.adaptive_avg_pool2d(out, 1)
else:
out = F.avg_pool2d(out, 8)
temb = self.lrelu(temb)
temb = self.temb_dense_2(temb)
out = out.view(out.size(0),-1)
out *= temb
if logits:
out = self.linear(out)
return out
and the error is :
189 # temb = temb.reshape(-1, self.feature_maps)
190 out = out.view(out.size(0), -1)
--> 191 out *= temb
192 if logits:
193 out = self.linear(out)
RuntimeError: The size of tensor a (768) must match the size of tensor b (192) at non-singleton dimension 1
I have tried reshaping, different view procedure and get the same error

Related

tensorflow implementation is much slower than pytorch (with slow gpu utility) and occupy more memory

I am trying to converting a pytorch implementation to tensorflow, but I found that the program occupies more memory, has lower gpu utility and is much more slower than pytorch version. I am wondering it is a general case, or I am wrong with the tf model structure?
I put the code on, it can be directly run.
https://colab.research.google.com/drive/1oI6GVnt3sAULvbMMAGTY4B6LsHguJd9U#scrollTo=DNH5pPynm-jm
The pytorch version uses half memory with twice gpu util compared with tf version.
Code:
import math
import os
import tensorflow as tf
import json
import numpy as np
import time
def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):
assert diffusion_step_embed_dim_in % 2 == 0
half_dim = diffusion_step_embed_dim_in // 2
_embed = tf.math.log(tf.convert_to_tensor(10000.0)) / (half_dim - 1)
_embed = tf.math.exp(tf.cast(tf.experimental.numpy.arange(start = 0,stop = half_dim),dtype = tf.float32) * -_embed)
_embed = tf.cast(diffusion_steps,dtype=tf.float32) * _embed
diffusion_step_embed = tf.concat((tf.math.sin(_embed),
tf.math.cos(_embed)), 1)
assert diffusion_step_embed.shape[0] == diffusion_steps.shape[0]
assert diffusion_step_embed.shape[1] == diffusion_step_embed_dim_in
return diffusion_step_embed
class Residual_block(tf.keras.layers.Layer):
def __init__(self, res_channels, skip_channels,
diffusion_step_embed_dim_out, in_channels,
s4_lmax,
s4_d_state,
s4_dropout,
s4_bidirectional,
s4_layernorm):
super(Residual_block, self).__init__()
self.res_channels = res_channels
self.fc_t = tf.keras.layers.Dense(self.res_channels)
self.conv_layer = tf.keras.layers.Conv1D(filters=2 * self.res_channels, kernel_size=3, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.cond_conv = tf.keras.layers.Conv1D(filters=2*self.res_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.res_conv1 = tf.keras.layers.Conv1D(filters=res_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.res_conv2 = tf.keras.layers.Conv1D(filters=skip_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
def call(self, input_data):
x, cond, diffusion_step_embed = input_data
h = x
B, C, L = h.shape
part_t = self.fc_t(diffusion_step_embed)
part_t = tf.reshape(part_t,[B, self.res_channels, 1])
h = h + part_t
h = self.conv_layer(h)
cond = self.cond_conv(cond)
h += cond
out = tf.math.tanh(h[:,:self.res_channels,:]) * tf.math.sigmoid(h[:,self.res_channels:,:])
res = self.res_conv1(out)
skip = self.res_conv2(out)
return (x + res) * tf.math.sqrt(0.5), skip # normalize for training stability
class Residual_group(tf.keras.Model):
def __init__(self, res_channels, skip_channels, num_res_layers,
diffusion_step_embed_dim_in,
diffusion_step_embed_dim_mid,
diffusion_step_embed_dim_out,
in_channels,
s4_lmax,
s4_d_state,
s4_dropout,
s4_bidirectional,
s4_layernorm):
super(Residual_group, self).__init__()
self.num_res_layers = num_res_layers
self.diffusion_step_embed_dim_in = diffusion_step_embed_dim_in
self.fc_t1 = tf.keras.layers.Dense(diffusion_step_embed_dim_mid)
self.fc_t2 = tf.keras.layers.Dense(diffusion_step_embed_dim_out)
self.residual_blocks = []
for n in range(self.num_res_layers):
self.residual_blocks.append(Residual_block(res_channels, skip_channels,
diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
in_channels=in_channels,
s4_lmax=s4_lmax,
s4_d_state=s4_d_state,
s4_dropout=s4_dropout,
s4_bidirectional=s4_bidirectional,
s4_layernorm=s4_layernorm))
def call(self, input_data):
h, conditional, diffusion_steps = input_data
diffusion_step_embed = calc_diffusion_step_embedding(diffusion_steps, self.diffusion_step_embed_dim_in)
diffusion_step_embed = tf.keras.activations.swish((self.fc_t1(diffusion_step_embed)))
diffusion_step_embed = tf.keras.activations.swish((self.fc_t2(diffusion_step_embed)))
#out = self.residual_blocks((h, tf.zeros((8,256,248)),conditional, diffusion_step_embed))
#skip = out[1]
skip = tf.zeros((8,256,248))
for n in range(self.num_res_layers):
h, skip_n = self.residual_blocks[n]((h, conditional, diffusion_step_embed))
skip += skip_n
return skip * tf.math.sqrt(1.0 / self.num_res_layers)
class SSSDS4Imputer(tf.keras.Model):
def __init__(self, in_channels, res_channels, skip_channels, out_channels,
num_res_layers,
diffusion_step_embed_dim_in,
diffusion_step_embed_dim_mid,
diffusion_step_embed_dim_out,
s4_lmax,
s4_d_state,
s4_dropout,
s4_bidirectional,
s4_layernorm):
super(SSSDS4Imputer, self).__init__()
# convert the dimension of input from (B,in_channels,L) to (B,res_channels,L)
self.init_conv = tf.keras.layers.Conv1D(filters=res_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.res_channels = res_channels
self.skip_channels = skip_channels
self.residual_layer = Residual_group(res_channels=res_channels,
skip_channels=skip_channels,
num_res_layers=num_res_layers,
diffusion_step_embed_dim_in=diffusion_step_embed_dim_in,
diffusion_step_embed_dim_mid=diffusion_step_embed_dim_mid,
diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
in_channels=in_channels,
s4_lmax=s4_lmax,
s4_d_state=s4_d_state,
s4_dropout=s4_dropout,
s4_bidirectional=s4_bidirectional,
s4_layernorm=s4_layernorm)
# convert the dimension from (B,skip_channels,L) to (B,out_channels,L)
self.final_conv1 = tf.keras.layers.Conv1D(filters=skip_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.final_conv2 = tf.keras.layers.Conv1D(filters=out_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='zeros', data_format='channels_first')
def call(self, input_data):
x, conditional, mask, diffusion_steps = input_data
conditional = conditional * mask
conditional = tf.concat([conditional, tf.cast(mask,dtype=tf.float32)], axis=1)
x = tf.nn.relu(self.init_conv(x))
x = self.residual_layer((x, conditional, diffusion_steps))
y = tf.nn.relu(self.final_conv1(x))
y = tf.nn.relu(self.final_conv2(y))
return y
def train_step(X, y, net,loss_fn,optimizer):
with tf.GradientTape() as tape:
logits = net(X)
loss_value = loss_fn(y, logits)
grads = tape.gradient(loss_value, net.trainable_variables,unconnected_gradients=tf.UnconnectedGradients.ZERO)
optimizer.apply_gradients(zip(grads, net.trainable_variables))
return loss_value.numpy()
if __name__ == '__main__':
model_config = {'in_channels': 12, 'out_channels': 12, 'num_res_layers': 36, 'res_channels': 256, 'skip_channels': 256,
'diffusion_step_embed_dim_in': 128, 'diffusion_step_embed_dim_mid': 512, 'diffusion_step_embed_dim_out': 512,
's4_lmax': 250, 's4_d_state': 64, 's4_dropout': 0.0, 's4_bidirectional': 1, 's4_layernorm': 1}
net = SSSDS4Imputer(**model_config)
# define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
#training
n_iter = 0
#iterator = iter(dataset)
while n_iter < 150000 + 1:
#try:
X = (tf.random.normal([8,12,248], 0, 1, tf.float32, seed=1),tf.random.normal([8,12,248], 0, 1, tf.float32, seed=2),
tf.random.normal([8,12,248], 0, 1, tf.float32, seed=2),tf.random.normal([8,1], 0, 1, tf.float32, seed=4))
y = tf.random.normal([8,12,248], 0, 1, tf.float32, seed=1)
t0 = time.time()
loss = train_step(X,y,net,tf.keras.losses.MeanSquaredError(),optimizer)
print(time.time()-t0)
n_iter += 1
The tensorflow version I use is 2.4.1

Error in defining custom layers using subclasses in keras

I am getting an error in defining layers to my model. The input for self.layer[i] is not working when I'm implementing X = self.layers[i](A, A)inside the code. But the layer[i] which is made from GTLayer, works well for when I call it separetly, A = tf.random.uniform(shape=[2,2]) d = GTLayer(2,2,1) d.call(A,A)
..
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class GTN(keras.Model): # layers.Layer keeps track of everything under the hood!
def __init__(self, num_edge, num_channels, w_in, w_out, num_class,num_layers,norm):
super(GTN, self).__init__()
self.num_layers = 3
self.num_edge = 6
self.num_channels = 3
self.w_in = 2#tf.random.uniform(shape=[2,2])
self.w_out = 2#tf.random.uniform(shape=[2,2])
self.num_class =2
layers = []
for i in tf.range(num_layers):
print (4)
if i == 0:
layers.append(GTLayer(num_edge, num_channels, first=True))
else:
print(i)
layers.append(GTLayer(num_edge, num_channels, first=False))
#print ((self.w_out*self.num_channels, ))
self.linear1 = tf.keras.layers.Dense( self.w_out, input_shape =(self.w_out*self.num_channels, ), activation= None)
self.linear2 = tf.keras.layers.Dense( self.num_class, input_shape=(self.w_out, ), activation= None)
def call(self, A, X, target_x, target):
#A = tf.expand_dims(A, 0)
Ws = []
print('hello')
for i in range(self.num_layers):
print('i:',i)
if i == 0:
print('layers:',layers)
print('layers[i](A):', self.layers[i](A))
H, W = self.layers[i](A) #self.layers = nn.ModuleList(layers)
else:
#H = self.normalization(H)
print(H)
print(W)
print('A', A)
X = self.layers[i](A, A)
print('H_dash',self.layers[i](A))
H, W = self.layers[i](A, H)
Ws.append(W)
for i in range(self.num_channels):
if i==0:
X_ = tf.nn.relu(self.gcn_conv(X,H[i])).numpy()
else:
X_tmp = tf.nn.relu(self.gcn_conv(X,H[i])).numpy()
X_ = tf.concat((X_,X_tmp), dim=1)
X_ = self.linear1(X_)
X_ = tf.nn.relu(X_).numpy()
y = self.linear2(X_[target_x])
loss = self.loss(y, target)
return loss, y, Ws
class GTLayer(keras.layers.Layer):
def __init__(self, in_channels, out_channels, first=True):
super(GTLayer, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.first = first
def call(self, A, H_ = None):
if self.first == True:
a = tf.random.uniform(shape=[2,2])
b = tf.random.uniform(shape=[2,2])
print(a)
print(b)
#H = torch.bmm(np.array(a),np.array(b))
H = tf.matmul( a, b)
else:
a = tf.random.uniform(shape=[2])
H = tf.random.uniform(shape=[2])
return H,W
Input used for check:
d = GTN(2,3,4,2,2,2,1)
A = tf.random.uniform(shape=[2,2])
X = tf.random.uniform(shape=[2,2])
t_x = 5
t = 4
d.call(A,X,t_x,t)
Error:
---> 47 X = self.layers[i](A, A)
TypeError: call() takes 2 positional arguments but 3 were given

Why arbitrarily define a tf.constant (not using it anywhere) in a project which built by tf1 will cause network performance degradation?

Why does pinn's Burgers identification code (which can be understood as a TF1 neural network) define a tf.constant whether in the network class, in the main function or the outermost (not using it in any functions) will cause network performance degradation?
Before defining it, my network has better prediction accuracy and convergence time.
Here is the code:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from scipy.interpolate import griddata
from plotting import newfig, savefig
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.gridspec as gridspec
import time
np.random.seed(1234)
tf.set_random_seed(1234)
class PhysicsInformedNN:
# Initialize the class
def __init__(self, X, u, layers, lb, ub):
self.n=tf.constant([0.0])
self.lb = lb
self.ub = ub
self.x = X[:,0:1]
self.t = X[:,1:2]
self.u = u
self.layers = layers
# Initialize NNs
self.weights, self.biases = self.initialize_NN(layers)
# tf placeholders and graph
self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
log_device_placement=True))
# Initialize parameters
self.lambda_1 = tf.Variable([0.0], dtype=tf.float32)
self.lambda_2 = tf.Variable([-6.0], dtype=tf.float32)
self.x_tf = tf.placeholder(tf.float32, shape=[None, self.x.shape[1]])
self.t_tf = tf.placeholder(tf.float32, shape=[None, self.t.shape[1]])
self.u_tf = tf.placeholder(tf.float32, shape=[None, self.u.shape[1]])
self.u_pred = self.net_u(self.x_tf, self.t_tf)
self.f_pred = self.net_f(self.x_tf, self.t_tf)
self.loss = tf.reduce_mean(tf.square(self.u_tf - self.u_pred)) + \
tf.reduce_mean(tf.square(self.f_pred))
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.loss,
method = 'L-BFGS-B',
options = {'maxiter': 50000,
'maxfun': 50000,
'maxcor': 50,
'maxls': 50,
'ftol' : 1.0 * np.finfo(float).eps})
self.optimizer_Adam = tf.train.AdamOptimizer()
self.train_op_Adam = self.optimizer_Adam.minimize(self.loss)
init = tf.global_variables_initializer()
self.sess.run(init)
def initialize_NN(self, layers):
weights = []
biases = []
num_layers = len(layers)
for l in range(0,num_layers-1):
W = self.xavier_init(size=[layers[l], layers[l+1]])
b = tf.Variable(tf.zeros([1,layers[l+1]], dtype=tf.float32), dtype=tf.float32)
weights.append(W)
biases.append(b)
return weights, biases
def xavier_init(self, size):
in_dim = size[0]
out_dim = size[1]
xavier_stddev = np.sqrt(2/(in_dim + out_dim))
return tf.Variable(tf.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype=tf.float32)
def neural_net(self, X, weights, biases):
num_layers = len(weights) + 1
H = 2.0*(X - self.lb)/(self.ub - self.lb) - 1.0
for l in range(0,num_layers-2):
W = weights[l]
b = biases[l]
H = tf.tanh(tf.add(tf.matmul(H, W), b))
W = weights[-1]
b = biases[-1]
Y = tf.add(tf.matmul(H, W), b)
return Y
def net_u(self, x, t):
u = self.neural_net(tf.concat([x,t],1), self.weights, self.biases)
return u
def net_f(self, x, t):
lambda_1 = self.lambda_1
lambda_2 = tf.exp(self.lambda_2)
u = self.net_u(x,t)
u_t = tf.gradients(u, t)[0]
u_x = tf.gradients(u, x)[0]
u_xx = tf.gradients(u_x, x)[0]
f = u_t + lambda_1*u*u_x - lambda_2*u_xx
return f
def callback(self, loss, lambda_1, lambda_2):
print('Loss: %e, l1: %.5f, l2: %.5f' % (loss, lambda_1, np.exp(lambda_2)))
def train(self, nIter):
tf_dict = {self.x_tf: self.x, self.t_tf: self.t, self.u_tf: self.u}
start_time = time.time()
for it in range(nIter):
self.sess.run(self.train_op_Adam, tf_dict)
# Print
if it % 10 == 0:
elapsed = time.time() - start_time
loss_value = self.sess.run(self.loss, tf_dict)
lambda_1_value = self.sess.run(self.lambda_1)#run的参数为返回值fetches
lambda_2_value = np.exp(self.sess.run(self.lambda_2))
print('It: %d, Loss: %.3e, Lambda_1: %.3f, Lambda_2: %.6f, Time: %.2f' %
(it, loss_value, lambda_1_value, lambda_2_value, elapsed))
start_time = time.time()
self.optimizer.minimize(self.sess,
feed_dict = tf_dict,
fetches = [self.loss, self.lambda_1, self.lambda_2],
loss_callback = self.callback)
def predict(self, X_star):
tf_dict = {self.x_tf: X_star[:,0:1], self.t_tf: X_star[:,1:2]}
u_star = self.sess.run(self.u_pred, tf_dict)
f_star = self.sess.run(self.f_pred, tf_dict)
return u_star, f_star
if __name__ == "__main__":
nu = 0.01/np.pi
N_u = 2000
layers = [2, 20, 20, 20, 20, 20, 20, 20, 20, 1]
data = scipy.io.loadmat('../Data/burgers_shock.mat')
t = data['t'].flatten()[:,None]
x = data['x'].flatten()[:,None]
Exact = np.real(data['usol']).T
X, T = np.meshgrid(x,t)
X_star = np.hstack((X.flatten()[:,None], T.flatten()[:,None]))
u_star = Exact.flatten()[:,None]
# Doman bounds
lb = X_star.min(0)
ub = X_star.max(0)
######################################################################
######################## Noiseles Data ###############################
######################################################################
noise = 0.0
idx = np.random.choice(X_star.shape[0], N_u, replace=False)
X_u_train = X_star[idx,:]
u_train = u_star[idx,:]
model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub)
model.train(0)
u_pred, f_pred = model.predict(X_star)
error_u = np.linalg.norm(u_star-u_pred,2)/np.linalg.norm(u_star,2)
#U_pred = griddata(X_star, u_pred.flatten(), (X, T), method='cubic')
lambda_1_value = model.sess.run(model.lambda_1)
lambda_2_value = model.sess.run(model.lambda_2)
lambda_2_value = np.exp(lambda_2_value)
error_lambda_1 = np.abs(lambda_1_value - 1.0)*100
error_lambda_2 = np.abs(lambda_2_value - nu)/nu * 100
print('Error u: %e' % (error_u))
print('Error l1: %.5f%%' % (error_lambda_1))
print('Error l2: %.5f%%' % (error_lambda_2))

The model cannot be compiled because it has no loss to optimize

I write a vae model which posterior is GMM ,and use self.add_loss to define vae loss,but an error occur when i fit my model:
ValueError: The model cannot be compiled because it has no loss to optimize.
here is my code:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt
from tensorflow.keras import layers
import tensorflow_probability as tfp
import numpy as np
tfd = tfp.distributions
tf.test.is_gpu_available()
# data
(x_train, x_labels), (x_val, x_val_labels) = mnist.load_data()
x_train = x_train.reshape(60000, 784).astype("float32") / 255.
x_val = x_val.reshape(10000, 784).astype("float32") / 255.
x_train[x_train >= 0.5] = 1.
x_train[x_train < 0.5] = 0.
x_val[x_val >= 0.5] = 1.
x_val[x_val < 0.5] = 0.
# from softmax to one_hot
def props_to_onehot(props):
if isinstance(props, list):
props = np.array(props)
a = np.argmax(props, axis=1)
b = np.zeros((len(a), props.shape[1]))
b[np.arange(len(a)), a] = 1
return b
# reparameter
class Sampling(layers.Layer):
"""Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
def call(self, inputs):
z_mean, z_log_var = inputs
batch = tf.shape(z_mean)[0]
dim = tf.shape(z_mean)[1]
epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(0.5 * z_log_var) * epsilon
class Encoder(layers.Layer):
def __init__(self, latent_dim, base_depth, components, name='encoder', **kwargs):
"""
latent_size: the dimensionality of latent variable z(also the dim of u and Σ)
base_depth: base units of Dense
components: the numbers of gussian distribution.In this case ,we set components = 10
"""
super(Encoder, self).__init__(name=name, **kwargs)
self.latent_size = latent_dim
self.base_depth = base_depth
self.components = components
# shared structured of encoder
self.dense1 = Dense(8 * self.base_depth, activation='relu', name='1')
self.dropout1 = tf.keras.layers.Dropout(0.2)
self.dense2 = Dense(4 * self.base_depth, activation='relu', name='2')
self.dropout2 = tf.keras.layers.Dropout(0.2)
self.dense3 = Dense(4 * self.base_depth, activation='relu', name='3')
self.dense4 = Dense(2 * self.base_depth, activation='relu', name='4')
self.dense5 = Dense(2 * self.base_depth, activation='relu', name='5')
# the output parameters of encoder including {pi,u,Σ}
self.parameters = Dense(self.components + self.components * 2 * self.latent_size, name='6')
self.sampling = Sampling()
def call(self, inputs):
# shared structure output
x = self.dense1(inputs)
x = self.dropout1(x)
x = self.dense2(x)
x = self.dropout2(x)
x = self.dense3(x)
x = self.dense4(x)
x = self.dense5(x)
# meaningful parameters
parameters = self.parameters(x)
pi, _ = tf.split(parameters, [self.components, 10 * 2 * self.latent_size], axis=-1)
pi = tf.nn.softmax(pi)
pi = props_to_onehot(pi)
batch_size_int = tf.shape(pi)[0].numpy()
batch_list = []
for i in range(batch_size_int):
index = np.argmax(pi[0])
batch_list.append(parameters[0][self.components + index * 2 * self.latent_size + 1:self.components + (
index + 1) * 2 * self.latent_size + 1])
batch_list = np.array(batch_list) # (batch_size,2*latent_size)
# (batch_size,latent_size);(batch_size,latent_size)
z_mean, z_log_var = tf.split(batch_list, [self.latent_size, self.latent_size], axis=-1)
z = self.sampling((z_mean, z_log_var))
kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
self.add_loss(kl_loss)
return z_mean, z_log_var, z
class Decoder(layers.Layer):
def __init__(self, base_depth, name="decoder", **kwargs):
super(Decoder, self).__init__(name=name, **kwargs)
self.base_depth = base_depth
self.dense1 = Dense(self.base_depth)
self.dense2 = Dense(2 * self.base_depth, activation='relu')
self.dense3 = Dense(4 * self.base_depth, activation='relu')
self.dropout1 = tf.keras.layers.Dropout(0.2)
self.dense4 = Dense(4 * self.base_depth, activation='relu')
self.dense5 = Dense(8 * self.base_depth, activation='relu')
self.dropout2 = tf.keras.layers.Dropout(0.2)
# no activation
self.dense_out = Dense(784)
def call(self, inputs):
x = self.dense1(inputs)
x = self.dense2(x)
x = self.dense3(x)
x = self.dropout1(x)
x = self.dense4(x)
x = self.dense5(x)
x = self.dropout2(x)
x = self.dense_out(x)
# shape=(B,784)
return x
class GMM_VAE_Posterior(tf.keras.Model):
def __init__(self, latent_dim, base_depth, components, name='auto_encoder', **kwargs):
super(GMM_VAE_Posterior, self).__init__(name=name, **kwargs)
self.latent_dim = latent_dim
self.base_depth = base_depth
self.components = components
self.encoder = Encoder(self.latent_dim, self.base_depth, self.components)
self.decoder = Decoder(self.base_depth)
def call(self, inputs):
z_mean, z_log_var, z = self.encoder(inputs)
out = self.decoder(z) # (batch_size,784)
reconstructions_error = tf.nn.sigmoid_cross_entropy_with_logits(labels=inputs, logits=out)
reconstructions_error = tf.reduce_sum(reconstructions_error, axis=-1)
reconstructions_error = tf.reduce_mean(reconstructions_error)
self.add_loss(reconstructions_error)
# shape:(batch_size,784)
return out
vae_gmm = GMM_VAE_Posterior(16, 64, 10)
vae_gmm.compile(optimizer=tf.keras.optimizers.Adam())
vae_gmm.fit(x_train, x_train, epochs=5, batch_size=64) # error
In my view,i think the computation graph of my model is not complete,so model can not BP.But it is just my gusses.
On model compiling, you must fill in the loss parameter. So, when you added the loss in another way, simply set it to None:
vae_gmm.compile(optimizer=tf.keras.optimizers.Adam(), loss = None)

WGAN-GP Large Oscillating Loss

I am trying to train a WaveGAN as described here: https://github.com/chrisdonahue/wavegan
In the paper, the WaveGAN is trained using WGAN-GP, so I have tried to implement it myself by adapting code from: https://github.com/LynnHo/DCGAN-LSGAN-WGAN-GP-DRAGAN-Tensorflow-2. However, after even only 2000 steps (~1 epoch), the loss values I am getting for the critic and the generator are large (< 1000) and oscillate between negative and positive. My audio is the same piano recordings that they used, just resampled at 16000Hz and converted to mono from stereo.
My loss graphs are:
I was hoping someone could please validate whether my implementation is correct and if so, what experiments can I run to diagnose this problem?
Note: TIMESTEPS indicates the number of samples I wish to generate for each generator pass. Currently this is set to 1 to replicate WaveGAN, and I wish to experiment with this in the future. For now, I don't think it is relevant to the issue.
My train.py script is:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import numpy as np
import librosa
import random
import os
import sys
import time
import GANModels
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)
MODEL_DIMS = 64
NUM_SAMPLES = 16384
TIMESTEPS = 1
D_UPDATES_PER_G_UPDATE = 5
GRADIENT_PENALTY_WEIGHT = 10.0
NOISE_LEN = 100
EPOCHS = 2000
EPOCHS_PER_SAMPLE = 2
BATCH_SIZE = 8
Fs = 16000
class GAN:
def __init__(self, model_dims=MODEL_DIMS, num_samples=NUM_SAMPLES, timesteps=TIMESTEPS, gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT,
noise_len=NOISE_LEN, batch_size=BATCH_SIZE, sr=Fs):
self.model_dims = model_dims
self.num_samples = num_samples
self.timesteps = timesteps
self.noise_dims = (timesteps, noise_len)
self.batch_size = batch_size
self.G = GANModels.Generator(self.model_dims, self.timesteps, num_samples)
self.D = GANModels.Critic(self.model_dims, self.timesteps, num_samples)
self.G_optimizer = Adam(learning_rate=1e-4, beta_1=0.5, beta_2=0.9)
self.D_optimizer = Adam(learning_rate=1e-4, beta_1=0.5, beta_2=0.9)
print(self.G.summary())
print(self.D.summary())
self.gradient_penalty_weight = gradient_penalty_weight
self.sr = sr
def _d_loss_fn(self, r_logit, f_logit):
r_loss = - tf.reduce_mean(r_logit)
f_loss = tf.reduce_mean(f_logit)
return r_loss, f_loss
def _g_loss_fn(self, f_logit):
f_loss = - tf.reduce_mean(f_logit)
return f_loss
def _gradient_penalty(self, real, fake):
def _interpolate(a, b):
shape = [tf.shape(a)[0]] + [1] * (a.shape.ndims - 1)
alpha = tf.random.uniform(shape=shape, minval=0., maxval=1.)
inter = a + alpha * (b - a)
inter.set_shape(a.shape)
return inter
x = _interpolate(real, fake)
with tf.GradientTape() as t:
t.watch(x)
pred = self.D(x, training=True)
grad = t.gradient(pred, x)
norm = tf.norm(tf.reshape(grad, [tf.shape(grad)[0], -1]), axis=1)
gp = tf.reduce_mean((norm - 1.)**2)
return gp
#tf.function
def train_G(self):
with tf.GradientTape() as t:
z = tf.random.normal(shape=(self.batch_size,) + self.noise_dims)
x_fake = self.G(z, training=True)
x_fake_d_logit = self.D(x_fake, training=True)
G_loss = self._g_loss_fn(x_fake_d_logit)
G_grad = t.gradient(G_loss, self.G.trainable_variables)
self.G_optimizer.apply_gradients(zip(G_grad, self.G.trainable_variables))
return {'g_loss': G_loss}
#tf.function
def train_D(self, x_real):
with tf.GradientTape() as t:
z = tf.random.normal(shape=(x_real.shape[0],) + self.noise_dims) #Half fake and half real
x_fake = self.G(z, training=True)
x_real_d_logit = self.D(x_real, training=True)
x_fake_d_logit = self.D(x_fake, training=True)
x_real_d_loss, x_fake_d_loss = self._d_loss_fn(x_real_d_logit, x_fake_d_logit)
gp = self._gradient_penalty(x_real, x_fake)
D_loss = (x_real_d_loss + x_fake_d_loss) + gp * self.gradient_penalty_weight
D_grad = t.gradient(D_loss, self.D.trainable_variables)
self.D_optimizer.apply_gradients(zip(D_grad, self.D.trainable_variables))
return {'d_loss': x_real_d_loss + x_fake_d_loss, 'gp': gp}
def sample(self, epoch, num_samples=10):
z = tf.random.normal(shape=(num_samples,) + self.noise_dims)
result = self.G(z, training=False)
for i in range(num_samples):
audio = np.array(result[i, :, :])
audio.flatten()
librosa.output.write_wav(f"output/piano/{epoch}-{i}.wav", audio, sr=self.sr)
#############################################################################
gan = GAN()
X_train = []
for file in os.listdir(r"D:\ML_Datasets\mancini_piano\piano\train"):
with open(r"D:\ML_Datasets\mancini_piano\piano\train" + fr"\{file}", "rb") as f:
samples, _ = librosa.load(f, Fs)
if len(samples) < TIMESTEPS*NUM_SAMPLES:
audio = np.array([np.array([sample]) for sample in samples])
padding = np.zeros(shape=(TIMESTEPS*NUM_SAMPLES - len(samples), 1), dtype='float32')
X_train.append(np.append(audio, padding, axis=0))
else:
for i in range(len(samples) // (TIMESTEPS*NUM_SAMPLES)):
X_train.append(np.array([np.array([sample]) for sample in samples[:TIMESTEPS*NUM_SAMPLES]]))
samples = np.delete(samples, np.s_[:TIMESTEPS*NUM_SAMPLES])
print(f"X_train shape = {(len(X_train),) + X_train[0].shape}")
librosa.output.write_wav("output/piano/test.wav", X_train[0], sr=Fs)
train_summary_writer = tf.summary.create_file_writer("logs/train")
with train_summary_writer.as_default():
steps_per_epoch = len(X_train) // BATCH_SIZE
for e in range(EPOCHS):
for i in range(steps_per_epoch):
D_loss_sum = 0
for n in range(D_UPDATES_PER_G_UPDATE):
D_loss_dict = gan.train_D(np.array(random.sample(X_train, BATCH_SIZE)))
D_loss_sum += D_loss_dict['d_loss']
D_loss = D_loss_sum / D_UPDATES_PER_G_UPDATE
G_loss_dict = gan.train_G()
G_loss = G_loss_dict['g_loss']
tf.summary.scalar('d_loss', D_loss, step=(e*steps_per_epoch)+i)
tf.summary.scalar('g_loss', G_loss, step=(e*steps_per_epoch)+i)
print(f"step {(e*steps_per_epoch)+i}: d_loss = {D_loss} g_loss = {G_loss}")
if e % EPOCHS_PER_SAMPLE == 0:
gan.sample(e)
My GANModels.py script is:
def Generator(d, a, num_samples, c=16):
# Prelim layers
input_layer = Input(shape=(100,))
dense_layer0 = Dense(256*d, input_shape=(100,))(input_layer)#
reshape_layer0 = Reshape((c, c*d))(dense_layer0)#
relu_layer0 = Activation('relu')(reshape_layer0)#
# WaveCNN layers
c //= 2
expanded_layer0 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer0)#relu_layer1
conv1d_t_layer0 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer0)
slice_layer0 = Lambda(lambda x: x[:, 0])(conv1d_t_layer0)
relu_layer2 = Activation('relu')(slice_layer0)
c //= 2
expanded_layer1 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer2)
conv1d_t_layer1 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer1)
slice_layer1 = Lambda(lambda x: x[:, 0])(conv1d_t_layer1)
relu_layer3 = Activation('relu')(slice_layer1)
c //= 2
expanded_layer2 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer3)
conv1d_t_layer2 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer2)
slice_layer2 = Lambda(lambda x: x[:, 0])(conv1d_t_layer2)
relu_layer4 = Activation('relu')(slice_layer2)
c //= 2
expanded_layer3 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer4)
conv1d_t_layer3 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer3)
slice_layer3 = Lambda(lambda x: x[:, 0])(conv1d_t_layer3)
relu_layer5 = Activation('relu')(slice_layer3)
expanded_layer4 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer5)
conv1d_t_layer4 = Conv2DTranspose(1, (1, 25), strides=(1, 4), padding='same')(expanded_layer4)#strides=(1,1)
slice_layer4 = Lambda(lambda x: x[:, 0])(conv1d_t_layer4)
tanh_layer0 = Activation('tanh')(slice_layer4)
model = Model(inputs=input_layer, outputs=tanh_layer0)
return model
def _apply_phaseshuffle(x, rad=2, pad_type='reflect'):
b, x_len, nch = x.get_shape().as_list()
phase = tf.random.uniform([], minval=-rad, maxval=rad + 1, dtype=tf.int32)
pad_l = tf.maximum(phase, 0)
pad_r = tf.maximum(-phase, 0)
phase_start = pad_r
x = tf.pad(x, [[0, 0], [pad_l, pad_r], [0, 0]], mode=pad_type)
x = x[:, phase_start:phase_start+x_len]
x.set_shape([b, x_len, nch])
return x
def Critic(d, a, num_samples, c=1):
input_layer = Input(shape=(a*num_samples, 1))#d*d
conv1d_layer0 = Conv1D(c*d, 25, strides=4, padding='same')(input_layer)#//2
LReLU_layer0 = LeakyReLU(alpha=0.2)(conv1d_layer0)
phaseshuffle_layer0 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer0)
c *= 2
conv1d_layer1 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer0)#d
LReLU_layer1 = LeakyReLU(alpha=0.2)(conv1d_layer1)
phaseshuffle_layer1 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer1)
c *= 2
conv1d_layer2 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer1)#2*d
LReLU_layer2 = LeakyReLU(alpha=0.2)(conv1d_layer2)
phaseshuffle_layer2 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer2)
c *= 2
conv1d_layer3 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer2)#4*d
LReLU_layer3 = LeakyReLU(alpha=0.2)(conv1d_layer3)
phaseshuffle_layer3 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer3)
c *= 2
conv1d_layer4 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer3)#8*d,strides=4
LReLU_layer4 = LeakyReLU(alpha=0.2)(conv1d_layer4)
phaseshuffle_layer4 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer4)
slice_layer0 = Lambda(lambda x: x[:, 0])(phaseshuffle_layer4)
dense_layer1 = Dense(1, input_shape=(256*d,))(slice_layer0)
model = Model(inputs=input_layer, outputs=dense_layer1)
return model