Related
I'm using Google Colab to solve the homogeneous heat equation. I had earlier written a program with SciPy using sparse matrices, which worked up to N = 10 (N is a hyperparameter), but I need to run it for N = 4 ... 1000, and that won't work on my PC. I therefore converted the code to TensorFlow, but there I am unable to use sparse matrices the way I could in SciPy, and even the GPU/TPU computation is slow, in fact slower than on my PC. These are the problems I'm facing in the code and need a solution for:
1) tf.contrib is removed and thus I 've to use an older version of tensorflow for odeint function. Where is it in 2.0?
2)If the computation can be computed with sparse matrices it could be good since matrices are tridiagonal.I know about sparse_dense_mul() function but that returns dense tensor and it wouldn't do the job. The "func" function applies time independent boundary conditions and then requires matrix multiplication of (nxn) with (nX1) which gives (nX1) with multiple matrices.
Also, the program was running faster before I created the class.
Also it's giving this
WARNING: Logging before flag parsing goes to stderr.
W0829 09:12:24.415445 139855355791232 lazy_loader.py:50]
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
* https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
* https://github.com/tensorflow/addons
* https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.
W0829 09:12:24.645356 139855355791232 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/contrib/integrate/python/ops/odes.py:233: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
When I run the loop with the exponent in range(2, 10), tqdm does not display and the cell keeps running forever, but it works fine for range(2, 5), and then the tqdm bar does appear.
#find a way to use sparse matrices
class Heat:
    """Semi-discrete 1-D heat problem on [0, 1] with N sub-intervals.

    The constructor assembles two dense (N+1, N+1) tridiagonal matrices,
    ``A`` (interior rows ``H * [1/6, 2/3, 1/6]``) and ``D`` (interior rows
    ``N * [-1, 2, -1]``, first and last rows all zero), the load vector
    ``F`` and the reference solution ``exact = e * sin(pi * x)``.

    ``func`` is the ODE right-hand side ``A^-1 (-D y + exp(t) F)`` with the
    two boundary entries forced to zero; ``algo_1`` / ``algo_2`` integrate
    it from t=0 to t=1 and store the final state in ``self.approx``.

    NOTE(review): ``ts``, ``tm`` and ``tl`` are module aliases defined
    outside this block (presumably ``tf.sparse``, ``tf.math`` and
    ``tf.linalg``) -- confirm against the file's imports.
    """

    def __init__(self, N):
        """Assemble all time-independent quantities for a mesh of N cells."""
        self.N = N
        self.H = 1/N
        # Positions of the non-zeros of an (N+1)x(N+1) tridiagonal matrix.
        tridiagonal_indices = (
            [[0, 0], [0, 1]]
            + [[i, i+j] for i in range(1, N) for j in [-1, 0, 1]]
            + [[N, N-1], [N, N]]
        )
        self.A = ts.to_dense(ts.SparseTensor(
            indices=tridiagonal_indices,
            values=self.H*np.array([1/3, 1/6] + [1/6, 2/3, 1/6]*(N-1) + [1/6, 1/3], dtype=np.float32),
            dense_shape=(N+1, N+1)))
        # The first and last rows of D are explicitly zero.
        self.D = ts.to_dense(ts.SparseTensor(
            indices=tridiagonal_indices,
            values=N*np.array([1-(1), -1 -(-1)] + [-1, 2, -1]*(N-1) + [-1-(-1), 1-(1)], dtype=np.float32),
            dense_shape=(N+1, N+1)))
        self.domain = tf.linspace(0.0, 1.0, N+1)

        def f(k):
            # k-th entry of the load vector; the two boundary entries use
            # their own closed-form expressions.
            if k == 0:
                return (1 + math.pi**2)*(math.pi*self.H - math.sin(math.pi*self.H))/(math.pi**2*self.H)
            elif k == N:
                return -(1 + math.pi**2)*(-math.pi*self.H + math.sin(math.pi*self.H))/(math.pi**2*self.H)
            else:
                return -2*(1 + math.pi**2)*(math.cos(math.pi*self.H) - 1)*math.sin(math.pi*self.H*k)/(math.pi**2*self.H)

        # F is deliberately rank-1 with shape (N+1,), not (1, N+1), so that it
        # adds directly to the result of tl.matvec.
        self.F = tf.constant([f(k) for k in range(N+1)], shape=(N+1,), dtype=tf.float32)
        self.exact = tm.scalar_mul(scalar=np.exp(1), x=tf.sin(math.pi*self.domain))

        # Everything below is invariant across right-hand-side evaluations.
        # The integrator calls func many times per step, so the inverse and
        # the negated matrix are computed once here instead of on every call.
        self._A_inv = tf.linalg.inv(self.A)
        self._neg_D = tm.negative(self.D)
        # 0 at both boundary nodes, 1 elsewhere: multiplying by this mask
        # zeroes entries 0 and N without allocating a tf.Variable per call.
        boundary_mask = np.ones(N+1, dtype=np.float32)
        boundary_mask[[0, N]] = 0.0
        self._interior_mask = tf.constant(boundary_mask)

    def error(self):
        """Return the 2-norm of ``exact - approx``.

        Requires ``algo_1`` or ``algo_2`` to have been called first, because
        they are what set ``self.approx``.
        """
        return np.linalg.norm(self.exact.numpy() - self.approx, 2)

    def func (self, y, t):
        """ODE right-hand side: ``A^-1 (-D y + exp(t) F)`` with zero boundaries.

        Args:
            y: state vector of length N+1.
            t: current time; it is passed to ``math.exp``, so it must be
                convertible to a Python float.

        Returns:
            The time derivative, a vector of length N+1 whose first and last
            entries are zero.
        """
        y = y * self._interior_mask
        # The sparsity hint is only given for larger systems (N**2 > 100).
        rhs = tl.matvec(a=self._neg_D, b=y, a_is_sparse=self.N**2 > 100) \
            + tm.scalar_mul(scalar=math.exp(t), x=self.F)
        y_dash = tl.matvec(self._A_inv, rhs)
        return y_dash * self._interior_mask

    def algo_1(self):
        """Integrate from t=0 to t=1 with adaptive dopri5; store the state at t=1."""
        self.approx = tf.contrib.integrate.odeint(
            func=self.func,
            y0=tf.sin(tm.scalar_mul(scalar=math.pi, x=self.domain)),
            t=tf.constant([0.0, 1.0]),
            rtol=1e-06,
            atol=1e-12,
            method='dopri5',
            options={"max_num_steps":10**10},
            full_output=False,
            name=None
        ).numpy()[1]

    def algo_2(self):
        """Integrate from t=0 to t=1 with fixed-step RK4 (dt = H**2); store the state at t=1."""
        self.approx = tf.contrib.integrate.odeint_fixed(
            func=self.func,
            y0=tf.sin(tm.scalar_mul(scalar=math.pi, x=self.domain)),
            t=tf.constant([0.0, 1.0]),
            dt=tf.constant([self.H**2], dtype=tf.float32),
            method='rk4',
            name=None
        ).numpy()[1]
# Convergence study: solve the heat problem on successively finer meshes and
# record the error of each run.
Ns = [2**r for r in range(2, 10)]
l =[]
for i in tqdm_notebook(Ns):
    heateqn = Heat(i)
    heateqn.algo_1()
    # Compute the error once and reuse it.
    l.append([i, heateqn.error()])
    tf.keras.backend.clear_session()
# DataFrame.append returns a NEW frame, so calling it in the loop without
# re-assigning left df empty. Build the table once from the collected rows.
df = pd.DataFrame(l, columns=["NumBasis", "Errors"])
I'm working on a seq2sql project and I successfully build a model but when training I get an error. I'm not using any Keras embedding layer.
# Hyper-parameters and the two input branches of the seq2sql model.
M=13 #Question Length
d=40 #Dimension of the LSTM
C=12 #number of table Columns
# Batch size passed to model.fit further down.
batch_size=9
# Question branch: M tokens with 100 features each, encoded by a bidirectional
# LSTM that returns the full sequence.
inputs1=Input(shape=(M,100),name='question_token')
Hq=Bidirectional(LSTM(d,return_sequences=True),name='QuestionENC')(inputs1) #this is HQ shape is (num_samples,13,80)
# Column branch: C columns, each 3 tokens with 100 features.
inputs2=Input(shape=(C,3,100),name='col_token')
# Shared encoder that keeps only the last hidden state (return_sequences=False).
col_lstm_layer=Bidirectional(LSTM(d,return_sequences=False),name='ColENC')
def hidd(te):
    """Encode every table column of every sample with ``col_lstm_layer``.

    The previous version looped over a hard-coded ``batch_size`` and gathered
    one sample at a time. That baked the batch size into the graph, so any
    batch of a different size (e.g. a single validation sample) failed inside
    the gather with "indices = ... is not in [0, ...)". It also ran the
    encoder twice per sample because of a debug ``print``. Folding the column
    axis into the batch axis gives the same values for any batch size.

    Args:
        te: column-token tensor; assumed to be (batch, C, tokens, features),
            as declared by the ``col_token`` input -- TODO confirm.

    Returns:
        Tensor of shape (batch, C, d*2) holding the last bidirectional state
        of each column.
    """
    # Static (tokens, features) part of the shape, as plain Python ints.
    token_dims = te.shape.as_list()[2:]
    # (batch, C, tokens, features) -> (batch*C, tokens, features)
    flat_columns = tf.reshape(te, [-1] + token_dims)
    encoded = col_lstm_layer(flat_columns)
    # (batch*C, d*2) -> (batch, C, d*2)
    return tf.reshape(encoded, [-1, C, d*2])
# Encode the table columns by wrapping hidd in a Lambda layer.
cols_last_hidden=Lambda(hidd)(inputs2)
# Project the question encoding to d*2 features.
Hq=Dense(d*2,name='QuestionLastEncode')(Hq)
# NOTE(review): I and J are not referenced anywhere in the visible code --
# confirm they are unused before removing them.
I=tf.Variable(initial_value=1,dtype=tf.int32)
J=tf.Variable(initial_value=1,dtype=tf.int32)
# Module-level integer flag, initially 1.
K=1
def get_col_att(tensors):
    """Return column-versus-question attention scores for a whole batch.

    For each sample this is ``cols @ Hq^T``, which is exactly what the earlier
    per-sample ``tensordot`` + ``transpose`` produced. It is now computed as
    one batched matmul, so it no longer depends on a fixed ``batch_size``
    (the cause of the "indices = ... is not in [0, ...)" gather error) and no
    longer caches its result in globals behind the ``K`` flag, which returned
    a stale tensor on any second call.

    Args:
        tensors: ``[Hq, cols_last_hidden]``; assumed shapes (batch, M, d*2)
            and (batch, C, d*2) -- TODO confirm.

    Returns:
        Tensor of shape (batch, C, M).
    """
    question_enc, column_enc = tensors[0], tensors[1]
    return tf.matmul(column_enc, question_enc, transpose_b=True)
# Raw attention scores from get_col_att, followed by a Dense layer with 13
# softmax outputs.
total_alpha_sel_lambda=Lambda(get_col_att,name="Alpha")([Hq,cols_last_hidden])
total_alpha_sel=Dense(13,activation="softmax")(total_alpha_sel_lambda)
# print("Hq",Hq," total_alpha_sel_lambda shape",total_alpha_sel_lambda," total_alpha_sel shape",total_alpha_sel.shape)
def get_EQcol(tensors):
    """Return the attention-weighted question encoding per column.

    For each sample this is ``alpha @ Hq``, identical to the earlier
    per-sample ``tensordot``. It is now one batched matmul, so it works for
    any batch size and no longer relies on the ``K`` flag or a cached global.

    Args:
        tensors: ``[total_alpha_sel, Hq]``; assumed shapes (batch, C, M) and
            (batch, M, d*2) -- TODO confirm.

    Returns:
        Tensor of shape (batch, C, d*2).
    """
    attention, question_enc = tensors[0], tensors[1]
    return tf.matmul(attention, question_enc)
# Set the module-level flag back to 1 before building the next Lambda layer.
K=1
# Attention-weighted question representation for every column, with dropout.
EQcol=Lambda(get_EQcol,name='EQcol')([total_alpha_sel,Hq])#total_alpha_sel(12x13) Hq(13xd*2)
EQcol=Dropout(.2)(EQcol)
# Two linear projections (column encoding and attended question) are summed.
L1=Dense(d*2,name='L1')(cols_last_hidden)
L2=Dense(d*2,name='L2')(EQcol)
L1_plus_L2=Add()([L1,L2])
# Flatten and classify into 12 outputs with a softmax.
pre=Flatten()(L1_plus_L2)
Psel=Dense(12,activation="softmax")(pre)
model=Model(inputs=[inputs1,inputs2],outputs=Psel)
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()
# Stop when val_loss has not improved for 7 epochs.
earlyStopping=EarlyStopping(monitor='val_loss', patience=7, verbose=0, mode='auto')
# The input list must follow the order of Model(inputs=[inputs1, inputs2]).
history=model.fit([Equestion,Col_Embeddings],y_train,epochs=50,validation_split=.1,shuffle=False,callbacks=[earlyStopping],batch_size=batch_size)
The shapes of the Equestion, Col_Embeddings, and y_train are (10, 12, 3, 100) ,(10, 13, 100) and (10, 12).
I searched about this error but in all cases they have used an embedding layer incorrectly. Here I get this error even though I'm not using one.
indices = 2 is not in [0, 1)
[[{{node lambda_3/embedding_lookup_2}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:#col_token_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_col_token_2_0_1, lambda_3/Assign_2, lambda_3/embedding_lookup_2/axis)]]
The problem here was that the batch size is defined at the graph level. I used batch_size = 9 for the graph, and the training batches do contain 9 samples (the full data set has 10 samples and validation_split is 0.1), but validation is left with only one sample because 10 * 0.1 = 1.
A batch of size 1 cannot be passed through the graph because the graph expects a batch of size 9; that is why this error occurs.
As a solution I set batch_size = 1, and then it works fine; I also got good accuracy with batch_size = 1.
Hope this will help someone.
Cheers ..
For me this error was due to bad form of my input data. You have to double check your input data to the model and it depends on your model input.
I want to calculate the pairwise distances between a set of tensors (e.g. 4 tensors). Each one is a 2-D tensor (a matrix). I don't know how to do this in a vectorized way. I wrote the following pseudo-code to show what I need:
E.shape => [4,30,30]
sum = 0
for i in range(4):
for j in range(4):
res = calculate_distance(E[i],E[j]) # E[i] is one the 30*30 Tensor
sum = sum + reduce_sum(res)
Here is my last try:
# Insert a broadcast axis at position 0 and at position 1, so the subtraction
# forms the difference of every pair of slices of E in one operation.
# For E of shape [4, 30, 30] (as in the pseudo-code) s is [4, 4, 30, 30].
x_ = tf.expand_dims(E, 0)
y_ = tf.expand_dims(E, 1)
s = x_ - y_
# Norm over the last two axes of every difference, then the sum of all of them.
P = tf.reduce_sum(tf.norm(s, axis=[-2, -1]))
This code works But I don't know how do this in a Batch. For instance when E.shape is [BATCH_SIZE * 4 * 30 * 30] my code doesn't work and Out Of Memory will happen. How can I do this efficiently?
Edit: After a day, I find a solution. it's not perfect but works:
# x is one element of E (a stack of matrices); y and z each iterate over the
# matrices of that stack, so the innermost call sees one (y, z) pair.
# The previous version computed tf.norm(z - x): the lambda argument y was
# never used and z was broadcast against the whole stack x, so the entries
# were not pairwise distances.
res = tf.map_fn(lambda x: tf.map_fn(lambda y: tf.map_fn(lambda z: tf.norm(z - y), x), x), E)
res = tf.reduce_mean(tf.square(res))
Your solution with expand_dims should be okay if your batch size is not too large. However, given that your original pseudo code loops over range(4), you should probably expand axes 1 and 2, instead of 0 and 1.
You can check the shape of the tensors to ensure that you're specifying the correct axes. For example,
# Batched version: axis 0 is the batch, axis 1 indexes the 4 matrices, so the
# broadcast axes are inserted at positions 1 and 2 (not 0 and 1).
batch_size = 8
E_np = np.random.rand(batch_size, 4, 30, 30)
E = K.variable(E_np) # shape=(8, 4, 30, 30)
x_ = K.expand_dims(E, 1)
y_ = K.expand_dims(E, 2)
s = x_ - y_ # shape=(8, 4, 4, 30, 30)
# Norm over the two matrix axes, then sum over the 4x4 grid of pairs.
distances = tf.norm(s, axis=[-2, -1]) # shape=(8, 4, 4)
P = K.sum(distances, axis=[-2, -1]) # shape=(8,)
Now P will be the sum of pairwise distances between the 4 matrices for each of the 8 samples.
You can also verify that the values in P is the same as what would be computed in your pseudo code:
# Reference computation with explicit loops in NumPy: for every sample, add up
# the distance between each ordered pair of its 4 matrices.
answer = []
for batch_idx in range(batch_size):
    sample = E_np[batch_idx]
    s = 0
    for i in range(4):
        for j in range(4):
            diff = sample[i] - sample[j]
            # sqrt(trace(diff . diff^T)) is the Frobenius norm of diff.
            s += np.sqrt(np.trace(np.dot(diff, diff.T)))
    answer.append(s)
print(answer)
[149.45960605637578, 147.2815068236368, 144.97487402393705, 146.04866735065312, 144.25537059201062, 148.9300986019226, 146.61229889228133, 149.34259789169045]
# Evaluate the tensor P and print it as a plain list, for comparison with the
# `answer` list printed above.
print(K.eval(P).tolist())
[149.4595947265625, 147.281494140625, 144.97488403320312, 146.04867553710938, 144.25537109375, 148.9300994873047, 146.6123046875, 149.34259033203125]
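TensorFlow can compute the Frobenius norm of a matrix with the tf.norm function by reducing over the last two axes (ord='euclidean', which is the default, or ord='fro'). Note that this is not the same as ord=1: over two axes, ord=1 gives the matrix 1-norm (the maximum absolute column sum).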
The following solution isn't vectorized and assumes that the first dimension in E is known statically:
E = tf.random_normal(shape=[5, 3, 3], dtype=tf.float32)
# One [1, 3, 3] piece per matrix; needs the first dimension to be static.
F = tf.split(E, E.shape[0])
# ord='euclidean' over the two matrix axes is the Frobenius norm. The previous
# ord=1 computed the matrix 1-norm (maximum absolute column sum) instead.
total = tf.reduce_sum([tf.norm(tensor=(lhs-rhs), ord='euclidean', axis=(-2, -1)) for lhs in F for rhs in F])
Update:
An optimized vectorized version of the same code:
E = tf.random_normal(shape=[1024, 4, 30, 30], dtype=tf.float32)
# Broadcast the 4 matrices of every sample against each other:
# lhs is [1024, 1, 4, 30, 30] and rhs is [1024, 4, 1, 30, 30].
lhs = tf.expand_dims(E, axis=1)
rhs = tf.expand_dims(E, axis=2)
# ord='euclidean' over the two matrix axes is the Frobenius norm. The previous
# ord=1 computed the matrix 1-norm (maximum absolute column sum) instead.
total = tf.reduce_sum(tf.norm(tensor=(lhs - rhs), ord='euclidean', axis=(-2, -1)))
Memory concerns: upon evaluating this code,
tf.contrib.memory_stats.MaxBytesInUse() reports that the peak memory consumption is 73729792 = 74Mb, which indicates relatively moderate overhead (the raw lhs-rhs tensor is 59Mb). Your OOM is most likely caused by the duplication of BATCH_SIZE dimension when you compute s = x_ - y_, because your batch size is much larger than the number of matrices (1024 vs 4).
I have the following code that I am hoping to get a forward pass from a 2 layer LSTM:
"""
this is a simple numerical example of LSTM forward pass to allow deep understanding
the LSTM is trying to learn the sin function by learning to predict the next value after a sequence of 3 inputs
example 1: {0.583, 0.633, 0.681} --> {0.725}, these values correspond to
{sin(35.66), sin(39.27), sin(42.92)} --> {sin(46.47)}
example 2: {0.725, 0.767, 0.801} --> {0.849}, these values correspond to
{sin(46.47), sin(50.09), sin(53.23)} --> {sin(58.10)}
example tested: [[['0.725323664']
['0.7671179']
['0.805884672']]]
predicted_instance: [ 0.83467698]
training example pair: [['0.680666907']
['0.725323664']
['0.7671179']] 0.805884672
"""
import numpy as np
# linear (identity) activation; works on scalars and arrays alike
def linear(x):
    """Return the input unchanged."""
    result = x
    return result
# sigmoid function matrix-wise (works also element-wise)
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``.

    The direct formula overflows in ``exp(-x)`` for large negative ``x``
    (RuntimeWarning, or an error under ``np.errstate(over='raise')``). Only
    ``exp(-|x|)``, which is always <= 1, is evaluated here.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same value.
    result = np.where(x >= 0, 1.0, decay) / (1.0 + decay)
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as is.
    return result[()]
# hard sigmoid function (element-wise; scalars and arrays)
def hard_sig(x):
    """Return the hard sigmoid ``clip(0.2 * x + 0.5, 0, 1)``.

    This is the slope/offset used by Keras for both the tensorflow and the
    theano backend. ``np.clip`` replaces the nested ``np.max``/``np.min`` over
    temporary arrays, so the function now also accepts arrays, while scalar
    inputs give the same values as before.
    """
    # Courbariaux et al. 2016 (Binarized Neural Networks) use instead:
    # np.clip((x + 1.0)/2.0, 0.0, 1.0)
    return np.clip(x * 0.2 + 0.5, 0.0, 1.0)
# hard sigmoid function matrix wise
def hard_sigmoid(x, fun=hard_sig):
    """Apply the scalar function ``fun`` to every element of ``x``.

    Args:
        x: scalar or array-like.
        fun: element-wise activation; defaults to ``hard_sig``.

    Returns:
        NumPy array with the shape of ``x``.
    """
    return np.vectorize(fun)(x)
# hyperbolic tangent function matrix wise (works also element-wise)
def hyperbolic_tangent(x):
    """Return tanh(x) for a scalar or an array.

    The explicit quotient (e^x - e^-x) / (e^x + e^-x) evaluates to inf/inf,
    i.e. NaN, once |x| exceeds roughly 710. ``np.tanh`` saturates to +/-1.
    """
    return np.tanh(x)
# Quick sanity check of the activation helpers on extreme and central inputs.
print(sigmoid(np.array([-100, 0, 100])))
print(hard_sigmoid(np.array([-100, 0, 0.1, 100])))
print(hyperbolic_tangent(np.array([-100, 0, 100])))
# File names of the saved parameters, in layer order.
# NOTE(review): this list is not used by the loads below, which repeat the
# file names literally.
parameter_names = ['lstm_1_kernel_0.npy',
                   'lstm_1_recurrent_kernel_0.npy',
                   'lstm_1_bias_0.npy',
                   'lstm_2_kernel_0.npy',
                   'lstm_2_recurrent_kernel_0.npy',
                   'lstm_2_bias_0.npy',
                   'dense_1_kernel_0.npy',
                   'dense_1_bias_0.npy']
# Each array is loaded from the current working directory and its shape is
# printed as a check.
# LSTM 1 Weights
lstm_1_kernel_0 = np.load('lstm_1_kernel_0.npy')
print('lstm_1_kernel_0: ', lstm_1_kernel_0.shape)
lstm_1_recurrent_kernel_0 = np.load('lstm_1_recurrent_kernel_0.npy')
print('lstm_1_recurrent_kernel_0: ', lstm_1_recurrent_kernel_0.shape)
lstm_1_bias_0 = np.load('lstm_1_bias_0.npy')
print('lstm_1_bias_0: ', lstm_1_bias_0.shape)
# LSTM 2 Weights
lstm_2_kernel_0 = np.load('lstm_2_kernel_0.npy')
print('lstm_2_kernel_0: ', lstm_2_kernel_0.shape)
lstm_2_recurrent_kernel_0 = np.load('lstm_2_recurrent_kernel_0.npy')
print('lstm_2_recurrent_kernel_0: ', lstm_2_recurrent_kernel_0.shape)
lstm_2_bias_0 = np.load('lstm_2_bias_0.npy')
print('lstm_2_bias_0: ', lstm_2_bias_0.shape)
# Dense layer
dense_1_kernel_0 = np.load('dense_1_kernel_0.npy')
print('dense_1_kernel_0: ', dense_1_kernel_0.shape)
dense_1_bias_0 = np.load('dense_1_bias_0.npy')
print('dense_1_bias_0: ', dense_1_bias_0.shape)
# Indices of the three time steps of the input window.
time_seq = [0, 1, 2]
# Alternative input window, disabled by wrapping it in a string literal.
"""
input_seq = np.array([[[0.725323664],
[0.7671179],
[0.805884672]]])
"""
# Active input window, shape (1, 3, 1): one sample, three time steps, one
# feature.
input_seq = np.array([[[0.680666907],
                       [0.725323664],
                       [0.7671179]]])
print('input_seq: ', input_seq.shape)
for time in time_seq:
    print('input t', time, ':', input_seq[0, time, 0])
# Reference notes on the Keras LSTM step, kept as a string literal: z is split
# into four equal slices in the order i, f, c-candidate, o.
"""
# z0 = z[:, :self.units]
# z1 = z[:, self.units: 2 * self.units]
# z2 = z[:, 2 * self.units: 3 * self.units]
# z3 = z[:, 3 * self.units:]
# i = self.recurrent_activation(z0)
# f = self.recurrent_activation(z1)
# c = f * c_tm1 + i * self.activation(z2)
# o = self.recurrent_activation(z3)
# activation =' tanh'
# recurrent_activation = 'hard_sigmoid'
"""
# LSTM 1
# First layer, unrolled by hand over the three time steps. The state has 3
# units, so z has 12 columns: i = 0:3, f = 3:6, candidate = 6:9, o = 9:12.
# The gates use sigmoid; the candidate and the output use tanh.
x_1_lstm_1 = input_seq[0, 0, 0]
print('x_1: ', x_1_lstm_1)
x_2_lstm_1 = input_seq[0, 1, 0]
print('x_2: ', x_2_lstm_1)
x_3_lstm_1 = input_seq[0, 2, 0]
print('x_3: ', x_3_lstm_1)
# Zero initial cell state and hidden state.
c_0_lstm_1 = np.zeros((1, 3))
h_0_lstm_1 = np.zeros((1, 3))
# --- time step 1 ---
z_1_lstm_1 = np.dot(x_1_lstm_1, lstm_1_kernel_0) + np.dot(h_0_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_1_lstm_1.shape)
i_1_lstm_1 = sigmoid(z_1_lstm_1[:, 0:3])
f_1_lstm_1 = sigmoid(z_1_lstm_1[:, 3:6])
input_to_c_1_lstm_1 = z_1_lstm_1[:, 6:9]
o_1_lstm_1 = sigmoid(z_1_lstm_1[:, 9:12])
# c_t = f * c_{t-1} + i * tanh(candidate);  h_t = o * tanh(c_t)
c_1_lstm_1 = np.multiply(f_1_lstm_1, c_0_lstm_1) + np.multiply(i_1_lstm_1, hyperbolic_tangent(input_to_c_1_lstm_1))
h_1_lstm_1 = np.multiply(o_1_lstm_1, hyperbolic_tangent(c_1_lstm_1))
print('h_1_lstm_1: ', h_1_lstm_1.shape, h_1_lstm_1)
# --- time step 2: same equations, fed with h_1 / c_1 ---
z_2_lstm_1 = np.dot(x_2_lstm_1, lstm_1_kernel_0) + np.dot(h_1_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_2_lstm_1.shape)
i_2_lstm_1 = sigmoid(z_2_lstm_1[:, 0:3])
f_2_lstm_1 = sigmoid(z_2_lstm_1[:, 3:6])
input_to_c_2_lstm_1 = z_2_lstm_1[:, 6:9]
o_2_lstm_1 = sigmoid(z_2_lstm_1[:, 9:12])
c_2_lstm_1 = np.multiply(f_2_lstm_1, c_1_lstm_1) + np.multiply(i_2_lstm_1, hyperbolic_tangent(input_to_c_2_lstm_1))
h_2_lstm_1 = np.multiply(o_2_lstm_1, hyperbolic_tangent(c_2_lstm_1))
print('h_2_lstm_1: ', h_2_lstm_1.shape, h_2_lstm_1)
# --- time step 3: same equations, fed with h_2 / c_2 ---
z_3_lstm_1 = np.dot(x_3_lstm_1, lstm_1_kernel_0) + np.dot(h_2_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_3_lstm_1.shape)
i_3_lstm_1 = sigmoid(z_3_lstm_1[:, 0:3])
f_3_lstm_1 = sigmoid(z_3_lstm_1[:, 3:6])
input_to_c_3_lstm_1 = z_3_lstm_1[:, 6:9]
o_3_lstm_1 = sigmoid(z_3_lstm_1[:, 9:12])
c_3_lstm_1 = np.multiply(f_3_lstm_1, c_2_lstm_1) + np.multiply(i_3_lstm_1, hyperbolic_tangent(input_to_c_3_lstm_1))
h_3_lstm_1 = np.multiply(o_3_lstm_1, hyperbolic_tangent(c_3_lstm_1))
print('h_3_lstm_1: ', h_3_lstm_1.shape, h_3_lstm_1)
# LSTM 2
# Second layer, unrolled over the same three time steps. Its inputs are the
# hidden states of the first layer. The state has a single unit, so z has 4
# columns: i = 0, f = 1, candidate = 2, o = 3.
x_1_lstm_2 = h_1_lstm_1
x_2_lstm_2 = h_2_lstm_1
x_3_lstm_2 = h_3_lstm_1
# Zero initial cell state and hidden state.
c_0_lstm_2 = np.zeros((1, 1))
h_0_lstm_2 = np.zeros((1, 1))
# --- time step 1 ---
z_1_lstm_2 = np.dot(x_1_lstm_2, lstm_2_kernel_0) + np.dot(h_0_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_1_lstm_2.shape)
i_1_lstm_2 = sigmoid(z_1_lstm_2[:, 0])
f_1_lstm_2 = sigmoid(z_1_lstm_2[:, 1])
input_to_c_1_lstm_2 = z_1_lstm_2[:, 2]
o_1_lstm_2 = sigmoid(z_1_lstm_2[:, 3])
# c_t = f * c_{t-1} + i * tanh(candidate);  h_t = o * tanh(c_t)
c_1_lstm_2 = np.multiply(f_1_lstm_2, c_0_lstm_2) + np.multiply(i_1_lstm_2, hyperbolic_tangent(input_to_c_1_lstm_2))
h_1_lstm_2 = np.multiply(o_1_lstm_2, hyperbolic_tangent(c_1_lstm_2))
print('h_1_lstm_2: ', h_1_lstm_2.shape, h_1_lstm_2)
# --- time step 2 ---
z_2_lstm_2 = np.dot(x_2_lstm_2, lstm_2_kernel_0) + np.dot(h_1_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_2_lstm_2.shape)
i_2_lstm_2 = sigmoid(z_2_lstm_2[:, 0])
f_2_lstm_2 = sigmoid(z_2_lstm_2[:, 1])
input_to_c_2_lstm_2 = z_2_lstm_2[:, 2]
o_2_lstm_2 = sigmoid(z_2_lstm_2[:, 3])
c_2_lstm_2 = np.multiply(f_2_lstm_2, c_1_lstm_2) + np.multiply(i_2_lstm_2, hyperbolic_tangent(input_to_c_2_lstm_2))
h_2_lstm_2 = np.multiply(o_2_lstm_2, hyperbolic_tangent(c_2_lstm_2))
print('h_2_lstm_2: ', h_2_lstm_2.shape, h_2_lstm_2)
# --- time step 3 ---
z_3_lstm_2 = np.dot(x_3_lstm_2, lstm_2_kernel_0) + np.dot(h_2_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_3_lstm_2.shape)
i_3_lstm_2 = sigmoid(z_3_lstm_2[:, 0])
f_3_lstm_2 = sigmoid(z_3_lstm_2[:, 1])
input_to_c_3_lstm_2 = z_3_lstm_2[:, 2]
o_3_lstm_2 = sigmoid(z_3_lstm_2[:, 3])
c_3_lstm_2 = np.multiply(f_3_lstm_2, c_2_lstm_2) + np.multiply(i_3_lstm_2, hyperbolic_tangent(input_to_c_3_lstm_2))
h_3_lstm_2 = np.multiply(o_3_lstm_2, hyperbolic_tangent(c_3_lstm_2))
print('h_3_lstm_2: ', h_3_lstm_2.shape, h_3_lstm_2)
# Dense read-out (no non-linearity) applied to the last hidden state only.
output = np.dot(h_3_lstm_2, dense_1_kernel_0) + dense_1_bias_0
print('output: ', output)
The weights have been saved to file at train time and they can be retrieved from the following location:
LSTM weights
In order to create the LSTM which is fitting a sinwave signal I have used the following code in Keras:
def build_simple_model(layers):
    """Build and compile a two-layer LSTM regressor.

    Args:
        layers: four integers ``[n_features, seq_len, lstm_2_units, n_outputs]``.
            NOTE(review): ``layers[1]`` is used both as the sequence length
            of the input and as the number of units of the first LSTM --
            confirm this coupling is intended.

    Returns:
        The compiled Keras ``Sequential`` model (loss ``mse``, optimizer
        ``rmsprop``). A diagram is written to ``lstm_model.png`` as a side
        effect.
    """
    model = Sequential()
    # `units` replaces the legacy Keras 1 keyword `output_dim`.
    model.add(LSTM(units=layers[1],
                   input_shape=(layers[1], layers[0]),
                   return_sequences=True,
                   activation='tanh',
                   recurrent_activation='sigmoid')) # 'hard_sigmoid'
    # model.add(Dropout(0.2))
    model.add(LSTM(layers[2],
                   return_sequences=False,
                   activation='tanh',
                   recurrent_activation='sigmoid')) # 'hard_sigmoid'
    # model.add(Dropout(0.2))
    model.add(Dense(units=layers[3]))
    model.add(Activation("linear"))
    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("> Compilation Time : ", time.time() - start)
    plot_model(model, to_file='lstm_model.png', show_shapes=True, show_layer_names=True)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "None" line.
    model.summary()
    return model
This resulted in the following model:
I have used the training procedure as follows:
# One feature per step, windows of 3 steps, 1 unit in the second LSTM and one
# output value.
seq_len = 3
model = lstm.build_simple_model([1, seq_len, 1, 1])
# `epochs` replaces the legacy Keras 1 keyword `nb_epoch`.
model.fit(X_train,
          y_train,
          batch_size=512,
          epochs=epochs,
          validation_split=0.05)
Would it be possible to understand why my forward pass does not produce the desired output when predicting a future sin() signal value from the three previous consecutive ones?
The original example on which I am trying to base my forward pass exercise originates here. The weights uploaded in .npy format are from a network that is able to perfectly predict the next sin() value in a series.
I realised what the problem was. I was trying to extract my model weights using Tensorflow session (after model fitting), rather than via Keras methods directly. This resulted in weights matrices that made perfect sense (dimension wise) but contained the values from initialization step.
# `epochs` replaces the legacy Keras 1 keyword `nb_epoch`.
model.fit(X_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.05,
          callbacks=callbacks_list)
print('n_parameters: ', len(model.weights))
# Deliberate demonstration of the WRONG way: a brand-new session in which the
# variables are re-initialized, so it does not hold the trained values.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
parameter_names = ['lstm_1_kernel_0',
                   'lstm_1_recurrent_kernel_0',
                   'lstm_1_bias_0',
                   'lstm_2_kernel_0',
                   'lstm_2_recurrent_kernel_0',
                   'lstm_2_bias_0',
                   'dense_1_kernel_0',
                   'dense_1_bias_0']
weights = model.get_weights()
trainable_weights = model.trainable_weights
for parameter, tf_variable in enumerate(model.weights):
    print('')
    # using Keras methods is the correct way
    print('parameter: ', trainable_weights[parameter])
    print('parameter Keras: ', weights[parameter])
    # using session with TF is the wrong way
    print('parameter TF: ', tf_variable.eval(session=sess))
    #np.save(parameter_names[parameter], model.weights[parameter].eval(session=sess))
    #np.save(parameter_names[parameter], weights[parameter])
This prints the following to screen:
parameter: <tf.Variable 'lstm_1/kernel:0' shape=(1, 12) dtype=float32_ref>
parameter Keras: [[ 0.02005039 0.59627813 -0.77670902 -0.17643917 0.64905447 -0.49418128
0.01204901 0.79791737 -1.58887422 -0.3566488 0.67758918 0.77245694]]
parameter TF: [[-0.20346385 -0.07166874 -0.58842945 0.03744811 0.46911311 -0.0469712
-0.07291448 0.27316415 -0.53298378 0.08367682 0.10194337 0.20933461]]
parameter: <tf.Variable 'lstm_1/recurrent_kernel:0' shape=(3, 12) dtype=float32_ref>
parameter Keras: [[ 0.01916649 -0.30881727 -0.07018201 0.28770521 -0.45713434 -0.33738521
0.53091544 -0.78456688 0.50647908 0.12326431 -0.18517831 -0.28752103]
[ 0.44490865 -0.09020164 1.00983524 0.43070397 -0.14646551 -0.53908533
1.33833826 0.76106179 -1.28808987 0.71029669 -0.19338571 -0.30499896]
[ 0.76727188 -0.10291406 0.53285897 0.31021088 0.46876401 0.04961515
0.0573149 1.17765784 -0.45716232 0.26181531 0.60458028 -0.6042906 ]]
parameter TF: [[-0.044281 -0.42013288 -0.06702472 0.16710882 0.07229936 0.20263752
0.01935999 -0.65925431 0.21676332 0.02481769 0.50321299 -0.08369029]
[-0.17725646 -0.14031938 -0.07758044 -0.39292315 0.36675838 -0.20198873
0.59491426 -0.12469263 0.14705807 0.39603388 -0.25511321 -0.01221756]
[ 0.51603764 0.34401873 0.36002275 0.05344227 -0.00293417 -0.36086732
0.1636388 -0.24916036 0.09064917 -0.04246153 0.05563453 -0.5006755 ]]
parameter: <tf.Variable 'lstm_1/bias:0' shape=(12,) dtype=float32_ref>
parameter Keras: [ 3.91339064e-01 -2.09703773e-01 -4.88098420e-04 1.15376031e+00
6.24452651e-01 2.24053934e-01 4.06851530e-01 4.78419960e-01
1.77846551e-01 3.19107175e-01 5.16630232e-01 -2.22970009e-01]
parameter TF: [ 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
parameter: <tf.Variable 'lstm_2/kernel:0' shape=(3, 4) dtype=float32_ref>
parameter Keras: [[ 2.01334882 1.9168334 1.77633524 -0.90856379]
[ 1.17618477 1.02978265 -0.06435115 0.66180402]
[-1.33014703 -0.71629387 -0.87376142 1.35648465]]
parameter TF: [[ 0.83115911 0.72150767 0.51600969 -0.52725452]
[ 0.53043616 0.59162521 -0.59219611 0.0951736 ]
[-0.8030411 -0.00424314 -0.06715947 0.67533839]]
parameter: <tf.Variable 'lstm_2/recurrent_kernel:0' shape=(1, 4) dtype=float32_ref>
parameter Keras: [[-0.09348518 -0.7667768 0.24031806 -0.39155772]]
parameter TF: [[-0.085137 -0.59010917 0.61000961 -0.52193022]]
parameter: <tf.Variable 'lstm_2/bias:0' shape=(4,) dtype=float32_ref>
parameter Keras: [ 1.21466994 2.22224903 1.34946632 0.19186479]
parameter TF: [ 0. 1. 0. 0.]
parameter: <tf.Variable 'dense_1/kernel:0' shape=(1, 1) dtype=float32_ref>
parameter Keras: [[ 2.69569159]]
parameter TF: [[ 1.5422312]]
parameter: <tf.Variable 'dense_1/bias:0' shape=(1,) dtype=float32_ref>
parameter Keras: [ 0.20767514]
parameter TF: [ 0.]
The forward pass code was therefore correct; the weights were wrong. The correct weight .npy files have also been updated at the link mentioned in the question. This forward pass can be used to illustrate sequence generation with an LSTM by feeding the output back in as the next input.
I've been following this tutorial:
https://blog.altoros.com/using-linear-regression-in-tensorflow.html
I'm aware there's better ways to do linear regression, but I'm using this as a base to do multi-variate regression and multi-variate non-linear regression to try to understand TensorFlow.
Without normalizing my data at all, I get 'nan' with GradientDescentOptimizer. I'm curious about why this is. Why is normalization so important that the model won't run at all? And what about subtracting mean and dividing by standard deviation suddenly makes it work so well?
After normalizing data, I'd like to recover the original value.
Each set of data seems to be normalized separately with its own stddev and mean parameters: the training data X, training data Y, test data X, and test data Y.
However, when I run the network on new data, I'm assuming when I predict new values, I have to normalize the input again. In that case, how do I make sense of the predicted Y? Am I supposed to use the training data's standard deviation and mean to unnormalize, or the new data's standard deviation and mean? I am confused what the model is actually fitting to when I give it normalized training data, and how to interpret W and b. I originally wanted to fit to Y = mx + b, and want to know what m and b really are.
Because I trained on training data, I assumed that I would need to store the training_data's pre-normalization standard deviation and mean and unnormalize any results from the network using this value. But in fact, when I use the new data's standard deviation and mean to unnormalize I get more reasonable values. I don't think it's worth posting that code because I just have a fundamental misunderstanding of what I need to do, but this is the basic code I'm using anyway.
import tensorflow as tf
import numpy
import matplotlib.pyplot as plt
# Training set
# X: size data
size_data = numpy.asarray([
    2104, 1600, 2400, 1416, 3000, 1985, 1534, 1427,
    1380, 1494, 1940, 2000, 1890, 4478, 1268, 2300,
    1320, 1236, 2609, 3031, 1767, 1888, 1604, 1962,
    3890, 1100, 1458, 2526, 2200, 2637, 1839, 1000,
    2040, 3137, 1811, 1437, 1239, 2132, 4215, 2162,
    1664, 2238, 2567, 1200, 852, 1852, 1203,
])
# Y: price data, an exact linear function of the size (5x + 30)
price_data = 5 * size_data + 30
# Test set, generated with the same linear rule
size_data_test = numpy.asarray([1600, 1494, 1236, 1100, 3137, 2238])
price_data_test = 5 * size_data_test + 30
def normalize(array):
    """Standardize ``array`` to zero mean and unit standard deviation.

    Args:
        array: NumPy array or any sequence of numbers.

    Returns:
        A tuple ``(normalized, std, mean)``. ``std`` and ``mean`` are the
        statistics of the input, so the original values can be recovered as
        ``normalized * std + mean``.

    A constant (or single-element) input has ``std == 0``. Dividing by it
    would fill the result with NaN, so the centred values (all zeros) are
    returned unscaled in that case; ``std`` is still reported as 0.
    """
    array = numpy.asarray(array)
    std = array.std()
    mean = array.mean()
    scale = std if std > 0 else 1.0
    return (array - mean) / scale, std, mean
# Normalize a data set
# The model is trained on inputs and targets expressed in the coordinate
# system defined by the TRAINING mean/std. Any other data shown to the model
# (test or new data) must be mapped with those SAME statistics, and
# predictions must be mapped back with the training price statistics.
# Normalizing the test set with its own mean/std puts it in a different
# coordinate system and makes the un-normalized predictions inconsistent.
size_data_n, size_data_n_std, size_data_n_mean = normalize(size_data)
price_data_n, price_data_n_std, price_data_n_mean = normalize(price_data)
# The *_test_n_std / *_test_n_mean names are kept for compatibility; they now
# deliberately hold the training statistics.
size_data_test_n_std, size_data_test_n_mean = size_data_n_std, size_data_n_mean
price_data_test_n_std, price_data_test_n_mean = price_data_n_std, price_data_n_mean
size_data_test_n = (size_data_test - size_data_test_n_mean) / size_data_test_n_std
price_data_test_n = (price_data_test - price_data_test_n_mean) / price_data_test_n_std
# Display a plot
#plt.plot(size_data, price_data, 'ro', label='Samples data')
#plt.legend()
#plt.draw()
samples_number = price_data_n.size
# TF graph input
X = tf.placeholder("float")
Y = tf.placeholder("float")
# Create a model
# Set model weights
W = tf.Variable(numpy.random.randn(), name="weight")
b = tf.Variable(numpy.random.randn(), name="bias")
# Set parameters
learning_rate = 0.05
training_iteration = 200
# Construct a linear model
# (tf.mul was removed in TensorFlow 1.0; tf.multiply is its replacement.)
model = tf.add(tf.multiply(X, W), b)
# Minimize squared errors
cost_function = tf.reduce_sum(tf.pow(model - Y, 2))/(2 * samples_number) #L2 loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function) #Gradient descent
#optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(cost_function)
# Initialize variables
# (tf.initialize_all_variables is deprecated in favour of this call.)
init = tf.global_variables_initializer()
# Launch a graph
# Everything that calls sess.run, including the final plot, has to stay inside
# this block because the session is closed on exit.
with tf.Session() as sess:
    sess.run(init)
    display_step = 20
    # Fit all training data, one sample per gradient step
    for iteration in range(training_iteration):
        for (x, y) in zip(size_data_n, price_data_n):
            sess.run(optimizer, feed_dict={X: x, Y: y})
        # Display logs per iteration step
        if iteration % display_step == 0:
            print("Iteration:", '%04d' % (iteration + 1), "cost=", "{:.9f}".format(sess.run(cost_function, feed_dict={X:size_data_n, Y:price_data_n})),
                  "W=", sess.run(W), "b=", sess.run(b))
    tuning_cost = sess.run(cost_function, feed_dict={X: size_data_n, Y: price_data_n})
    print("Tuning completed:", "cost=", "{:.9f}".format(tuning_cost), "W=", sess.run(W), "b=", sess.run(b))
    # Validate a tuning model
    testing_cost = sess.run(cost_function, feed_dict={X: size_data_test_n, Y: price_data_test_n})
    print("Testing data cost:" , testing_cost)
    Y_predicted = sess.run(model, feed_dict={X: size_data_test_n, Y: price_data_test_n})
    print("%-20s%-20s%-20s%-20s" % ("Test X", "Actual", "Target", "Error(%)"))
    print('Normalized')
    for i in range(len(size_data_test_n)):
        err = 100.0 * abs(Y_predicted[i] - price_data_test_n[i]) / abs(price_data_test_n[i])
        print("%-20f%-20f%-20f%-20f" % (size_data_test_n[i], Y_predicted[i], price_data_test_n[i], err))
    print('Unnormalized')
    for i in range(len(size_data_test_n)):
        orig_size_data_test_i = size_data_test_n[i] * size_data_test_n_std + size_data_test_n_mean
        orig_price_data_test_i = price_data_test_n[i] * price_data_test_n_std + price_data_test_n_mean
        # Predictions live in the training-normalized space, so they are
        # mapped back with the training price statistics.
        orig_Y_predicted_i = Y_predicted[i] * price_data_n_std + price_data_n_mean
        orig_err = 100.0 * abs(orig_Y_predicted_i - orig_price_data_test_i) / abs(orig_price_data_test_i)
        print("%-20f%-20f%-20f%-20f" % (orig_size_data_test_i, orig_Y_predicted_i, orig_price_data_test_i, orig_err))
    # Display a plot
    w_value, b_value = sess.run([W, b])
    plt.figure()
    plt.plot(size_data, price_data, 'ro', label='Samples')
    plt.plot(size_data_test, price_data_test, 'go', label='Testing samples')
    plt.plot(size_data_test, (w_value * size_data_test_n + b_value)*price_data_n_std + price_data_n_mean , label='Fitted test line')
    plt.legend()
    plt.show()