Converting Tensorflow code to Pytorch - performance metrics very different - tensorflow

I have converted a tensorflow code for timeseries analysis to pytorch and performance difference is very high, in fact pytorch layers cannot account for seasonality at all. It feels like I must be missing something important.
Please help find where the pytorch code is lacking that the learning is not up to the par. I noticed that loss values has high jumps when it encounters the season change and is not learning that. With the same layers, nodes and every other thing, I imagined the performance to be close.
# tensorflow code
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
dataset = tf.data.Dataset.from_tensor_slices(series)
dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_size + 1))
dataset = dataset.shuffle(shuffle_buffer).map(lambda window: (window[:-1], window[-1]))
dataset = dataset.batch(batch_size).prefetch(1)
return dataset
dataset = windowed_dataset(x_train, window_size, batch_size, shuffle_buffer_size)
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(100, input_shape=[window_size], activation="relu"),
tf.keras.layers.Dense(10, activation="relu"),
tf.keras.layers.Dense(1)
])
model.compile(loss="mse", optimizer=tf.keras.optimizers.SGD(lr=1e-6, momentum=0.9))
model.fit(dataset,epochs=100,verbose=0)
forecast = []
for time in range(len(series) - window_size):
forecast.append(model.predict(series[time:time + window_size][np.newaxis]))
forecast = forecast[split_time-window_size:]
results = np.array(forecast)[:, 0, 0]
plt.figure(figsize=(10, 6))
plot_series(time_valid, x_valid)
plot_series(time_valid, results)
tf.keras.metrics.mean_absolute_error(x_valid, results).numpy()
# pytorch code
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000
class tsdataset(Dataset):
def __init__(self, series, window_size):
self.series = series
self.window_size = window_size
self.dataset, self.labels = self.preprocess()
def preprocess(self):
series = self.series
final, labels = [], []
for i in range(len(series)-self.window_size):
final.append(np.array(series[i:i+window_size]))
labels.append(np.array(series[i+window_size]))
return torch.from_numpy(np.array(final)), torch.from_numpy(np.array(labels))
def __getitem__(self,index):
# print(self.dataset[index], self.labels[index], index)
return self.dataset[index], self.labels[index]
def __len__(self):
return len(self.dataset)
train_dataset = tsdataset(x_train, window_size)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
class tspredictor(nn.Module):
def __init__(self, window_size, out1, out2, out3):
super(tspredictor, self).__init__()
self.l1 = nn.Linear(window_size, out1)
self.l2 = nn.Linear(out1, out2)
self.l3 = nn.Linear(out2, out3)
def forward(self,seq):
l1 = F.relu(self.l1(seq))
l2 = F.relu(self.l2(l1))
l3 = self.l3(l2)
return l3
model = tspredictor(20, 100,10,1)
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-6, momentum=0.9)
for epoch in range(100):
for t,l in train_dataloader:
model.zero_grad()
tag_scores = model(t)
loss = loss_function(tag_scores, l)
loss.backward()
optimizer.step()
# print("Epoch is {}, loss is {}".format(epoch, loss.data))
forecast = []
for time in range(len(series) - window_size):
prediction = model(torch.from_numpy(series[time:time + window_size][np.newaxis]))
forecast.append(prediction)
forecast = forecast[split_time-window_size:]
results = np.array(forecast)
plt.figure(figsize=(10, 6))
plot_series(time_valid, x_valid)
plot_series(time_valid, results)
To generate data, you can use:
def plot_series(time, series, format="-", start=0, end=None):
plt.plot(time[start:end], series[start:end], format)
plt.xlabel("Time")
plt.ylabel("Value")
plt.grid(False)
def trend(time, slope=0):
return slope * time
def seasonal_pattern(season_time):
"""Just an arbitrary pattern, you can change it if you wish"""
return np.where(season_time < 0.1,
np.cos(season_time * 6 * np.pi),
2 / np.exp(9 * season_time))
def seasonality(time, period, amplitude=1, phase=0):
"""Repeats the same pattern at each period"""
season_time = ((time + phase) % period) / period
return amplitude * seasonal_pattern(season_time)
def noise(time, noise_level=1, seed=None):
rnd = np.random.RandomState(seed)
return rnd.randn(len(time)) * noise_level
time = np.arange(10 * 365 + 1, dtype="float32")
baseline = 10
series = trend(time, 0.1)
baseline = 10
amplitude = 40
slope = 0.005
noise_level = 3
# Create the series
series = baseline + trend(time, slope) + seasonality(time, period=365, amplitude=amplitude)
# Update with noise
series += noise(time, noise_level, seed=51)
split_time = 3000
time_train = time[:split_time]
x_train = series[:split_time]
time_valid = time[split_time:]
x_valid = series[split_time:]

There was a broadcasting issue in the loss function. Changing to the loss to one below fixes it:
loss = loss_function(tag_scores, l.view(-1,1))

Related

BERT for multi label text classification always has a similar val accuracy despite fine tuning

I have a dataset of German news articles that I need to classify in my job. Since it is imbalanced, I am focussing on only 12 of 30 labels currently. Therefore I tried to balance the dataset by oversampling enhanced with data augmentation. Each sample can belong to multiple categories, thus it is a multi label problem.
The train dataset contains about 127.000 samples.
I am using a German BERT model with Tensorflow but despite fine tuning and even adding new layers, my val accuracy is always about 65%. Sometimes 67 to 68 but never higher. I wondered if my code is maybe broken or if it is due to the dataset.
Here is what I have right now:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
transformer_model = TFAutoModel.from_pretrained("dbmdz/bert-base-german-cased", output_hidden_states=False)
def multi_label_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
"""For multi-label classification, one has to define a custom
acccuracy function because neither tf.keras.metrics.Accuracy nor
tf.keras.metrics.CategoricalAccuracy evaluate the number of
exact matches.
:Example:
>>> from tensorflow.keras import metrics
>>> y_true = tf.convert_to_tensor([[1., 1.]])
>>> y_pred = tf.convert_to_tensor([[1., 0.]])
>>> metrics.Accuracy()(y_true, y_pred).numpy()
0.5
>>> metrics.CategoricalAccuracy()(y_true, y_pred).numpy()
1.0
>>> multi_label_accuracy(y_true, y_pred).numpy()
0.0
"""
y_pred = tf.math.sigmoid(y_pred)
y_pred = tf.math.round(y_pred)
exact_matches = tf.math.reduce_all(y_pred == y_true, axis=1)
exact_matches = tf.cast(exact_matches, tf.float32)
return tf.math.reduce_mean(exact_matches)
def f1_score(y_true, y_logit):
'''
Calculate F1 score
y_true: true value
y_logit: predicted value
'''
y_logit = tf.math.sigmoid(y_logit)
true_positives = K.sum(K.round(K.clip(y_true * y_logit, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
predicted_positives = K.sum(K.round(K.clip(y_logit, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return (2 * precision * recall) / (precision + recall + K.epsilon())
for l in transformer_model.layers:
l.trainable = True
bert = transformer_model.layers[0]
input_ids = tf.keras.layers.Input(shape=(60,), name='input_ids', dtype=np.int32)
attention_masks = tf.keras.layers.Input(shape=(60,), name='attention_masks', dtype=np.int32)
bert_model = bert(input_ids, attention_mask=attention_masks)[0][:, 0, :]
dropout = tf.keras.layers.Dropout(0.2, name="pooled_output")
pooled_output = dropout(bert_model)
dense = tf.keras.layers.Dense(units=256, activation="sigmoid")(pooled_output)
dropout2 = tf.keras.layers.Dropout(0.2)(dense)
dense2 = tf.keras.layers.Dense(units=64, activation="relu")(dropout2)
output = tf.keras.layers.Dense(units=12, name="output")(dense2)
model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
print("Compile model...", flush=True)
optimizer = Adam(learning_rate=1e-5, decay=1e-6)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=optimizer, metrics=[f1_score, multi_label_accuracy]
)
history = model.fit([dataset['train']['bert'], dataset['train']['bert2']], dataset['train']['outputs'], epochs=4, batch_size=64, validation_data=([dataset['val']['bert'], dataset['val']['bert2']], dataset['val']['outputs']))
I would expect the val accuracy to change a lot more by changing the architecture of the model.

How to make lstm/rnn focus more on certain parts of time series while less on other parts using tensorflow?

I have a time series prediction problem where most of the observed values (95%) are 0s while remaining values are non-zeros. How can I make use of RNN for this problem.
I want to predict surface flow from environmental data(air temperature, rainfall, humidity etc). We know surface flow is 0.0 for most of the time in an year. However, I also don't want to simply ignore 0s as the 0s represent the period of the year when when surface flow is 0.0. The image below shows possible observed output and three inputs. The three inputs here are just random but in reality they will be data like rainfall, humidity etc and these input data have some periodic pattern.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import tensorflow as tf
import sys
print(sys.version)
print('tensorflow version: ', tf.__version__)
#clean computation graph
tf.reset_default_graph()
tf.set_random_seed(777) # reproducibility
np.random.seed(0)
def MinMaxScaler(data):
numerator = data - np.min(data, 0)
denominator = np.max(data, 0) - np.min(data, 0)
# noise term prevents the zero division
return numerator / (denominator + 1e-7)
class generate_data(object):
def __init__(self, data_len, in_series, y_pred, seq_lengths, method='sum' ):
self.data_len = data_len
self.data = None
self.in_series = in_series #number of input series
self.y_pred = y_pred #number of final outputs from model
self.seq_lengths = seq_lengths
self.method = method
def _f(self, x):
y = 0
result = []
for _ in x:
result.append(y)
y += np.random.normal(scale=1)
return np.array(result)
def _runningMean(self, x, N):
return np.convolve(x, np.ones((N,))/N)[(N-1):]
def sine(self):
DATA = np.zeros((self.data_len, self.in_series))
xx = [None]
data_0 = np.sin(np.linspace(0, 20, self.data_len*self.in_series))
xx = data_0.reshape(self.data_len, self.in_series)
DATA[:,0: self.in_series] = xx
y = self._get_y(DATA)
return xx,y, DATA
def _get_y(self, xx):
if self.method=='sum':
yy = np.array([np.sum(xx[i,:]) for i in range(np.shape(xx)[0])])
elif self.method == 'mean':
yy = np.array([np.mean(xx[i,:]) for i in range(np.shape(xx)[0])])
elif self.method == 'self_mul':
yy = np.array([np.prod(xx[i,:]) for i in range(np.shape(xx)[0])])
elif self.method == 'mean_mirror':
yy = np.array([np.mean(xx[i,:]) for i in range(np.shape(xx)[0])])
return yy
def normalize(self, xx1,yy1):
yy = [None]*len(yy1)
YMinMax = {}
xx = MinMaxScaler(xx1)
for i in range(self.y_pred):
YMinMax['ymin_' + str(i)] = np.min(yy1[0])
YMinMax['ymax_' + str(i)] = np.max(yy1[0])
yy[i] = MinMaxScaler(yy1[0])
setattr(self, 'YMinMax', YMinMax)
return xx,yy
def create_dataset(self, xx, yy, percent_of_zeros):
'''creates a dataset consisting of windows for x and y data'''
dataX = self._build_input_windows(xx, self.seq_lengths)
if self.y_pred > 1:
pass
elif self.y_pred > 1 and self.seq_lengths != any(self.seq_lengths):
pass
else:
dataY = self._build_y_windows(yy[0] , self.seq_lengths)
indices = np.random.choice(np.arange(dataY.size), replace=False,
size=int(dataY.size * percent_of_zeros))
dataY[indices] = 0
return dataX, dataY
def _build_input_windows(self, time_series, seq_length):
dataX = []
for i in range(0, len(time_series) - seq_length):
_x = time_series[i:i + seq_length, :]
dataX.append(_x)
return np.array(dataX)
def _build_y_windows(self, iny, seq_length):
dataY = []
for i in range(0, len(iny) - seq_length):
_y = iny[i + seq_length, ] # Next close price
dataY.append(_y)
return np.array(dataY)
def TrainTestSplit(self, dataX, dataY, train_frac):
train_size = int(len(dataY) * train_frac)
trainX, testX = np.array(dataX[0:train_size]), np.array(dataX[train_size:len(dataX)])
trainY, testY = np.array(dataY[0:train_size]), np.array(dataY[train_size:len(dataY)])
trainY = trainY.reshape(len(trainY), 1)
testY = testY.reshape(len(testY), 1)
return trainX, trainY, testX, testY, train_size
#training/hyper parameters
tot_epochs = 500
batch_size = 16
learning_rate = 0.01
seq_lengths = 5 #sequence lengths/window size for RNN
rnn_inputs = 3 # no of inputs for RNN
y_pred = 1
data_length = 1005 #this can be overwritten or useless
gen_data = generate_data(data_length, rnn_inputs, y_pred, seq_lengths, 'sum')
xx,yy,data_1 = gen_data.sine()
# xx = abs(xx)
train_frac = 0.8
xx1,yy1 = gen_data.normalize(xx,[yy])
zeros = 0.96
dataX, dataY = gen_data.create_dataset(xx1,yy1, zeros)
trainX, trainY, testX, testY, train_size = gen_data.TrainTestSplit( dataX, dataY, train_frac)
keep_prob = tf.placeholder(tf.float32)
x_placeholders = tf.placeholder(tf.float32, [None, 5, 3])
Y = tf.placeholder(tf.float32, [None, 1])
plt.plot(dataY, '.', label='output')
plt.plot(xx[:,0], '.', label='input1')
plt.plot(xx[:,1], '.', label='input2')
plt.plot(xx[:,2], '.', label='input3')
plt.legend()
# build neural network
with tf.variable_scope('scope0'): #defining RNN
# cell = tf.contrib.rnn.BasicLSTMCell(num_units= 7, state_is_tuple=True, activation=tf.tanh)
cell = tf.keras.layers.LSTMCell(units = 128)
outputs1, _states = tf.nn.dynamic_rnn(cell, x_placeholders, dtype=tf.float32)
# Y_pred1 = tf.contrib.layers.fully_connected(outputs1[:, -1], 1, activation_fn=None)
Y_pred1 = tf.keras.layers.Dense(1)(outputs1[:,-1])
Y_pred = Y_pred1
## cost/loss
loss = tf.reduce_sum(tf.square(Y_pred - Y)) # sum of the squares
## optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)
#
## RMSE
targets = tf.placeholder(tf.float32, [None, 1])
predictions = tf.placeholder(tf.float32, [None, 1])
rmse = tf.sqrt(tf.reduce_mean(tf.square(targets - predictions)))
with tf.Session() as sess:
saver = tf.train.Saver(max_to_keep=41)
writer = tf.summary.FileWriter('./laos_2out/cnntest', sess.graph)
init = tf.global_variables_initializer()
sess.run(init)
# Training step
for epoch in range(tot_epochs):
total_batches = int(train_size / batch_size) ##total batches/ no. of steps in an epoch
#for batch in range(total_batches):
_, step_loss = sess.run([train, loss], feed_dict= {x_placeholders:trainX, Y:trainY, keep_prob:0.5} )
print('epoch: # {} loss: {}'.format(epoch, step_loss))
# # evaluating on test data
test_predict = sess.run(Y_pred, feed_dict= {x_placeholders:testX, Y:trainY, keep_prob:0.5} )
#evaluating on training data
train_predict = sess.run(Y_pred, feed_dict={x_placeholders:trainX, Y:trainY, keep_prob:0.5})
rmse_val = sess.run(rmse, feed_dict={targets: testY, predictions: test_predict})
print("RMSE: {}".format(rmse_val))
# Plot predictions
fig, (ax1,ax2) = plt.subplots(1,2, sharey=True)
fig.set_figwidth(14)
fig.set_figheight(5)
ax2.plot(testY, 'b', label='observed')
ax2.plot(test_predict, 'k', label='predicted')
ax2.legend(loc="best")
ax2.set_xlabel("Time Period")
ax2.set_title('Testing')
ax1.plot(trainY, 'b', label='observed')
ax1.plot(train_predict, 'k',label= 'predicted')
ax1.legend(loc="best")
ax1.set_xlabel("Time Period")
ax1.set_ylabel("discharge (cms)")
ax1.set_title('Training')
plt.show()
The problem is that while training, the model focuses on majority of values i.e. 0s and thus makes the predictions equal to 0s. How can I make the model focus on non-zero values (positive surface flow) while at the same time also consider 0s (when there is no surface flow). I have read about attention mechanism but have not understood that how I can implement it in such scenarios.

MNIST - Vanilla Neural Network - Why Cost Function is Increasing?

I've been combing through this code for a week now trying to figure out why my cost function is increasing as in the following image. Reducing the learning rate does help but very little. Can anyone spot why the cost function isn't working as expected?
I realise a CNN would be preferable, but I still want to understand why this simple network is failing.
Please help:)
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
mnist = input_data.read_data_sets("MNIST_DATA/",one_hot=True)
def createPlaceholders():
xph = tf.placeholder(tf.float32, (784, None))
yph = tf.placeholder(tf.float32, (10, None))
return xph, yph
def init_param(layers_dim):
weights = {}
L = len(layers_dim)
for l in range(1,L):
weights['W' + str(l)] = tf.get_variable('W' + str(l), shape=(layers_dim[l],layers_dim[l-1]), initializer= tf.contrib.layers.xavier_initializer())
weights['b' + str(l)] = tf.get_variable('b' + str(l), shape=(layers_dim[l],1), initializer= tf.zeros_initializer())
return weights
def forward_prop(X,L,weights):
parameters = {}
parameters['A0'] = tf.cast(X,tf.float32)
for l in range(1,L-1):
parameters['Z' + str(l)] = tf.add(tf.matmul(weights['W' + str(l)], parameters['A' + str(l-1)]), weights['b' + str(l)])
parameters['A' + str(l)] = tf.nn.relu(parameters['Z' + str(l)])
parameters['Z' + str(L-1)] = tf.add(tf.matmul(weights['W' + str(L-1)], parameters['A' + str(L-2)]), weights['b' + str(L-1)])
return parameters['Z' + str(L-1)]
def compute_cost(ZL,Y):
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf.cast(Y,tf.float32), logits = ZL))
return cost
def randomMiniBatches(X,Y,minibatch_size):
m = X.shape[1]
shuffle = np.random.permutation(m)
temp_X = X[:,shuffle]
temp_Y = Y[:,shuffle]
num_complete_minibatches = int(np.floor(m/minibatch_size))
mini_batches = []
for batch in range(num_complete_minibatches):
mini_batches.append((temp_X[:,batch*minibatch_size: (batch+1)*minibatch_size], temp_Y[:,batch*minibatch_size: (batch+1)*minibatch_size]))
mini_batches.append((temp_X[:,num_complete_minibatches*minibatch_size:], temp_Y[:,num_complete_minibatches*minibatch_size:]))
return mini_batches
def model(X, Y, layers_dim, learning_rate = 0.001, num_epochs = 20, minibatch_size = 64):
tf.reset_default_graph()
costs = []
xph, yph = createPlaceholders()
weights = init_param(layers_dim)
ZL = forward_prop(xph, len(layers_dim), weights)
cost = compute_cost(ZL,yph)
optimiser = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(num_epochs):
minibatches = randomMiniBatches(X,Y,minibatch_size)
epoch_cost = 0
for b, mini in enumerate(minibatches,1):
mini_x, mini_y = mini
_,c = sess.run([optimiser,cost],feed_dict={xph:mini_x,yph:mini_y})
epoch_cost += c
print('epoch: ',epoch+1,'/ ',num_epochs)
epoch_cost /= len(minibatches)
costs.append(epoch_cost)
plt.plot(costs)
print(costs)
X_train = mnist.train.images.T
n_x = X_train.shape[0]
Y_train = mnist.train.labels.T
n_y = Y_train.shape[0]
layers_dim = [n_x,10,n_y]
model(X_train, Y_train, layers_dim)
Without going to much into how you draw the mini batches: I think the problem is you are for some reason defining axis 1 of xph and yph as batch dimension (and feeding accordingly) while the computational graph of the network expects axis 0 to be the batch dimension like it is usually done.
So your forward propagation is actually performed along the batch dimension, which does not make sense.

Cost-sensitive loss function in Tensorflow

I'm doing research for cost-sensitive neural network based on Tensorflow. But because of the static graph structure of Tensorflow. Some NN structure couldn't be realized by myself.
My loss function(cost) ,cost matrix and the computational progress is described as follow and my target is to compute the total cost and then optimize the NN :
Approximately computational progress:
the y_ is the last full-connect output of a CNN which has shape (1024,5)
the y is a Tensor which has shape(1024) and indicates the ground truth of x[i]
the y_soft[i] [j] indicates the probability of x[i] to be class j
How can I realize this in Tensorflow?
cost_matrix:
[[0,1,100],
[1,0,1],
[1,20,0]]
label:
[1,2]
y*:
[[0,1,0],
[0,0,1]]
y(prediction):
[[0.2,0.3,0.5],
[0.1,0.2,0.7]]
label,cost_matrix-->cost_embedding:
[[1,0,1],
[1,20,0]]
It obvious 0.3 in [0.2,0.3,0.5] refers to right lable probility of [0,1,0], so it should not contibute to loss.
0.7 in [0.1,0.2,0.7] is the same. In other words, the pos with value 1 in y* not contibute to loss.
So I have (1-y*):
[[1,0,1],
[1,1,0]]
Then the entropy is target*log(predict) + (1-target) * log(1-predict),and value 0 in y*,should use (1-target)*log(1-predict), so I use (1-predict) said (1-y)
1-y:
[[0.8,*0.7*,0.5],
[0.9,0.8,*0.3*]]
(italic num is useless)
the custom loss is
[[1,0,1], [1,20,0]] * log([[0.8,0.7,0.5],[0.9,0.8,0.3]]) *
[[1,0,1],[1,1,0]]
and you can see the (1-y*) can be drop here
so the loss is -tf.reduce_mean(cost_embedding*log(1-y))
,to make it applicable , should be:
-tf.reduce_mean(cost_embedding*log(tf.clip((1-y),1e-10)))
the demo is below
import tensorflow as tf
import numpy as np
hidden_units = 50
num_class = 3
class Model():
def __init__(self,name_scope,is_custom):
self.name_scope = name_scope
self.is_custom = is_custom
self.input_x = tf.placeholder(tf.float32,[None,hidden_units])
self.input_y = tf.placeholder(tf.int32,[None])
self.instantiate_weights()
self.logits = self.inference()
self.predictions = tf.argmax(self.logits,axis=1)
self.losses,self.train_op = self.opitmizer()
def instantiate_weights(self):
with tf.variable_scope(self.name_scope + 'FC'):
self.W = tf.get_variable('W',[hidden_units,num_class])
self.b = tf.get_variable('b',[num_class])
self.cost_matrix = tf.constant(
np.array([[0,1,100],[1,0,100],[20,5,0]]),
dtype = tf.float32
)
def inference(self):
return tf.matmul(self.input_x,self.W) + self.b
def opitmizer(self):
if not self.is_custom:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits\
(labels=self.input_y,logits=self.logits)
else:
batch_cost_matrix = tf.nn.embedding_lookup(
self.cost_matrix,self.input_y
)
loss = - tf.log(1 - tf.nn.softmax(self.logits))\
* batch_cost_matrix
train_op = tf.train.AdamOptimizer().minimize(loss)
return loss,train_op
import random
batch_size = 128
norm_model = Model('norm',False)
custom_model = Model('cost',True)
split_point = int(0.9 * dataset_size)
train_set = datasets[:split_point]
test_set = datasets[split_point:]
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(100):
batch_index = random.sample(range(split_point),batch_size)
train_batch = train_set[batch_index]
train_labels = lables[batch_index]
_,eval_predict,eval_loss = sess.run([norm_model.train_op,
norm_model.predictions,norm_model.losses],
feed_dict={
norm_model.input_x:train_batch,
norm_model.input_y:train_labels
})
_,eval_predict1,eval_loss1 = sess.run([custom_model.train_op,
custom_model.predictions,custom_model.losses],
feed_dict={
custom_model.input_x:train_batch,
custom_model.input_y:train_labels
})
# print 'norm',eval_predict,'\ncustom',eval_predict1
print np.sum(((eval_predict == train_labels)==True).astype(np.int)),\
np.sum(((eval_predict1 == train_labels)==True).astype(np.int))
if i%10 == 0:
print 'norm_test',sess.run(norm_model.predictions,
feed_dict={
norm_model.input_x:test_set,
norm_model.input_y:lables[split_point:]
})
print 'custom_test',sess.run(custom_model.predictions,
feed_dict={
custom_model.input_x:test_set,
custom_model.input_y:lables[split_point:]
})

implementation a simple siamese network on Tensorflow

I want to implement a Siamese MLP network using mnist dataset.
I built my code based on Keras mnist_siamese_graph, but error value and accuracy are very huge compare to Keras version.
I cannot figure out where are problems.
This is my code:
import random
import numpy as np
import time
import tensorflow as tf
import input_data
mnist = input_data.read_data_sets("/tmp/data",one_hot=False)
import pdb
def create_pairs(x, digit_indices):
'''Positive and negative pair creation.
Alternates between positive and negative pairs.
'''
pairs = []
labels = []
n = min([len(digit_indices[d]) for d in range(10)]) - 1
for d in range(10):
for i in range(n):
z1, z2 = digit_indices[d][i], digit_indices[d][i+1]
pairs += [[x[z1], x[z2]]]
inc = random.randrange(1, 10)
dn = (d + inc) % 10
z1, z2 = digit_indices[d][i], digit_indices[dn][i]
pairs += [[x[z1], x[z2]]]
labels += [1, 0]
return np.array(pairs), np.array(labels)
def mlp(input_,input_dim,output_dim,name="mlp"):
with tf.variable_scope(name):
w = tf.get_variable('w',[input_dim,output_dim],tf.float32,tf.random_normal_initializer())
return tf.nn.relu(tf.matmul(input_,w))
def build_model_mlp(X_,_dropout):
model = mlpnet(X_,_dropout)
return model
def mlpnet(image,_dropout):
l1 = mlp(image,784,128,name='l1')
l1 = tf.nn.dropout(l1,_dropout)
l2 = mlp(l1,128,128,name='l2')
l2 = tf.nn.dropout(l2,_dropout)
l3 = mlp(l2,128,128,name='l3')
return l3
def contrastive_loss(y,d):
tmp= y *tf.square(d)
#tmp= tf.mul(y,tf.square(d))
tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0))
return tf.reduce_sum(tmp +tmp2)/batch_size/2
def compute_accuracy(prediction,labels):
return labels[prediction.ravel() < 0.5].mean()
#return tf.reduce_mean(labels[prediction.ravel() < 0.5])
def next_batch(s,e,inputs,labels):
input1 = inputs[s:e,0]
input2 = inputs[s:e,1]
y= np.reshape(labels[s:e],(len(range(s,e)),1))
return input1,input2,y
# Initializing the variables
init = tf.initialize_all_variables()
# the data, shuffled and split between train and test sets
X_train = mnist.train._images
y_train = mnist.train._labels
X_test = mnist.validation._images
y_test = mnist.validation._labels
batch_size =128
# create training+test positive and negative pairs
digit_indices = [np.where(y_train == i)[0] for i in range(10)]
tr_pairs, tr_y = create_pairs(X_train, digit_indices)
digit_indices = [np.where(y_test == i)[0] for i in range(10)]
te_pairs, te_y = create_pairs(X_test, digit_indices)
images_L = tf.placeholder(tf.float32,shape=([None,784]),name='L')
images_R = tf.placeholder(tf.float32,shape=([None,784]),name='R')
labels = tf.placeholder(tf.float32,shape=([None,1]),name='gt')
dropout_f = tf.placeholder("float")
with tf.variable_scope("siamese") as scope:
model1= build_model_mlp(images_L,dropout_f)
scope.reuse_variables()
model2 = build_model_mlp(images_R,dropout_f)
distance = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(model1,model2),2),1,keep_dims=True))
loss = contrastive_loss(labels,distance)
#contrastice loss
t_vars = tf.trainable_variables()
d_vars = [var for var in t_vars if 'l' in var.name]
batch = tf.Variable(0)
optimizer = tf.train.RMSPropOptimizer(0.001,momentum=0.9,epsilon=1e-6).minimize(loss)
# Launch the graph
with tf.Session() as sess:
#sess.run(init)
tf.initialize_all_variables().run()
# Training cycle
for epoch in range(40):
print('epoch %d' % epoch)
avg_loss = 0.
avg_acc = 0.
total_batch = int(X_train.shape[0]/batch_size)
start_time = time.time()
# Loop over all batches
for i in range(total_batch):
s = i * batch_size
e = (i+1) *batch_size
# Fit training using batch data
input1,input2,y =next_batch(s,e,tr_pairs,tr_y)
_,loss_value,predict=sess.run([optimizer,loss,distance], feed_dict={images_L:input1,images_R:input2 ,labels:y,dropout_f:0.9})
tr_acc = compute_accuracy(predict,y)
avg_loss += loss_value
avg_acc +=tr_acc*100
#print('epoch %d loss %0.2f' %(epoch,avg_loss/total_batch))
duration = time.time() - start_time
print('epoch %d time: %f loss %0.2f acc %0.2f' %(epoch,duration,avg_loss/(total_batch),avg_acc/total_batch))
y = np.reshape(tr_y,(tr_y.shape[0],1))
predict=distance.eval(feed_dict={images_L:tr_pairs[:,0],images_R:tr_pairs[:,1],labels:y,dropout_f:1.0})
tr_acc = compute_accuracy(predict,y)
print('Accuract training set %0.2f' % (100 * tr_acc))