BERT for multi label text classification always has a similar val accuracy despite fine tuning - tensorflow

I have a dataset of German news articles that I need to classify in my job. Since it is imbalanced, I am focussing on only 12 of 30 labels currently. Therefore I tried to balance the dataset by oversampling enhanced with data augmentation. Each sample can belong to multiple categories, thus it is a multi label problem.
The train dataset contains about 127.000 samples.
I am using a German BERT model with Tensorflow but despite fine tuning and even adding new layers, my val accuracy is always about 65%. Sometimes 67 to 68 but never higher. I wondered if my code is maybe broken or if it is due to the dataset.
Here is what I have right now:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
transformer_model = TFAutoModel.from_pretrained("dbmdz/bert-base-german-cased", output_hidden_states=False)
def multi_label_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
"""For multi-label classification, one has to define a custom
acccuracy function because neither tf.keras.metrics.Accuracy nor
tf.keras.metrics.CategoricalAccuracy evaluate the number of
exact matches.
:Example:
>>> from tensorflow.keras import metrics
>>> y_true = tf.convert_to_tensor([[1., 1.]])
>>> y_pred = tf.convert_to_tensor([[1., 0.]])
>>> metrics.Accuracy()(y_true, y_pred).numpy()
0.5
>>> metrics.CategoricalAccuracy()(y_true, y_pred).numpy()
1.0
>>> multi_label_accuracy(y_true, y_pred).numpy()
0.0
"""
y_pred = tf.math.sigmoid(y_pred)
y_pred = tf.math.round(y_pred)
exact_matches = tf.math.reduce_all(y_pred == y_true, axis=1)
exact_matches = tf.cast(exact_matches, tf.float32)
return tf.math.reduce_mean(exact_matches)
def f1_score(y_true, y_logit):
'''
Calculate F1 score
y_true: true value
y_logit: predicted value
'''
y_logit = tf.math.sigmoid(y_logit)
true_positives = K.sum(K.round(K.clip(y_true * y_logit, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
predicted_positives = K.sum(K.round(K.clip(y_logit, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return (2 * precision * recall) / (precision + recall + K.epsilon())
for l in transformer_model.layers:
l.trainable = True
bert = transformer_model.layers[0]
input_ids = tf.keras.layers.Input(shape=(60,), name='input_ids', dtype=np.int32)
attention_masks = tf.keras.layers.Input(shape=(60,), name='attention_masks', dtype=np.int32)
bert_model = bert(input_ids, attention_mask=attention_masks)[0][:, 0, :]
dropout = tf.keras.layers.Dropout(0.2, name="pooled_output")
pooled_output = dropout(bert_model)
dense = tf.keras.layers.Dense(units=256, activation="sigmoid")(pooled_output)
dropout2 = tf.keras.layers.Dropout(0.2)(dense)
dense2 = tf.keras.layers.Dense(units=64, activation="relu")(dropout2)
output = tf.keras.layers.Dense(units=12, name="output")(dense2)
model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
print("Compile model...", flush=True)
optimizer = Adam(learning_rate=1e-5, decay=1e-6)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=optimizer, metrics=[f1_score, multi_label_accuracy]
)
history = model.fit([dataset['train']['bert'], dataset['train']['bert2']], dataset['train']['outputs'], epochs=4, batch_size=64, validation_data=([dataset['val']['bert'], dataset['val']['bert2']], dataset['val']['outputs']))
I would expect the val accuracy to change a lot more by changing the architecture of the model.

Related

Converting Tensorflow code to Pytorch - performance metrics very different

I have converted a tensorflow code for timeseries analysis to pytorch and performance difference is very high, in fact pytorch layers cannot account for seasonality at all. It feels like I must be missing something important.
Please help find where the pytorch code is lacking that the learning is not up to the par. I noticed that loss values has high jumps when it encounters the season change and is not learning that. With the same layers, nodes and every other thing, I imagined the performance to be close.
# tensorflow code
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
dataset = tf.data.Dataset.from_tensor_slices(series)
dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_size + 1))
dataset = dataset.shuffle(shuffle_buffer).map(lambda window: (window[:-1], window[-1]))
dataset = dataset.batch(batch_size).prefetch(1)
return dataset
dataset = windowed_dataset(x_train, window_size, batch_size, shuffle_buffer_size)
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(100, input_shape=[window_size], activation="relu"),
tf.keras.layers.Dense(10, activation="relu"),
tf.keras.layers.Dense(1)
])
model.compile(loss="mse", optimizer=tf.keras.optimizers.SGD(lr=1e-6, momentum=0.9))
model.fit(dataset,epochs=100,verbose=0)
forecast = []
for time in range(len(series) - window_size):
forecast.append(model.predict(series[time:time + window_size][np.newaxis]))
forecast = forecast[split_time-window_size:]
results = np.array(forecast)[:, 0, 0]
plt.figure(figsize=(10, 6))
plot_series(time_valid, x_valid)
plot_series(time_valid, results)
tf.keras.metrics.mean_absolute_error(x_valid, results).numpy()
# pytorch code
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000
class tsdataset(Dataset):
def __init__(self, series, window_size):
self.series = series
self.window_size = window_size
self.dataset, self.labels = self.preprocess()
def preprocess(self):
series = self.series
final, labels = [], []
for i in range(len(series)-self.window_size):
final.append(np.array(series[i:i+window_size]))
labels.append(np.array(series[i+window_size]))
return torch.from_numpy(np.array(final)), torch.from_numpy(np.array(labels))
def __getitem__(self,index):
# print(self.dataset[index], self.labels[index], index)
return self.dataset[index], self.labels[index]
def __len__(self):
return len(self.dataset)
train_dataset = tsdataset(x_train, window_size)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
class tspredictor(nn.Module):
def __init__(self, window_size, out1, out2, out3):
super(tspredictor, self).__init__()
self.l1 = nn.Linear(window_size, out1)
self.l2 = nn.Linear(out1, out2)
self.l3 = nn.Linear(out2, out3)
def forward(self,seq):
l1 = F.relu(self.l1(seq))
l2 = F.relu(self.l2(l1))
l3 = self.l3(l2)
return l3
model = tspredictor(20, 100,10,1)
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-6, momentum=0.9)
for epoch in range(100):
for t,l in train_dataloader:
model.zero_grad()
tag_scores = model(t)
loss = loss_function(tag_scores, l)
loss.backward()
optimizer.step()
# print("Epoch is {}, loss is {}".format(epoch, loss.data))
forecast = []
for time in range(len(series) - window_size):
prediction = model(torch.from_numpy(series[time:time + window_size][np.newaxis]))
forecast.append(prediction)
forecast = forecast[split_time-window_size:]
results = np.array(forecast)
plt.figure(figsize=(10, 6))
plot_series(time_valid, x_valid)
plot_series(time_valid, results)
To generate data, you can use:
def plot_series(time, series, format="-", start=0, end=None):
plt.plot(time[start:end], series[start:end], format)
plt.xlabel("Time")
plt.ylabel("Value")
plt.grid(False)
def trend(time, slope=0):
return slope * time
def seasonal_pattern(season_time):
"""Just an arbitrary pattern, you can change it if you wish"""
return np.where(season_time < 0.1,
np.cos(season_time * 6 * np.pi),
2 / np.exp(9 * season_time))
def seasonality(time, period, amplitude=1, phase=0):
"""Repeats the same pattern at each period"""
season_time = ((time + phase) % period) / period
return amplitude * seasonal_pattern(season_time)
def noise(time, noise_level=1, seed=None):
rnd = np.random.RandomState(seed)
return rnd.randn(len(time)) * noise_level
time = np.arange(10 * 365 + 1, dtype="float32")
baseline = 10
series = trend(time, 0.1)
baseline = 10
amplitude = 40
slope = 0.005
noise_level = 3
# Create the series
series = baseline + trend(time, slope) + seasonality(time, period=365, amplitude=amplitude)
# Update with noise
series += noise(time, noise_level, seed=51)
split_time = 3000
time_train = time[:split_time]
x_train = series[:split_time]
time_valid = time[split_time:]
x_valid = series[split_time:]
There was a broadcasting issue in the loss function. Changing to the loss to one below fixes it:
loss = loss_function(tag_scores, l.view(-1,1))

Weighted Pixel Wise Categorical Cross Entropy for Semantic Segmentation

I have recently started learning about Semantic Segmentation. I am trying to train a UNet for the same. My input is RGB 128x128x3 images. My masks are made up of 4 classes 0, 1, 2, 3 and are One-Hot Encoded with dimension 128x128x4.
def weighted_cce(y_true, y_pred):
weights = []
t_inf = tf.convert_to_tensor(1e9, dtype = 'float32')
t_zero = tf.convert_to_tensor(0, dtype = 'int64')
for i in range(0, 4):
l = tf.argmax(y_true, axis = -1) == i
n = tf.cast(tf.math.count_nonzero(l), 'float32') + K.epsilon()
weights.append(n)
weights = [batch_size/j for j in weights]
y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
# clip to prevent NaN's and Inf's
y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
# calc
loss = y_true * K.log(y_pred) * weights
loss = -K.sum(loss, -1)
return loss
This is the loss function that I am using but it classifies every pixel as 2. What am I doing wrong?
You should have weights based on you entire data (unless your batch size is reasonably big so you have sort of stable weights).
If some class is underrepresented, with a small batch size, it will have near infinity weights.
If your target data is numpy array:
shp = y_train.shape
totalPixels = shp[0] * shp[1] * shp[2]
weights = np.sum(y_train, axis=(0, 1, 2)) #final shape (4,)
weights = totalPixels/weights
If your data is in a Sequence generator:
totalPixels = 0
counts = np.zeros((4,))
for i in range(len(generator)):
x, y = generator[i]
shp = y.shape
totalPixels += shp[0] * shp[1] * shp[2]
counts = counts + np.sum(y, axis=(0,1,2))
weights = totalPixels / counts
If your data is in a yield generator (you must know how many batches you have in an epoch):
for i in range(batches_per_epoch):
x, y = next(generator)
#the rest is equal to the Sequence example above
Attempt 1
I don't know if newer versions of Keras are able to handle this, but you can try the simplest approach first: simply call fit or fit_generator with the class_weight argument:
model.fit(...., class_weight = {0: weights[0], 1: weights[1], 2: weights[2], 3: weights[3]})
Attempt 2
Make a healthier loss function:
weights = weights.reshape((1,1,1,4))
kWeights = K.constant(weights)
def weighted_cce(y_true, y_pred):
yWeights = kWeights * y_pred #shape (batch, 128, 128, 4)
yWeights = K.sum(yWeights, axis=-1) #shape (batch, 128, 128)
loss = K.categorical_crossentropy(y_true, y_pred) #shape (batch, 128, 128)
wLoss = yWeights * loss
return K.sum(wLoss, axis=(1,2))

Why Spark ML perceptron classifier has high F1-score while the same model on TensorFlow performs very badly?

Our team is working on a NLP problem. We have a dataset with some labeled sentences and we must classify them into two classes, 0 or 1.
We preprocess the data and use word embeddings so that we have 300 features for each sentence, then we use a simple neural network to train the model.
Since the data are very skewed we measure the model score with the F1-score, computing it both on the train set (80%) and the test set (20%).
Spark
We used the multilayer perceptron classifier featured in PySpark's MLlib:
layers = [300, 600, 2]
trainer = MultilayerPerceptronClassifier(featuresCol='features', labelCol='target',
predictionCol='prediction', maxIter=10, layers=layers,
blockSize=128)
model = trainer.fit(train_df)
result = model.transform(test_df)
predictionAndLabels = result.select("prediction", "target").withColumnRenamed("target", "label")
evaluator = MulticlassClassificationEvaluator(metricName="f1")
f1_score = evaluator.evaluate(predictionAndLabels)
This way we get F1-scores ranging between 0.91 and 0.93.
TensorFlow
We then chose to switch (mainly for learning purpose) to TensorFlow, so we implemented a neural network using the same architecture and formulas of the MLlib's one:
# Network Parameters
n_input = 300
n_hidden_1 = 600
n_classes = 2
# TensorFlow graph input
features = tf.placeholder(tf.float32, shape=(None, n_input), name='inputs')
labels = tf.placeholder(tf.float32, shape=(None, n_classes), name='labels')
# Initializes weights and biases
init_biases_and_weights()
# Layers definition
layer_1 = tf.add(tf.matmul(features, weights['h1']), biases['b1'])
layer_1 = tf.nn.sigmoid(layer_1)
out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
out_layer = tf.nn.softmax(out_layer)
# Optimizer definition
learning_rate_ph = tf.placeholder(tf.float32, shape=(), name='learning_rate')
loss_function = tf.losses.log_loss(labels=labels, predictions=out_layer)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate_ph).minimize(loss_function)
# Start TensorFlow session
init = tf.global_variables_initializer()
tf_session = tf.InteractiveSession()
tf_session.run(init)
# Train Neural Network
learning_rate = 0.01
iterations = 100
batch_size = 256
total_batch = int(len(y_train) / batch_size)
for epoch in range(iterations):
avg_cost = 0.0
for block in range(total_batch):
batch_x = x_train[block * batch_size:min(block * batch_size + batch_size, len(x_train)), :]
batch_y = y_train[block * batch_size:min(block * batch_size + batch_size, len(y_train)), :]
_, c = tf_session.run([optimizer, loss_function], feed_dict={learning_rate_ph: learning_rate,
features: batch_x,
labels: batch_y})
avg_cost += c
avg_cost /= total_batch
print("Iteration " + str(epoch + 1) + " Logistic-loss=" + str(avg_cost))
# Make predictions
predictions_train = tf_session.run(out_layer, feed_dict={features: x_train, labels: y_train})
predictions_test = tf_session.run(out_layer, feed_dict={features: x_test, labels: y_test})
# Compute F1-score
f1_score = f1_score_tf(y_test, predictions_test)
Support functions:
def initialize_weights_and_biases():
global weights, biases
epsilon_1 = sqrt(6) / sqrt(n_input + n_hidden_1)
epsilon_2 = sqrt(6) / sqrt(n_classes + n_hidden_1)
weights = {
'h1': tf.Variable(tf.random_uniform([n_input, n_hidden_1],
minval=0 - epsilon_1, maxval=epsilon_1, dtype=tf.float32)),
'out': tf.Variable(tf.random_uniform([n_hidden_1, n_classes],
minval=0 - epsilon_2, maxval=epsilon_2, dtype=tf.float32))
}
biases = {
'b1': tf.Variable(tf.constant(1, shape=[n_hidden_1], dtype=tf.float32)),
'out': tf.Variable(tf.constant(1, shape=[n_classes], dtype=tf.float32))
}
def f1_score_tf(actual, predicted):
actual = np.argmax(actual, 1)
predicted = np.argmax(predicted, 1)
tp = tf.count_nonzero(predicted * actual)
fp = tf.count_nonzero(predicted * (actual - 1))
fn = tf.count_nonzero((predicted - 1) * actual)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
return tf.Tensor.eval(f1)
This way we get F1-scores ranging between 0.24 and 0.25.
Question
The only differences that I can see between the two neural networks are:
Optimizer: L-BFGS in Spark, Gradient Descent in TensorFlow
Weights and biases initialization: Spark makes its own initialization while we initialize them manually in TensorFlow
I don't think that these two parameters can cause a so big difference in performance between the models, but still Spark seems to get very high scores in very few iterations.
I can't understand if TensorFlow is performing very bad or maybe Spark's scores are not truthful. And in both cases I think we aren't seeing something important.
Initializing weights as uniform and bias as 1 is certainly not a good idea, and it may very well be the cause of this discrepancy.
Use normal or truncated_normal instead, with the default zero mean and a small variance for the weights:
weights = {
'h1': tf.Variable(tf.truncated_normal([n_input, n_hidden_1],
stddev=0.01, dtype=tf.float32)),
'out': tf.Variable(tf.truncated_normal([n_hidden_1, n_classes],
stddev=0.01, dtype=tf.float32))
}
and zero for the biases:
biases = {
'b1': tf.Variable(tf.constant(0, shape=[n_hidden_1], dtype=tf.float32)),
'out': tf.Variable(tf.constant(0, shape=[n_classes], dtype=tf.float32))
}
That said, I am not sure about the correctness of using the MulticlassClassificationEvaluator for a binary classification problem, and I would suggest doing some further manual checks to confirm that the function indeed returns what you think it returns...

Differing results for MNIST autoencoder due to different placement of activation function

I stumbled across a strange phenomenon while playing around with variational autoencoders. The problem is quite simple to describe:
When defining the loss function for the VAE, you have to use some kind of reconstruction error. I decided to use my own implementation of cross-entropy, as I wasn't able to get reasonable results with any function provided by tensorflow. It looks like this:
x_hat = tf.contrib.layers.fully_connected(fc2,
input_dim,
activation_fn=tf.sigmoid)
## Define the loss
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + x_hat) +
(1 - x) * tf.log(epsilon + 1 - x_hat),
axis=1)
It uses the output of the reconstructed layer, which applies the sigmoid function to get it to the [0; 1] range. Now, I wanted to apply the sigmoid within the loss function and changed it to
x_hat = tf.contrib.layers.fully_connected(fc2,
input_dim,
activation_fn=None)
## Define the loss
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + tf.sigmoid(x_hat)) +
(1 - x) * tf.log(epsilon + 1 - tf.sigmoid(x_hat)),
axis=1)
I'm convinced that this should provide nearly identical results. In practice, though, this second attempt results in weird grey pictures. The originals seem blurry and much brighter, too. First the okay version, then the alternative "wrong" version.
Can someone explain to me what causes this weird behavior?
If you want to test it yourself, below is my source code. You have to comment the respective blocks in or out to get the results. Thanks!
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import numpy as np
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)
n_samples = mnist.train.num_examples
input_dim = mnist.train.images[0].shape[0]
inter_dim = 256
encoding_dim = 5
epsilon = 1e-10
learning_rate = 1e-4
n_epochs = 20
batch_size = 100
width = 28
## Define the variational autoencoder model
x = tf.placeholder(dtype=tf.float32,
shape=[None, input_dim],
name='x')
fc1 = tf.contrib.layers.fully_connected(x,
inter_dim,
activation_fn=tf.nn.relu)
z_mean = tf.contrib.layers.fully_connected(fc1,
encoding_dim,
activation_fn=None)
z_log_var = tf.contrib.layers.fully_connected(fc1,
encoding_dim,
activation_fn=None)
eps = tf.random_normal(shape=tf.shape(z_log_var),
mean=0,
stddev=1,
dtype=tf.float32)
z = z_mean + tf.exp(z_log_var / 2) * eps
fc2 = tf.contrib.layers.fully_connected(z,
inter_dim,
activation_fn=tf.nn.relu)
x_hat = tf.contrib.layers.fully_connected(fc2,
input_dim,
activation_fn=tf.sigmoid)
#activation_fn=None)
## Define the loss
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + x_hat) +
(1 - x) * tf.log(epsilon + 1 - x_hat),
axis=1)
ALTERNATIVE LOSS W/ APPLYING SIGMOID, REMOVED ACTIVATION FROM OUTPUT LAYER
'''
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + tf.sigmoid(x_hat)) +
(1 - x) * tf.log(epsilon + 1 - tf.sigmoid(x_hat)),
axis=1)
'''
KL_div = -.5 * tf.reduce_sum(
1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),
axis=1)
total_loss = tf.reduce_mean(reconstruction_loss + KL_div)
## Define the training operator
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(total_loss)
## Run it
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(n_epochs):
for _ in range(n_samples // batch_size):
batch = mnist.train.next_batch(batch_size)
_, loss, recon_loss, KL_loss = sess.run([train_op,
total_loss,
reconstruction_loss,
KL_div],
feed_dict={x:batch[0]})
print('[Epoch {}] loss: {}'.format(epoch, loss))
print('Training Done')
## Reconstruct a few samples to validate the training
batch = mnist.train.next_batch(100)
x_reconstructed = sess.run(x_hat, feed_dict={x:batch[0]})
n = np.sqrt(batch_size).astype(np.int32)
I_reconstructed = np.empty((width*n, 2*width*n))
for i in range(n):
for j in range(n):
x = np.concatenate(
(x_reconstructed[i*n+j, :].reshape(width, width),
batch[0][i*n+j, :].reshape(width, width)),
axis=1
)
I_reconstructed[i*width:(i+1)*width, j*2*width:(j+1)*2*width] = x
fig = plt.figure()
plt.imshow(I_reconstructed, cmap='gray')
EDIT1: SOLUTION
Thanks to #xdurch0, I was made aware of the fact that the reconstructed output is no longer rescaled via the sigmoid function. That means the sigmoid has to be applied on the image before plotting it. Just modify the output:
x_reconstructed = sess.run(tf.sigmoid(x_hat), feed_dict={x:batch[0]})

Cost-sensitive loss function in Tensorflow

I'm doing research for cost-sensitive neural network based on Tensorflow. But because of the static graph structure of Tensorflow. Some NN structure couldn't be realized by myself.
My loss function(cost) ,cost matrix and the computational progress is described as follow and my target is to compute the total cost and then optimize the NN :
Approximately computational progress:
the y_ is the last full-connect output of a CNN which has shape (1024,5)
the y is a Tensor which has shape(1024) and indicates the ground truth of x[i]
the y_soft[i] [j] indicates the probability of x[i] to be class j
How can I realize this in Tensorflow?
cost_matrix:
[[0,1,100],
[1,0,1],
[1,20,0]]
label:
[1,2]
y*:
[[0,1,0],
[0,0,1]]
y(prediction):
[[0.2,0.3,0.5],
[0.1,0.2,0.7]]
label,cost_matrix-->cost_embedding:
[[1,0,1],
[1,20,0]]
It obvious 0.3 in [0.2,0.3,0.5] refers to right lable probility of [0,1,0], so it should not contibute to loss.
0.7 in [0.1,0.2,0.7] is the same. In other words, the pos with value 1 in y* not contibute to loss.
So I have (1-y*):
[[1,0,1],
[1,1,0]]
Then the entropy is target*log(predict) + (1-target) * log(1-predict),and value 0 in y*,should use (1-target)*log(1-predict), so I use (1-predict) said (1-y)
1-y:
[[0.8,*0.7*,0.5],
[0.9,0.8,*0.3*]]
(italic num is useless)
the custom loss is
[[1,0,1], [1,20,0]] * log([[0.8,0.7,0.5],[0.9,0.8,0.3]]) *
[[1,0,1],[1,1,0]]
and you can see the (1-y*) can be drop here
so the loss is -tf.reduce_mean(cost_embedding*log(1-y))
,to make it applicable , should be:
-tf.reduce_mean(cost_embedding*log(tf.clip((1-y),1e-10)))
the demo is below
import tensorflow as tf
import numpy as np
hidden_units = 50
num_class = 3
class Model():
def __init__(self,name_scope,is_custom):
self.name_scope = name_scope
self.is_custom = is_custom
self.input_x = tf.placeholder(tf.float32,[None,hidden_units])
self.input_y = tf.placeholder(tf.int32,[None])
self.instantiate_weights()
self.logits = self.inference()
self.predictions = tf.argmax(self.logits,axis=1)
self.losses,self.train_op = self.opitmizer()
def instantiate_weights(self):
with tf.variable_scope(self.name_scope + 'FC'):
self.W = tf.get_variable('W',[hidden_units,num_class])
self.b = tf.get_variable('b',[num_class])
self.cost_matrix = tf.constant(
np.array([[0,1,100],[1,0,100],[20,5,0]]),
dtype = tf.float32
)
def inference(self):
return tf.matmul(self.input_x,self.W) + self.b
def opitmizer(self):
if not self.is_custom:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits\
(labels=self.input_y,logits=self.logits)
else:
batch_cost_matrix = tf.nn.embedding_lookup(
self.cost_matrix,self.input_y
)
loss = - tf.log(1 - tf.nn.softmax(self.logits))\
* batch_cost_matrix
train_op = tf.train.AdamOptimizer().minimize(loss)
return loss,train_op
import random
batch_size = 128
norm_model = Model('norm',False)
custom_model = Model('cost',True)
split_point = int(0.9 * dataset_size)
train_set = datasets[:split_point]
test_set = datasets[split_point:]
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(100):
batch_index = random.sample(range(split_point),batch_size)
train_batch = train_set[batch_index]
train_labels = lables[batch_index]
_,eval_predict,eval_loss = sess.run([norm_model.train_op,
norm_model.predictions,norm_model.losses],
feed_dict={
norm_model.input_x:train_batch,
norm_model.input_y:train_labels
})
_,eval_predict1,eval_loss1 = sess.run([custom_model.train_op,
custom_model.predictions,custom_model.losses],
feed_dict={
custom_model.input_x:train_batch,
custom_model.input_y:train_labels
})
# print 'norm',eval_predict,'\ncustom',eval_predict1
print np.sum(((eval_predict == train_labels)==True).astype(np.int)),\
np.sum(((eval_predict1 == train_labels)==True).astype(np.int))
if i%10 == 0:
print 'norm_test',sess.run(norm_model.predictions,
feed_dict={
norm_model.input_x:test_set,
norm_model.input_y:lables[split_point:]
})
print 'custom_test',sess.run(custom_model.predictions,
feed_dict={
custom_model.input_x:test_set,
custom_model.input_y:lables[split_point:]
})