Masking zero-padding embeddings (and returning zero gradients) in TensorFlow as in PyTorch - tensorflow

I'm trying to recreate the PoolNet from Spotlight with the BPR loss in TensorFlow, but I can't get the same results. Below is the model I'm using (it's an Estimator model_fn).
import numpy as np
import tensorflow as tf

def _pooling_model_fn(features, labels, mode, params):
    with tf.name_scope('inputs'):
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            users_prev_items_inputs_train = features['item_seqs']
        elif mode == tf.estimator.ModeKeys.PREDICT:
            users_prev_items_inputs_train = tf.reshape(features['item_seqs'], [1, -1])

    with tf.device('/cpu:0'):
        prod_embeddings = tf.keras.layers.Embedding(params["num_items"], params["item_emb_size"], mask_zero=True)
        item_biases = tf.keras.layers.Embedding(params["num_items"], 1, mask_zero=True,
                                                embeddings_initializer=tf.keras.initializers.Zeros())
        prod_embed = prod_embeddings(users_prev_items_inputs_train)
        targets = tf.transpose(prod_embed, [0, 2, 1])

    # PoolNet-style running average: pad one leading zero step, then divide the
    # cumulative sum of embeddings by the cumulative count of non-padding entries.
    sequence_embeddings = tf.expand_dims(targets, axis=3)
    sequence_embeddings = tf.pad(sequence_embeddings, paddings=tf.constant([[0, 0], [0, 0], [1, 0], [0, 0]]))
    sequence_embedding_sum = tf.cumsum(sequence_embeddings, 2)
    non_padding_entries = tf.cumsum(tf.cast(tf.not_equal(sequence_embeddings, tf.constant(0.0)), tf.float32), 2)  # .expand_as(sequence_embedding_sum)
    user_representations = tf.squeeze((sequence_embedding_sum / (non_padding_entries + 1)), [3])
    user_representations_so_far = user_representations[:, :, :-1]
    user_representations_new = user_representations[:, :, -1]

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        global_step = tf.contrib.framework.get_or_create_global_step()
        with tf.name_scope('loss'):
            negative_samples = features['neg_samp']
            with tf.device('/cpu:0'):
                prod_embed_pos = prod_embeddings(users_prev_items_inputs_train)
                target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))
                prod_bias_pos = item_biases(users_prev_items_inputs_train)
                target_bias_positive = tf.squeeze(prod_bias_pos)
            dot_positive = tf.reduce_sum(user_representations_so_far * target_embedding_positive, 1) + target_bias_positive
            with tf.device('/cpu:0'):
                prod_embed_neg = prod_embeddings(negative_samples)
                target_embedding_negative = tf.squeeze(tf.transpose(prod_embed_neg, [0, 2, 1]))
                prod_bias_neg = item_biases(negative_samples)
                target_bias_negative = tf.squeeze(prod_bias_neg)
            dot_negative = tf.reduce_sum(user_representations_so_far * target_embedding_negative, 1) + target_bias_negative
            mask = tf.not_equal(users_prev_items_inputs_train, 0)
            loss = bpr_loss(dot_positive, dot_negative, mask)

    if mode == tf.estimator.ModeKeys.TRAIN:
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=params["lr"])
            train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Score every item against the user's latest representation.
        item_ids = np.arange(params['num_items']).reshape(-1, 1)
        item_ids_tensor = tf.convert_to_tensor(item_ids, dtype=tf.int64)
        with tf.device('/cpu:0'):
            prod_embed_pos = prod_embeddings(item_ids_tensor)  # tf.nn.embedding_lookup(prod_embeddings, item_ids_tensor)
            target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))
            prod_bias_pos = item_biases(item_ids_tensor)  # tf.nn.embedding_lookup(item_biases, item_ids_tensor)
            target_bias_positive = tf.squeeze(prod_bias_pos)
        dot_positive = tf.reduce_sum(user_representations_new * target_embedding_positive, 1) + target_bias_positive
        predictions = {
            'products': tf.reshape(dot_positive, [1, -1])
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)
and the loss function
def bpr_loss(positive_predictions, negative_predictions, mask):
    loss1 = 1.0 - tf.nn.sigmoid(positive_predictions - negative_predictions)
    if mask is not None:
        mask = tf.cast(mask, loss1.dtype)
        final_loss = loss1 * mask
        return tf.reduce_sum(final_loss) / tf.reduce_sum(mask)
    return tf.reduce_mean(loss1)
With the above model, I can't get the same predictions on the exact same dataset (and the same random seed) as I do with Spotlight. I've concluded that the problem lies with the zero-padding. The data is generated as follows:
[[0,0,0,5,6,98],
[0,62,15,4,8,47],
[0,0,5,9,6,3,41],
[78,21,2,56,1,3]]
The sequences have leading zero-padding so that every input sample has the same length.
Based on my code, I believed I had done everything needed to mask out these zeros: from the loss, from the embedding layer (using the mask_zero parameter from Keras), and from the averaging of the embeddings that I'm computing (using the cumsum). Still, after training, the zero-index embedding keeps changing; instead of being excluded, it is taken into account, influencing the other gradients and adding noise to my results.
PyTorch has a nice feature in its implementation of the Embedding layer where you can set padding_idx to the id of the pad, and that row is initialized with zeros. Moreover, the gradient of that index is always kept at zero. So basically, I'm trying to do the same thing in TensorFlow.
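For reference, this is the PyTorch behaviour I'm trying to reproduce (a minimal sketch with illustrative sizes):
import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=1000, embedding_dim=32, padding_idx=0)
out = emb(torch.tensor([[0, 5, 6, 98]]))
# emb.weight[0] is initialized to zeros and its gradient stays zero,
# so the padding row never moves during training.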
Any help would be appreciated.

I solved it using the following solution posted on TensorFlow's GitHub. It seems to work now.
mask_padding_zero_op = tf.scatter_update(lookup_table,
                                         PADDING_ID,
                                         tf.zeros([EMBEDDING_DIM,], dtype=DTYPE))

with tf.control_dependencies([mask_padding_zero_op]):
    # do embedding lookup...
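For completeness, here is a minimal sketch (variable names are illustrative) of how this fits into a TF 1.x graph. The padding row is reset to zeros before each lookup, so any gradient update the optimizer applied to it is discarded:
import tensorflow as tf

EMBEDDING_DIM = 32
NUM_ITEMS = 1000
PADDING_ID = 0

lookup_table = tf.get_variable(
    "item_embeddings", [NUM_ITEMS, EMBEDDING_DIM], dtype=tf.float32)

# Zero out the padding row before every lookup.
mask_padding_zero_op = tf.scatter_update(
    lookup_table, PADDING_ID, tf.zeros([EMBEDDING_DIM], dtype=tf.float32))

item_ids = tf.placeholder(tf.int64, [None, None])
with tf.control_dependencies([mask_padding_zero_op]):
    embedded = tf.nn.embedding_lookup(lookup_table, item_ids)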

Related

How to add a text preprocessing tokenization step into a TensorFlow model

I have a TensorFlow SavedModel, consisting of saved_model.pb and a variables folder. The preprocessing step has not been incorporated into this model, which is why I need to do the preprocessing (tokenization etc.) before feeding the data to the model for prediction.
I am looking for an approach by which I can incorporate the preprocessing step into the model. I have seen examples here and here; however, they deal with image data.
Just to give an idea of how the training part was done, this is the portion of the code we used for training (if you need the implementation of a function I have used here, please let me know; I did not include it to keep my question more understandable).
Training:
processor = IntentProcessor(FLAGS.data_path, FLAGS.test_data_path,
                            FLAGS.test_proportion, FLAGS.seed,
                            FLAGS.do_early_stopping)
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
run_config = tf.estimator.RunConfig(
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps)
train_examples = None
num_train_steps = None
num_warmup_steps = None
if FLAGS.do_train:
    train_examples = processor.get_train_examples()
    num_iter_per_epoch = int(len(train_examples) / FLAGS.train_batch_size)
    num_train_steps = num_iter_per_epoch * FLAGS.num_train_epochs
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=num_iter_per_epoch)
best_temperature = 1.0  # Initiate the best T value as 1.0; it is updated during training
model_fn = model_fn_builder(
    bert_config=bert_config,
    num_labels=len(processor.le.classes_),
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    best_temperature=best_temperature,
    seed=FLAGS.seed)
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config)
# add parameters by passing a params variable
if FLAGS.do_train:
    train_features = convert_examples_to_features(
        train_examples, FLAGS.max_seq_length, tokenizer)
    train_labels = processor.get_train_labels()
    train_input_fn = input_fn_builder(
        features=train_features,
        is_training=True,
        batch_size=FLAGS.train_batch_size,
        seed=FLAGS.seed,
        labels=train_labels
    )
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
And this is the preprocessing that I use for the training:
LABEL_LIST = ['negative', 'neutral', 'positive']
INTENT_MAP = {i: LABEL_LIST[i] for i in range(len(LABEL_LIST))}
BATCH_SIZE = 1
MAX_SEQ_LEN = 70

def convert_examples_to_features(texts, max_seq_length, tokenizer):
    """Loads a data file into a list of InputBatches.
    texts is the list of input texts.
    """
    features = {}
    input_ids_list = []
    input_mask_list = []
    segment_ids_list = []
    for (ex_index, text) in enumerate(texts):
        tokens_a = tokenizer.tokenize(str(text))
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        input_ids_list.append(input_ids)
        input_mask_list.append(input_mask)
        segment_ids_list.append(segment_ids)
    features['input_ids'] = np.asanyarray(input_ids_list)
    features['input_mask'] = np.asanyarray(input_mask_list)
    features['segment_ids'] = np.asanyarray(segment_ids_list)
    # tf.data.Dataset.from_tensor_slices needs to be passed numpy arrays, not
    # tensors, or the tensor graph (shape) should match
    return features
and inference would look like this:
def inference(texts, MODEL_DIR, VOCAB_FILE):
    if not isinstance(texts, list):
        texts = [texts]
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
    features = convert_examples_to_features(texts, MAX_SEQ_LEN, tokenizer)
    predict_fn = predictor.from_saved_model(MODEL_DIR)
    response = predict_fn(features)
    # print(response)
    return get_sentiment(response)

def preprocess(texts):
    if not isinstance(texts, list):
        texts = [texts]
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
    features = convert_examples_to_features(texts, MAX_SEQ_LEN, tokenizer)
    return features

def get_sentiment(response):
    idx = response['intent'].tolist()
    print(idx)
    print(INTENT_MAP.get(idx[0]))
    outputs = []
    for i in range(0, len(idx)):
        outputs.append({
            "sentiment": INTENT_MAP.get(idx[i]),
            "confidence": response['prob'][i][idx[i]]
        })
    return outputs

sentence = 'The movie is ok'
inference(sentence, args.model_path, args.vocab_path)
And this is the implementation of model_fn_builder:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, best_temperature, seed):
    """Returns multi-intents `model_fn` closure for Estimator"""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for Estimator."""
        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info(
                " name = %s, shape = %s" % (name, features[name].shape))
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        (total_loss, per_example_loss, logits) = create_intent_model(
            bert_config, is_training, input_ids, input_mask, segment_ids,
            labels, num_labels, mode, seed)
        tvars = tf.trainable_variables()
        initialized_variable_names = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names) = \
                modeling.get_assignment_map_from_checkpoint(
                    tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                            init_string)
        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:
            def metric_fn(per_example_loss, labels, logits):
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(labels, predictions)
                loss = tf.metrics.mean(per_example_loss)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss
                }

            eval_metrics = metric_fn(per_example_loss, labels, logits)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metrics)
        elif mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                'intent': tf.argmax(logits, axis=-1, output_type=tf.int32),
                'prob': tf.nn.softmax(logits / tf.constant(best_temperature)),
                'logits': logits
            }
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions)
        return output_spec

    return model_fn
And this is the implementation of create_intent_model:
def create_intent_model(bert_config, is_training, input_ids, input_mask,
                        segment_ids, labels, num_labels, mode, seed):
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=False,
        seed=seed
    )
    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value
    with tf.variable_scope("loss"):
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02, seed=seed))
        output_bias = tf.get_variable(
            "output_bias", [num_labels], initializer=tf.zeros_initializer())
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9, seed=seed)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        loss = None
        per_example_loss = None
        if mode == tf.estimator.ModeKeys.TRAIN or mode == \
                tf.estimator.ModeKeys.EVAL:
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(labels, depth=num_labels,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            loss = tf.reduce_mean(per_example_loss)
    return loss, per_example_loss, logits
These are the TensorFlow-related libraries:
tensorboard==1.15.0
tensorflow-estimator==1.15.1
tensorflow-gpu==1.15.0
There is good documentation here; however, it uses the Keras API. Plus, I don't know how I can incorporate a preprocessing layer here even with the Keras API.
Again, my final goal is to incorporate the preprocessing step into the model-building phase, so that when I later load the model I can directly pass The movie is ok to it.
I just need an idea of how to incorporate a preprocessing layer into this code, which is function-based.
Thanks in advance~
You can use the TextVectorization layer as follows. But to answer your question fully, I'd need to know what's in the model_fn_builder() function. I'll show how you can do this with the Keras model-building API.
class BertTextProcessor(tf.keras.layers.Layer):
    def __init__(self, max_length):
        super().__init__()
        self.max_length = max_length
        # Here I'm setting any preprocessing to None;
        # by default this layer lowers case and removes punctuation,
        # i.e. tokens like [CLS] would become cls
        self.vectorizer = tf.keras.layers.TextVectorization(
            output_sequence_length=max_length, standardize=None)

    def call(self, inputs):
        inputs = "[CLS] " + inputs + " [SEP]"
        tok_inputs = self.vectorizer(inputs)
        return {
            "input_ids": tok_inputs,
            "input_mask": tf.cast(tok_inputs != 0, 'int32'),
            "segment_ids": tf.zeros_like(tok_inputs)
        }

    def adapt(self, data):
        data = "[CLS] " + data + " [SEP]"
        self.vectorizer.adapt(data)

    def get_config(self):
        return {
            "max_length": self.max_length
        }
Usage,
input_str = tf.constant(["movie is okay good plot very nice", "terrible movie bad actors not good"])
proc = BertTextProcessor(8)
# You need to call this so that the vectorizer layer learns the vocabulary
proc.adapt(input_str)
print(proc(input_str))
which outputs,
{'input_ids': <tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[ 5, 2, 12, 9, 3, 8, 6, 11, 4, 0],
[ 5, 7, 2, 13, 14, 10, 3, 4, 0, 0]])>, 'input_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=int32)>, 'segment_ids': <tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}
You can use this layer as an input for a Keras model as you would use any layer.
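For instance, a minimal sketch (the downstream BERT body is omitted; the model and path names are illustrative) that bakes the preprocessing into a SavedModel:
text_input = tf.keras.Input(shape=(), dtype=tf.string)
bert_inputs = proc(text_input)  # dict of input_ids / input_mask / segment_ids
model = tf.keras.Model(text_input, bert_inputs)
model.save("model_with_preprocessing")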
You can also get the vocabulary using proc.vectorizer.get_vocabulary(), which returns,
['',
'[UNK]',
'movie',
'good',
'[SEP]',
'[CLS]',
'very',
'terrible',
'plot',
'okay',
'not',
'nice',
'is',
'bad',
'actors']
Alternative with tf-models-official
To get data in a format accepted by BERT, you can also use the tf-models-official library. Specifically, you can use the BertPackInputs object.
I recently updated code for one of my books and in Chapter 13/13.1_Spam_Classification you can see how it is used. The section Generating the correct input format for BERT shows how this could be done.
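A rough sketch of that approach (the API names follow the TF-NLP tutorials for recent tf-models-official releases and may differ slightly by version):
import tensorflow as tf
import tensorflow_models as tfm

# Tokenize raw strings, then pack them with [CLS]/[SEP] and padding.
tokenizer = tfm.nlp.layers.FastWordpieceBertTokenizer(
    vocab_file="vocab.txt", lower_case=True)
packer = tfm.nlp.layers.BertPackInputs(
    seq_length=70,
    special_tokens_dict=tokenizer.get_special_tokens_dict())

tokens = tokenizer(tf.constant(["The movie is ok"]))
bert_inputs = packer([tokens])  # dict: input_word_ids, input_mask, input_type_ids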
Edit: How to do this in tensorflow==1.15.0
In order to do this in TensorFlow 1.x you will need some reworking, as a lot of the functionality in the original answer is missing. Here's an example of how you can do this; you will need to adapt this code to your specific use case/method.
lookup_layer = tf.lookup.StaticHashTable(
    tf.lookup.TextFileInitializer(
        "vocab.txt", tf.string, tf.lookup.TextFileIndex.WHOLE_LINE,
        tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER, delimiter=" "),
    100
)
text = tf.constant(["bad film", "movie is okay good plot very nice", "terrible movie bad actors not good"])
text = "[CLS] " + text + " [SEP]"  # note the spaces, so the special tokens split off as their own tokens
text = tf.strings.split(text, result_type="RaggedTensor")
text_dense = text.to_tensor("[PAD]")
out = lookup_layer.lookup(text_dense)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(out))

Training MobileNetV3 SSDLite and poor performance

Hi guys, I'm a newbie in deep learning, and I tried to build my own object detection model, MobileNetV3 SSDLite, trained on the Pascal VOC 2007 dataset from TensorFlow Datasets (tfds).
However, its performance is poor (mAP 0.3~0.4), and I wonder why the mAP is so low.
The reasons I thought of:
Training data problem
#I use tfds pascal voc data
#train set
(train_dataset, train_dataset2), ds_info = tfds.load(name="voc/2007", split=["train", "validation"], with_info=True)
train_dataset = train_dataset.concatenate(train_dataset2)
#test set
val_dataset = tfds.load(name="voc/2007", split="test", with_info=False)
Is there any difference between the tfds voc/2007 data and the official VOC2007 dataset?
Detection model problem
I use an ImageNet-pretrained MobileNetV3-Large as the backbone and extract features from the ("multiply_11", "multiply_17") layers, whose resolutions are 19x19 and 10x10:
input_tensor = Input((300, 300, 3))
backbone = tf.keras.applications.MobileNetV3Large(include_top=False, alpha=0.75, input_tensor = input_tensor, input_shape = (300, 300, 3))
And I extract extra feature maps as follows:
def InvertedResidualBlock(filters, kernel_size, strides, padding):
    f1 = Conv2D(filters=filters // 2, kernel_size=1, strides=1,
                padding=padding, kernel_regularizer=l2(4e-5))
    f2 = BatchNormalization()
    f3 = ReLU(6.)
    f4 = SeparableConv2D(filters=filters,
                         kernel_size=kernel_size,
                         strides=strides,
                         padding=padding,
                         # depthwise_regularizer=l2(4e-5),
                         # pointwise_regularizer=l2(4e-5)
                         )
    f5 = BatchNormalization()
    f6 = ReLU(6.)
    return reduce(lambda f, g: lambda *args, **kwargs: g(f(*args, **kwargs)),
                  (f1, f2, f3, f4, f5, f6))

class HFPNeckBuilder():
    def __init__(self, config) -> None:
        self.isLite = config["model_config"]["neck"]["isLite"]
        if self.isLite:
            self.baseConvBlock = SeparableConvBlock
            self.baseConv = SeparableConv
        else:
            self.baseConvBlock = ConvBlock
            self.baseConv = Conv

    def __call__(self, ex_stage_output):
        Feature_map1 = ex_stage_output[0]
        Feature_map2 = ex_stage_output[-1]
        Feature_map3 = InvertedResidualBlock(filters=512, strides=2, kernel_size=3, padding="same")(Feature_map2)
        Feature_map4 = InvertedResidualBlock(filters=256, strides=2, kernel_size=3, padding="same")(Feature_map3)
        Feature_map5 = InvertedResidualBlock(filters=256, strides=2, kernel_size=3, padding="same")(Feature_map4)
        Feature_map6 = InvertedResidualBlock(filters=128, strides=2, kernel_size=3, padding="same")(Feature_map5)
        return [Feature_map1, Feature_map2, Feature_map3, Feature_map4, Feature_map5, Feature_map6]
Is there any problem in the backbone or the neck (the feature extractor mentioned in the MobileNetV2 paper)?
Optimizer and LR schedule problem
I experimented with every combination of the settings below (a sketch of one such setup follows the list):
Batch size: 32
Optimizer: SGD (momentum 0.9), Adam, RAdam, RAdam+LookAhead, RAdam+LookAhead+GradientCentralize
LR schedule: cosine decay, cosine restarts, cosine warmup and decay, with initial learning rates from 0.1 to 1e-4
Epochs: 150 ~ 5000
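For reference, a rough sketch of one such setting (SGD with momentum 0.9 plus plain cosine decay; the step counts and API names are assumptions based on TF 2.x Keras, not my exact training script):
import tensorflow as tf

steps_per_epoch = 157  # assumed: ~5011 VOC07 trainval images / batch size 32
schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.1, decay_steps=150 * steps_per_epoch)
optimizer = tf.keras.optimizers.SGD(learning_rate=schedule, momentum=0.9)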
But all experiments show poor mAP (0.3~0.4).
Loss Function Problem
I tried two loss functions: hard negative mining + smooth L1 loss, and focal loss + smooth L1 loss. I referred to the official Keras example code for the focal loss and to pierluigiferrari's GitHub for the hard negative mining.
Here is my focal loss code:
class SSDLoss(tf.losses.Loss):
    '''
    Loss with focal loss rather than hard negative mining. It is adapted from the Keras reference.
    Gamma sharpens the difference between good and bad detections; if gamma == 0 -> cross-entropy.
    Alpha is a weighting factor; if alpha = 0.25 -> background: 0.25, foreground: 0.75.
    '''

    def __init__(self, num_classes=80, alpha=0.25, gamma=2.0, config=None):
        super(SSDLoss, self).__init__(reduction="auto", name="SSDLoss")
        self.alpha = alpha
        self.gamma = gamma
        self._num_classes = num_classes
        self.anchor_boxes = AnchorBox(config).get_anchors()[None, ...]
        self._box_variance = [0.1, 0.1, 0.2, 0.2]

    def call(self, y_true, y_pred):
        y_pred = tf.cast(y_pred, dtype=tf.float32)
        box_labels = y_true[:, :, :4]
        box_predictions = y_pred[:, :, :4]
        cls_labels = tf.one_hot(
            tf.cast(y_true[:, :, 4], dtype=tf.int32),
            depth=self._num_classes,
            dtype=tf.float32,
        )
        cls_predictions = y_pred[:, :, 4:]
        positive_mask = tf.cast(tf.greater(y_true[:, :, 4], -1.0), dtype=tf.float32)
        ignore_mask = tf.cast(tf.equal(y_true[:, :, 4], -2.0), dtype=tf.float32)
        clf_loss = self.Focal_ClassificationLoss(cls_labels, cls_predictions)
        box_loss = self.SmoothL1_BoxLoss(box_labels, box_predictions)
        clf_loss = tf.where(tf.equal(ignore_mask, 1.0), 0.0, clf_loss)
        box_loss = tf.where(tf.equal(positive_mask, 1.0), box_loss, 0.0)
        normalizer = tf.reduce_sum(positive_mask, axis=-1)
        clf_loss = tf.math.divide_no_nan(tf.reduce_sum(clf_loss, axis=-1), normalizer)
        box_loss = tf.math.divide_no_nan(tf.reduce_sum(box_loss, axis=-1), normalizer)
        loss = clf_loss + box_loss
        return loss

    def SmoothL1_BoxLoss(self, y_true_Box, y_pred_box):
        difference = y_true_Box - y_pred_box
        absolute_difference = tf.abs(difference) - 0.5
        squared_difference = 0.5 * difference ** 2
        loss = tf.where(
            tf.less(absolute_difference, 1.0),
            squared_difference,
            absolute_difference)
        return tf.reduce_sum(loss, axis=-1)

    def Focal_ClassificationLoss(self, y_true_Cls, y_pred_Cls):
        cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true_Cls, logits=y_pred_Cls)
        probs = tf.nn.sigmoid(y_pred_Cls)
        alpha = tf.where(tf.equal(y_true_Cls, 1.0), self.alpha, (1.0 - self.alpha))
        pt = tf.where(tf.equal(y_true_Cls, 1.0), probs, 1 - probs)
        loss = alpha * tf.pow(1.0 - pt, self.gamma) * cross_entropy
        return tf.reduce_sum(loss, axis=-1)
I changed it a little bit from the Keras official website example code in "Object Detection with RetinaNet".
Question
Is there any advice for improving the mAP up to 0.6~0.7? Thank you for reading my question.

When I use an estimator with the same checkpoint to predict the same file several times, the predicted results vary

I'm using a TextCNN model with an estimator to classify some text. After training, the model is stored as checkpoints. But when I try to predict the same test file with the same checkpoint, the predicted results (probability and logits) vary slightly.
I have set dropout_keep_prob=1 in the dropout layer.
The checkpoint and the test file remain the same.
I used LoggingTensorHook to check the tensor values during prediction; the two runs begin to diverge at the max_pool step (at least the conv values are the same, but I am not sure).
import tensorflow as tf

def line_parser(line, vocab):
    def parse_content(record):
        items = record.decode().strip().split()
        cat = int(items[-1])
        tokens = items[:-1]
        token_length = len(tokens)
        if token_length > FLAGS.max_sequence_length:
            tokens = tokens[:FLAGS.max_sequence_length]
        if token_length < FLAGS.max_sequence_length:
            tokens += [FLAGS.pad_word] * (FLAGS.max_sequence_length - token_length)
        return [tokens, cat]

    result = tf.py_func(parse_content, [line], [tf.string, tf.int64])
    ids = vocab.lookup(result[0])
    ids = tf.cast(ids, tf.int64)
    ids = tf.reshape(ids, [FLAGS.max_sequence_length])
    label = tf.one_hot(result[1], FLAGS.num_classes, dtype=tf.int32)
    return [ids, label]

def predict_line_parser(line, vocab):
    def parse_content(record):
        feature = record.decode().strip()
        tokens = feature.split()
        token_length = len(tokens)
        if token_length > FLAGS.max_sequence_length:
            tokens = tokens[:FLAGS.max_sequence_length]
        if token_length < FLAGS.max_sequence_length:
            tokens += [FLAGS.pad_word] * (FLAGS.max_sequence_length - token_length)
        return [tokens]

    result = tf.py_func(parse_content, [line], [tf.string])
    ids = vocab.lookup(result[0])
    ids = tf.cast(ids, tf.int64)
    ids = tf.reshape(ids, [FLAGS.max_sequence_length])
    return ids

def train_input_fn(file_paths, batch_size):
    vocab = tf.contrib.lookup.index_table_from_file(FLAGS.vocab_path)
    dataset = tf.data.TextLineDataset(file_paths)
    dataset = dataset.map(lambda line: line_parser(line, vocab))
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(batch_size).repeat()
    return dataset

def eval_input_fn(file_paths, batch_size):
    vocab = tf.contrib.lookup.index_table_from_file(FLAGS.vocab_path)
    dataset = tf.data.TextLineDataset(file_paths)
    dataset = dataset.map(lambda line: line_parser(line, vocab))
    dataset = dataset.batch(batch_size=batch_size)
    return dataset

def predict_input_fn(file_paths, batch_size):
    vocab = tf.contrib.lookup.index_table_from_file(FLAGS.vocab_path)
    dataset = tf.data.TextLineDataset(file_paths)
    dataset = dataset.map(lambda line: predict_line_parser(line, vocab))
    dataset = dataset.batch(batch_size=batch_size)
    return dataset

def create_model(features, params):
    # projection from sentence with id to embedding
    embedding_inputs = tf.nn.embedding_lookup(params["embedding"], features)
    embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1)
    l2_loss = tf.constant(0.0, name="l2_loss", dtype="float64")
    # convolutional layer and pooling layer
    pooled_outputs = list()
    for i, filter_size in enumerate(params["filter_sizes"]):
        with tf.name_scope("conv_{}".format(filter_size)):
            filter_shape = [filter_size, params["embedding_size"], 1, params["num_filters"]]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1, dtype="float64"), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[params["num_filters"]], dtype="float64"), name="b")
            conv = tf.nn.conv2d(embedding_inputs, W, strides=[1, 1, 1, 1], padding="VALID",
                                use_cudnn_on_gpu=True, name="conv".format(filter_size))
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu".format(filter_size))
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, params["sequence_length"] - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="pool".format(filter_size)
            )
            pooled_outputs.append(pooled)
    # concatenate all feature vectors
    number_filters_total = params["num_filters"] * len(params["filter_sizes"])
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, number_filters_total])
    # dropout
    with tf.name_scope("dropout"):
        # h_drop = tf.nn.dropout(h_pool_flat, params["dropout_keep_prob"])
        h_drop = tf.nn.dropout(h_pool_flat, 1)
    # fully connected layer
    with tf.name_scope("output"):
        W = tf.Variable(
            tf.truncated_normal(shape=[number_filters_total, params["num_classes"]], stddev=0.1, dtype="float64"),
            name="W")
        b = tf.Variable(tf.constant(0.1, shape=[params["num_classes"]], dtype="float64"), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        logits = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
    return logits, l2_loss

def model_fn_builder():
    def text_cnn_model_fn(features, labels, mode, params):
        logits, l2_loss = create_model(features, params)

        # train mode branch
        if mode == tf.estimator.ModeKeys.TRAIN:
            # loss
            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
                loss = tf.reduce_mean(losses) + params["l2_reg_lambda"] * l2_loss
            # optimizer function
            with tf.name_scope("optimizer"):
                optimizer = tf.train.AdamOptimizer(params["learning_rate"])
                grads_and_vars = optimizer.compute_gradients(loss)
                train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        # eval mode branch
        if mode == tf.estimator.ModeKeys.EVAL:
            # loss
            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
                loss = tf.reduce_mean(losses) + params["l2_reg_lambda"] * l2_loss
            # predictions
            with tf.name_scope("prediction"):
                probability = tf.nn.softmax(logits, axis=1, name="probability")
                pred = tf.argmax(probability, axis=1, name="predictions")
            # metrics
            with tf.name_scope("metrics"):
                accuracy = tf.metrics.accuracy(labels=tf.argmax(labels, axis=1), predictions=pred)
                precision = tf.metrics.precision(labels=tf.argmax(labels, axis=1), predictions=pred)
                recall = tf.metrics.recall(labels=tf.argmax(labels, axis=1), predictions=pred)
                tf.summary.scalar("accuracy", accuracy[1])
                tf.summary.scalar("precision", precision[1])
                tf.summary.scalar("recall", recall[1])
                tf.summary.scalar("loss", loss)
                metrics = {"accuracy": accuracy, "precision": precision, "recall": recall}
            metric_hook = tf.train.LoggingTensorHook(
                {"f1-score": 2 * precision[1] * recall[1] / (precision[1] + recall[1]),
                 "precision": precision[1],
                 "recall": recall[1]}, every_n_iter=100)
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics,
                                              evaluation_hooks=[metric_hook])

        # predict mode branch
        if mode == tf.estimator.ModeKeys.PREDICT:
            # predictions
            with tf.name_scope("prediction"):
                probability = tf.nn.softmax(logits, axis=1, name="probability")
                pred = tf.argmax(probability, axis=1, name="predictions")
            predictions = {
                "class": pred,
                "probability": probability,
            }
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    return text_cnn_model_fn
I expect the two prediction runs to produce the same output, but it varies like the following:
first time
0\0.02336916147480053
0\0.29461604884471243
0\0.04555523004833724
1\0.5450933830551228
0\0.042727966035733034
0\0.032764190484837884
0\0.11542703615898613
0\0.12662708812885717
0\0.01605587344580832
0\0.006454832043875243
second time
0\0.03389085341620636
0\0.31563690653966603
0\0.06185060165562852
1\0.5891016184323346
0\0.07184752629327144
0\0.04355442431024522
0\0.16290306166502935
0\0.17214872864042816
0\0.02437323886282706
0\0.0109889405648392
Actually, I figured out this problem. The variation is caused by the word embedding vectors, which are generated randomly every time.
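A possible fix, as a sketch (assuming vocab_size and embedding_size are available in params): create the embedding as a graph variable inside create_model, so it is saved to and restored from the checkpoint instead of being regenerated outside the graph on every run:
# Inside create_model, replacing the params["embedding"] lookup:
embedding = tf.get_variable(
    "embedding",
    shape=[params["vocab_size"], params["embedding_size"]],
    dtype="float64",
    initializer=tf.random_uniform_initializer(-1.0, 1.0))
embedding_inputs = tf.nn.embedding_lookup(embedding, features)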

InvalidArgumentError (see above for traceback): indices[47,6] = 24 is not in [0, 23)

I am trying to run the following main.py file and I continuously get the error "InvalidArgumentError (see above for traceback): indices[138,4] = 23 is not in [0, 23)". I have checked my vocab file; it has exactly 23 words in it.
The code works fine for a single line of new data, but when the data is continuous or larger, this error pops up. Please help me rectify this issue.
Below is a small snippet of my code. The line word_embeddings = tf.nn.embedding_lookup(variable, word_ids) is where the error occurs.
def model_fn(features, labels, mode, params):
    # For serving features are a bit different
    if isinstance(features, dict):
        features = ((features['words'], features['nwords']),
                    (features['chars'], features['nchars']))

    # Read vocabs and inputs
    (words, nwords), (chars, nchars) = features
    dropout = params['dropout']
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    vocab_chars = tf.contrib.lookup.index_table_from_file(
        params['chars'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        num_tags = len(indices) + 1
    with Path(params['chars']).open() as f:
        num_chars = sum(1 for _ in f) + params['num_oov_buckets']

    # Char Embeddings
    char_ids = vocab_chars.lookup(chars)
    variable = tf.get_variable(
        'chars_embeddings', [num_chars, params['dim_chars']], tf.float32)
    char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
    char_embeddings = tf.layers.dropout(char_embeddings, rate=dropout,
                                        training=training)

    # Char LSTM
    dim_words = tf.shape(char_embeddings)[1]
    dim_chars = tf.shape(char_embeddings)[2]
    flat = tf.reshape(char_embeddings, [-1, dim_chars, params['dim_chars']])
    t = tf.transpose(flat, perm=[1, 0, 2])
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    _, (_, output_fw) = lstm_cell_fw(t, dtype=tf.float32,
                                     sequence_length=tf.reshape(nchars, [-1]))
    _, (_, output_bw) = lstm_cell_bw(t, dtype=tf.float32,
                                     sequence_length=tf.reshape(nchars, [-1]))
    output = tf.concat([output_fw, output_bw], axis=-1)
    char_embeddings = tf.reshape(output, [-1, dim_words, 50])

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    variable = np.vstack([glove, [[0.] * params['dim']]])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    word_embeddings = tf.nn.embedding_lookup(variable, word_ids)

    # Concatenate Word and Char Embeddings
    embeddings = tf.concat([word_embeddings, char_embeddings], axis=-1)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])  # Need time-major
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {
            'pred_ids': pred_ids,
            'tags': pred_strings
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        for metric_name, op in metrics.items():
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)
        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer().minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op)

if __name__ == '__main__':
    # Params
    params = {
        'dim': 300,
        'dim_chars': 100,
        'dropout': 0.5,
        'num_oov_buckets': 1,
        'epochs': 25,
        'batch_size': 20,
        'buffer': 30000000,
        'char_lstm_size': 25,
        'lstm_size': 100,
        'words': str(Path(DATADIR, 'vocab.words.txt')),
        'chars': str(Path(DATADIR, 'vocab.chars.txt')),
        'tags': str(Path(DATADIR, 'vocab.tags.txt')),
        'glove': str(Path(DATADIR, 'glove.npz'))
    }
    with Path('results1/params.json').open('w') as f:
        json.dump(params, f, indent=4, sort_keys=True)
This is the word-embedding snippet (quoted again) where the error occurs:
# Word Embeddings
word_ids = vocab_words.lookup(words)
glove = np.load(params['glove'])['embeddings']  # np.array
variable = np.vstack([glove, [[0.] * params['dim']]])
variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
word_embeddings = tf.nn.embedding_lookup(variable, word_ids)
Hope this is not too late for you.
I have been googling this issue for a while; hopefully I got to the root of it, and it turns out it is quite simple. Similar unsolved issues are here and here.
Chances are you have seen an example of this embeddings code somewhere and tried to follow it (this was the case for me). However, coders and TensorFlow assume that the ids for the inputs are sequential, i.e. if you have 1000 items, then your ids are [0, 1, 2, ..., 998, 999].
However, this is usually not the case with real data, where ids are something like "xYzVryCmplxNm5m3r" (here it will give an error right away, because there are characters in the id and TensorFlow only accepts integers), or, in the more subtle case that is probably yours, the ids are integers but not sequential. For example, they can go like: ids = [68632548, 15323, ...].
In this case, TensorFlow will accept the input data (because it consists of integers, as expected) and give you this error, because the numbers are not sequential and are actually much larger than the number of unique ids (this number + 1 is usually set as the limit for the vocab size).
The solution that worked for me was to map all the id values in the original dataframe to sequential ids, preserving their uniqueness, and then feed in the same data again (it actually worked!).
The code could be something like:
unique_ids = np.unique(old_ids)
sequential_ids = [i for i in range(len(unique_ids))]
id_mapping_dict = dict(zip(unique_ids, sequential_ids))

def map_ids_to_sequential(original_id):
    return id_mapping_dict[original_id]

df['ids'] = df['ids'].apply(map_ids_to_sequential)

Implementing an LSTM regression model with TensorFlow

I am trying to implement a TensorFlow LSTM regression model for a list of input numbers.
Example:
input_data = [1, 2, 3, 4, 5]
time_steps = 2
-> X == [[1, 2], [2, 3], [3, 4]]
-> y == [3, 4, 5]
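For clarity, windows like these can be built with a small helper (illustrative; not the loader used in the code below):
import numpy as np

def make_windows(series, time_steps):
    # Each window of `time_steps` values predicts the next value.
    X = [series[i:i + time_steps] for i in range(len(series) - time_steps)]
    y = series[time_steps:]
    return np.array(X), np.array(y)

X, y = make_windows([1, 2, 3, 4, 5], time_steps=2)
# X == [[1, 2], [2, 3], [3, 4]], y == [3, 4, 5]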
The code is below:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error  # assumed source of mean_squared_error

TIMESTEPS = 20
num_hidden = 20

Xd, yd = load_data()
train_input = Xd['train']
train_input = train_input.reshape(-1, 20, 1)
train_output = yd['train']
# train_input = [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],..
# train_output = [[21],[22],[23]....
test_input = Xd['test']
test_output = yd['test']

X = tf.placeholder(tf.float32, [None, 20, 1])
y = tf.placeholder(tf.float32, [None, 1])

cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
val = tf.Print(val, [tf.argmax(val, 1)], 'argmax(val)=', summarize=20, first_n=7)
val = tf.transpose(val, [1, 0, 2])
val = tf.Print(val, [tf.argmax(val, 1)], 'argmax(val2)=', summarize=20, first_n=7)

# Take only the last output after 20 time steps
last = tf.gather(val, int(val.get_shape()[0]) - 1)
last = tf.Print(last, [tf.argmax(last, 1)], 'argmax(val3)=', summarize=20, first_n=7)

# define variables for weights and bias
weight = tf.Variable(tf.truncated_normal([num_hidden, int(y.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[y.get_shape()[1]]))

# Prediction is matmul of the last value with the weight, plus the bias
prediction = tf.matmul(last, weight) + bias

# Cost function using softmax
# y is the true distribution and prediction is the predicted one
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(prediction), reduction_indices=[1]))
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))

optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cost)

from tensorflow.python import debug as tf_debug
inita = tf.initialize_all_variables()
sess = tf.Session()
sess.run(inita)

batch_size = 100
no_of_batches = int(len(train_input) / batch_size)
epoch = 10
test_size = 100
for i in range(epoch):
    for start, end in zip(range(0, len(train_input), batch_size),
                          range(batch_size, len(train_input) + 1, batch_size)):
        sess.run(minimize, feed_dict={X: train_input[start:end], y: train_output[start:end]})
    test_indices = np.arange(len(test_input))  # Get a test batch
    np.random.shuffle(test_indices)
    test_indices = test_indices[0:test_size]
    print(i, mean_squared_error(np.argmax(test_output[test_indices], axis=1),
                                sess.run(prediction, feed_dict={X: test_input[test_indices]})))

print("predictions", prediction.eval(feed_dict={X: train_input}, session=sess))
y_pred = prediction.eval(feed_dict={X: test_input}, session=sess)
sess.close()

test_size = test_output.shape[0]
ax = np.arange(0, test_size, 1)
plt.plot(ax, test_output, 'r', ax, y_pred, 'b')
plt.show()
But I am not able to minimize the cost; the calculated MSE increases at each step instead of decreasing.
I suspect there is a problem with the cost function I am using.
Any thoughts or suggestions as to what I am doing wrong?
Thanks
As mentioned in the comments, you need to change your loss function to MSE and reduce your learning rate. Is your error converging to zero?
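Concretely, that change would look something like this (a sketch; the learning-rate value is an assumption):
# Replace the cross-entropy cost with mean squared error and lower the LR.
cost = tf.reduce_mean(tf.square(y - prediction))
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
minimize = optimizer.minimize(cost)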