Tensorflow Estimator error: incorrect checksum for freed object - object was probably modified after being freed - tensorflow

I am training a TF Boosted Trees estimator, which is giving me an error:
incorrect checksum for freed object - object was probably modified after being freed.
On CloudML I get:
Command '['python', '-m', u'trainer.model', u'--job-type', u'remote', '--job-dir', u'gs://xxx']' died with signal 7.
after ~1 hour of training. I am training on the CPU. This suggests that there is a memory leak, but I'm only using TF code, so I'm not sure what is going wrong.
My code is as follows:
def build_training_input_fn():
    """Build the training ``input_fn`` that streams parsed TFRecord batches.

    Returns:
        A function ``input_fn(file_pattern)`` that returns a
        ``tf.data.Dataset`` yielding ``(features, labels)`` batches of up to
        ``BATCH_SIZE`` records.
    """

    def parse_record(record):
        """Parse one serialized example into a ``(features, label)`` pair."""
        transformed_feature_spec = transformed_metadata.schema.as_feature_spec()
        transformed_features = tf.parse_single_example(
            record, transformed_feature_spec)
        # Hook for dropping feature columns before training; empty for now.
        cols_to_remove = []
        transformed_labels = transformed_features.pop(LABEL_KEY)
        transformed_features = {
            key: value
            for key, value in transformed_features.items()
            if key not in cols_to_remove
        }
        return transformed_features, transformed_labels

    def input_fn(file_pattern):
        """Read every file matching ``file_pattern`` and batch parsed records.

        NOTE(review): this function requires ``file_pattern``; confirm how the
        caller binds it before handing the function to the estimator.
        """
        files = tf.data.Dataset.list_files(file_pattern=file_pattern)
        # Dataset transformations return a NEW dataset. The original code
        # discarded this result and then used an undefined `dataset` name,
        # so the interleaved record stream must be bound here.
        dataset = files.apply(
            tf.data.experimental.parallel_interleave(
                lambda filename: tf.data.TFRecordDataset(filename),
                cycle_length=32,
                block_length=1,
                sloppy=True,
            ))
        dataset = dataset.apply(tf.data.experimental.map_and_batch(
            map_func=parse_record, batch_size=BATCH_SIZE, drop_remainder=False,
            num_parallel_batches=1))
        return dataset

    return input_fn
# Construct the boosted-trees estimator.
# NOTE(review): no constructor arguments are shown here; presumably they were
# omitted from the post -- confirm against the real training script.
classifier = tf.estimator.BoostedTreesClassifier()
# Obtain the input function produced by the factory.
input_fn_train = build_training_input_fn()
# NOTE(review): the produced input_fn is declared as input_fn(file_pattern);
# confirm how file_pattern is supplied when the estimator invokes it.
classifier.train(input_fn=input_fn_train)
where I am reading TF records created by Apache Beam.
I'm not sure how it's possible to get such errors, I know the data in TF records is good, and I can train XGB/Catboost using the same set.
Can anyone help?

Related

How to avoid memory leakage in an autoregressive model within tensorflow

Recently, I have been training an LSTM with an attention mechanism for regression in TensorFlow 2.9, and I ran into a problem during training with model.fit():
At the beginning, the training time is okay, like 7s/step. However, it was increasing during the process and after several steps, like 1000, the value might be 50s/step. Here below is a part of the code for my model:
class AttentionModel(tf.keras.Model):
    """Encoder/decoder model that unrolls the decoder for six steps.

    Each decoder step receives the logits produced by the previous step
    (zeros for the first step) together with the encoder output, and the
    per-step logits are concatenated along axis 1.
    """

    def __init__(self, encoder_output_dim, dec_units, dense_dim, batch):
        super().__init__()
        self.dense_dim = dense_dim
        self.batch = batch
        self.encoder = Encoder(encoder_output_dim)
        self.decoder = Decoder(dec_units, dense_dim)

    def call(self, inputs):
        """Run the encoder once, then feed the decoder its own output six times."""
        encoder_output, encoder_state = self.encoder(inputs)

        # Per-step logits, collected in order and concatenated at the end.
        step_logits = []
        # The first decoder step has no previous prediction: start from zeros.
        previous_logits = np.zeros((self.batch, 1, 1))
        state = encoder_state
        for _ in range(6):
            step_input = DecoderInput(
                new_features=previous_logits,
                enc_output=encoder_output,
            )
            step_result, state = self.decoder(step_input, state)
            previous_logits = step_result.logits
            step_logits.append(previous_logits)

        return tf.concat(step_logits, 1)
In the official documents for tf.function, I notice: "Don't rely on Python side effects like object mutation or list appends".
Since I use a dynamic Python list with append() to record the intermediate variables, I guess a new tf.Graph is added each time during training. Is this the reason my training is getting slower and slower?
Additionally, what should I use instead of python list to avoid this? I have tried with a numpy.zeros matrix but it will lead to another problem:
tempt = np.zeros(shape=(1,6))
...
for i in range(6):
dec_inputs = DecoderInput(new_features=new_features, enc_output=encoder_output)
dec_result, dec_state = self.decoder(dec_inputs, dec_initial_state)
tempt[i]=(dec_result.logits)
...
Cannot convert a symbolic tf.Tensor (decoder/dense_3/BiasAdd:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported.

ValueError when trying to fine-tune GPT-2 model in TensorFlow

I am encountering a ValueError in my Python code when trying to fine-tune Hugging Face's distribution of the GPT-2 model. Specifically:
ValueError: Dimensions must be equal, but are 64 and 0 for
'{{node Equal_1}} = Equal[T=DT_FLOAT, incompatible_shape_error=true](Cast_18, Cast_19)'
with input shapes: [64,0,1024], [2,0,12,1024].
I have around 100 text files that I concatenate into a string variable called raw_text and then pass into the following function to create training and testing TensorFlow datasets:
def to_datasets(raw_text):
    """Turn raw text into shuffled, batched training and testing datasets.

    The text is cut into consecutive chunks of ``SEQ_LEN`` characters (a
    trailing partial chunk is dropped), each chunk is tokenized with the GPT-2
    tokenizer, and the first ``TRAIN_PERCENT`` of the chunks become the
    training set while the rest become the testing set.

    Args:
        raw_text: The full corpus as a single string.

    Returns:
        A ``(trn_dataset, tst_dataset)`` tuple of ``tf.data.Dataset`` objects
        whose elements are ``(input, target)`` pairs, where the target is the
        input shifted left by one token.
    """
    # split the raw text in smaller sequences
    seqs = [
        raw_text[SEQ_LEN * i:SEQ_LEN * (i + 1)]
        for i in range(len(raw_text) // SEQ_LEN)
    ]

    # set up Hugging Face GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # tokenize the character sequences
    # With return_tensors="tf" each call returns a batch of shape
    # (1, max_length). Taking row 0 yields a 1-D token sequence; without it,
    # `x[:-1]` below slices the size-1 batch axis and produces EMPTY tensors
    # (the zero-sized dimension seen in the reported shape error).
    # truncation=True keeps every sequence at the same length so they stack.
    tokenized_seqs = [
        tokenizer(
            seq,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )["input_ids"][0]
        for seq in seqs
    ]

    # convert tokenized sequences into TensorFlow datasets
    split_at = int(len(tokenized_seqs) * TRAIN_PERCENT)
    trn_seqs = tf.data.Dataset.from_tensor_slices(tokenized_seqs[:split_at])
    tst_seqs = tf.data.Dataset.from_tensor_slices(tokenized_seqs[split_at:])

    def input_and_target(x):
        """Pair a token sequence with itself shifted one position left."""
        return x[:-1], x[1:]

    # map into (input, target) tuples, shuffle order of elements, and batch
    trn_dataset = trn_seqs.map(input_and_target) \
        .shuffle(SHUFFLE_BUFFER_SIZE) \
        .batch(BATCH_SIZE, drop_remainder=True)
    tst_dataset = tst_seqs.map(input_and_target) \
        .shuffle(SHUFFLE_BUFFER_SIZE) \
        .batch(BATCH_SIZE, drop_remainder=True)
    return trn_dataset, tst_dataset
I then try to train my model, calling train_model(*to_datasets(raw_text)):
def train_model(trn_dataset, tst_dataset):
    """Load the pretrained GPT-2 weights and fit them on the given datasets.

    Args:
        trn_dataset: Dataset used for training.
        tst_dataset: Dataset passed to ``fit`` as validation data.
    """
    # import Hugging Face GPT-2 model
    # NOTE(review): confirm that TFGPT2Model (rather than a variant with a
    # language-modeling head) is the intended class for this loss.
    gpt2 = TFGPT2Model.from_pretrained("gpt2")

    adam = tf.keras.optimizers.Adam()
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy = tf.metrics.SparseCategoricalAccuracy()
    gpt2.compile(optimizer=adam, loss=cross_entropy, metrics=accuracy)

    gpt2.fit(
        trn_dataset,
        validation_data=tst_dataset,
        epochs=EPOCHS,
        initial_epoch=0,
    )
The ValueError is triggered on the model.fit() call. The variables in all-caps are settings pulled in from a JSON file. Currently, they are set to:
{
"BATCH_SIZE":64,
"SHUFFLE_BUFFER_SIZE":10000,
"EPOCHS":500,
"SEQ_LEN":2048,
"TRAIN_PERCENT":0.9
}
Any information regarding what this error means or ideas on how to resolve it would be greatly appreciated. Thank you!
I'm having the same problem but when I change the batch size to 12 (same as n_layer parameter in the gpt-2 config file) it works.
I don't know why it works, but you can try it.
If you manage to solve it in a different way, I will be glad to hear about it.

How to fix "Retval[0] has already been set" when serving saved model

I have a working SavedModel (ie. a saved model that works when restored in python) that fails when run on tensorflow serving.
The error message on the server is:
OP_REQUIRES failed at function_ops.cc:68 : Internal: Retval[0] has already been set.
The REST API returns 500 and specifies the node on the graph:
[[{{node _retval_loop/concat_0_0}}]
Exact Steps to Reproduce
Link to the saved model: https://drive.google.com/file/d/1at1CQ9iHgcPHCn-MkvSGcgtbVM2lrKJn/view. It can be restored and run in Python successfully, but it throws an error when run on a model server. It takes an image as input:
sess.run(fetches=["loop/Exit_1:0"],feed_dict={"image_bytes:0": image})
Source code / logs
Relevant source code(I hope):
(contains a while loop with a concat in the body)
val, idx =tf.nn.top_k(softmax ,name="topk")
sentence = tf.Variable([vocab.start_id],False,name="sentence",)
sentence = tf.concat([sentence, idx[0]], 0)#
def cond(sentence,state):
    """Loop predicate: keep going while the last token is not the end id."""
    last_token = sentence[-1]
    end_token = tf.constant(vocab.end_id)
    return tf.math.not_equal(last_token, end_token)
def body(sentence, state):
    """Run one decoding step: feed the last token and append the top-1 token.

    Args:
        sentence: 1-D tensor of the token ids generated so far.
        state: The LSTM state with its two halves concatenated along axis 1.

    Returns:
        ``[sentence, state]`` with the newly predicted token appended to
        ``sentence`` and ``state`` replaced by the updated concatenated state.
    """
    input_seqs = tf.expand_dims([sentence[-1]], 1)
    seq_embeddings = tf.nn.embedding_lookup(self.embedding_map, input_seqs)

    # In inference mode, use concatenated states for convenient feeding and
    # fetching. (The commented-out placeholder that used to follow was
    # line-wrapped so that part of it became live code; it has been removed.)
    state_feed = tf.concat(axis=1, values=state, name="state")
    state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1)

    # Run a single LSTM step.
    lstm_outputs, new_state_tuple = lstm_cell(
        inputs=tf.squeeze(seq_embeddings, axis=[1]),
        state=state_tuple)

    # Concatenate the resulting state.
    state = tf.concat(axis=1, values=new_state_tuple, name="state")

    # Stack batches vertically.
    lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])

    # NOTE(review): the original indentation was lost; only the
    # fully_connected call is assumed to live inside the variable scope.
    with tf.variable_scope("logits") as logits_scope:
        logits = tf.contrib.layers.fully_connected(
            inputs=lstm_outputs,
            num_outputs=self.config.vocab_size,
            activation_fn=None,
            weights_initializer=self.initializer,
            scope=logits_scope, reuse=True
        )

    softmax = tf.nn.softmax(logits, name="softmax")
    self.softmax = softmax
    # Only the index of the most likely token is needed, not its probability.
    _, idx = tf.nn.top_k(softmax, name="topk")
    sentence = tf.concat([sentence, idx[0]], 0)
    self.output = sentence
    return [sentence, state]
out = tf.while_loop(cond, body, [sentence, state],parallel_iterations=1,maximum_iterations=20,name="loop",shape_invariants=[tf.TensorShape([None]),tf.TensorShape([None,None])])
return out
fails with error:
W external/org_tensorflow/tensorflow/core/framework/op_kernel.cc:1401] OP_REQUIRES failed at function_ops.cc:68 : Internal: Retval[0] has already been set.
It could be the output nodes in sess.run contains node types that contain Enter, Merge, LoopCond, Switch, Exit, Less, etc.

Tensorflow: Bug when using `tf.contrib.metrics.streaming_mean_iou`

I'm getting a strange error when trying to compute the intersection over union using tensorflows tf.contrib.metrics.streaming_mean_iou.
This was the code I was using before which works perfectly fine
import tensorflow as tf

# Decode the label image as a single-channel PNG and flatten it to 1-D.
label = tf.image.decode_png(tf.read_file('/path/to/label.png'), channels=1)
label_lin = tf.reshape(label, [-1,])
# Weight is 1 where the label value is <= 10 and 0 elsewhere.
weights = tf.cast(tf.less_equal(label_lin, 10), tf.int32)
# Labels are compared against themselves, weighted by `weights`.
mIoU, update_op = tf.contrib.metrics.streaming_mean_iou(
    label_lin, label_lin, num_classes=11, weights=weights)
# Initialize local variables before running the update op.
init = tf.local_variables_initializer()
# NOTE(review): `sess` is not created in this snippet; it is assumed to be an
# existing session -- confirm against the full script.
sess.run(init)
sess.run([update_op])
However when I use a mask like this
# Decode the mask image as a single-channel PNG.
mask = tf.image.decode_png(
    tf.read_file('/path/to/mask_file.png'),
    channels=1,
)
# Flatten the mask to 1-D and cast it to int32 so it can serve as weights.
mask_lin = tf.cast(tf.reshape(mask, [-1,]), tf.int32)
# Same metric as before, but weighted by the decoded mask.
mIoU, update_op = tf.contrib.metrics.streaming_mean_iou(
    label_lin,
    label_lin,
    num_classes=11,
    weights=mask_lin,
)
# Initialize local variables, then run one metric update.
init = tf.local_variables_initializer()
sess.run(init)
sess.run([update_op])
It keeps on failing after an irregular number of iterations showing this error:
*** Error in `/usr/bin/python': corrupted double-linked list: 0x00007f29d0022fd0 ***
I checked the shape and data type of both mask_lin and weights. They are the same, so I cannot really see what is going wrong here.
Also the fact that the error comes after calling update_op an irregular number of times is strange. Maybe TF empties the mask_lin object after calling several sess.run()'s ?
Or is this some TF bug? But then again why would it work with weights...

"Output 0 of type double does not match declared output type string" while running the iris sample program in TensorFlow Serving

I am running the sample iris program in TensorFlow Serving. Since it is a TF.Learn model, I am exporting the model using the following classifier.export(export_dir=model_dir,signature_fn=my_classification_signature_fn) and the signature_fn is defined as shown below:
def my_classification_signature_fn(examples, unused_features, predictions):
    """Build the classification signature and named graph signatures.

    Args:
      examples: `Tensor` of input examples.
      unused_features: `dict` of `Tensor`s (not used).
      predictions: `Tensor`, or a dict of tensors holding the classes tensor
        under the key 'classes'.

    Returns:
      Tuple of the default classification signature and a dict of named
      graph signatures with 'inputs' and 'outputs' entries.

    Raises:
      ValueError: If examples is `None`.
    """
    if examples is None:
        raise ValueError('examples cannot be None when using this signature fn.')

    # A dict of predictions carries the classes tensor under 'classes';
    # otherwise the predictions tensor itself is used as the classes tensor.
    classes = predictions['classes'] if isinstance(predictions, dict) else predictions
    default_signature = exporter.classification_signature(
        examples, classes_tensor=classes)

    named_graph_signatures = {
        'inputs': exporter.generic_signature({'x_values': examples}),
        'outputs': exporter.generic_signature({'preds': predictions}),
    }
    return default_signature, named_graph_signatures
The model gets successfully exported using the following piece of code.
I have created a client which makes real-time predictions using TensorFlow Serving.
The following is the code for the client:
# Command-line flags for the client.
# NOTE(review): the first flag uses `flags.` while the others use
# `tf.app.flags.`; `flags` and `FLAGS` are not defined in this snippet --
# presumably they are set up elsewhere; confirm.
flags.DEFINE_string("model_dir", "/tmp/iris_model_dir", "Base directory for output models.")
tf.app.flags.DEFINE_integer('concurrency', 1,
'maximum number of concurrent inference requests')
tf.app.flags.DEFINE_string('server', '', 'PredictionService host:port')
# Open an insecure gRPC channel to the host:port given by --server and
# create the PredictionService stub on it.
host, port = FLAGS.server.split(':')
channel = implementations.insecure_channel(host, int(port))
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
# Build a single flower sample with four values.
# dtype=float makes this a float64 (double) array.
new_samples = np.array([5.8, 3.1, 5.0, 1.7], dtype=float)
# Assemble the Predict request for the model named 'iris'.
request = predict_pb2.PredictRequest()
request.model_spec.name = 'iris'
# NOTE(review): the sample is sent as a double tensor under "x_values";
# verify that this matches the input dtype declared by the exported signature.
request.inputs["x_values"].CopyFrom(
tf.contrib.util.make_tensor_proto(new_samples))
result = stub.Predict(request, 10.0) # 10 secs timeout
However, on making the predictions, the following error is displayed:
grpc.framework.interfaces.face.face.AbortionError: AbortionError(code=StatusCode.INTERNAL, details="Output 0 of type double does not match declared output type string for node _recv_input_example_tensor_0 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=2016246895612781641, tensor_name="input_example_tensor:0", tensor_type=DT_STRING, _device="/job:localhost/replica:0/task:0/cpu:0"]()")
Here is the entire stack trace.
enter image description here
The iris model is defined in the following manner:
# Specify that all features have real-value data
# All four input measurements are treated as one real-valued feature column.
feature_columns = [
    tf.contrib.layers.real_valued_column("", dimension=4),
]
# DNN classifier with hidden layers of 10, 20 and 10 units and three classes.
classifier = tf.contrib.learn.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[10, 20, 10],
    n_classes=3,
    model_dir=model_dir,
)
# Train on the training set for 2000 steps.
classifier.fit(
    x=training_set.data,
    y=training_set.target,
    steps=2000,
)
Kindly suggest a solution for this error.
I think the problem is that your signature_fn is going on the else branch and passing predictions as the output to the classification signature, which expects a string output and not a double output. Either use a regression signature function or add something to the graph to get the output in the form of a string.