UnimplementedError while training on TPU in Colab - tensorflow

When trying to train my model on a TPU in Colab with
model.fit(train_dataset,
          steps_per_epoch=len(df_train) // config.BATCH_SIZE,
          validation_data=valid_dataset,
          epochs=config.EPOCHS)
I got this error, with the whole traceback:
UnimplementedError Traceback (most recent call last)
<ipython-input-37-92afbe2b5ae5> in <module>()
2 steps_per_epoch = len(df_train) // config.BATCH_SIZE,
3 validation_data = valid_dataset,
----> 4 epochs = config.EPOCHS)
13 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1186 logs = tmp_logs # No error, now safe to assign to logs.
1187 end_step = step + data_handler.step_increment
-> 1188 callbacks.on_train_batch_end(end_step, logs)
1189 if self.stop_training:
1190 break
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in on_train_batch_end(self, batch, logs)
455 """
456 if self._should_call_train_batch_hooks:
--> 457 self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
458
459 def on_test_batch_begin(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _call_batch_hook(self, mode, hook, batch, logs)
315 self._call_batch_begin_hook(mode, batch, logs)
316 elif hook == 'end':
--> 317 self._call_batch_end_hook(mode, batch, logs)
318 else:
319 raise ValueError('Unrecognized hook: {}'.format(hook))
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _call_batch_end_hook(self, mode, batch, logs)
335 self._batch_times.append(batch_time)
336
--> 337 self._call_batch_hook_helper(hook_name, batch, logs)
338
339 if len(self._batch_times) >= self._num_batches_for_timing_check:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _call_batch_hook_helper(self, hook_name, batch, logs)
373 for callback in self.callbacks:
374 hook = getattr(callback, hook_name)
--> 375 hook(batch, logs)
376
377 if self._check_timing:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in on_train_batch_end(self, batch, logs)
1027
1028 def on_train_batch_end(self, batch, logs=None):
-> 1029 self._batch_update_progbar(batch, logs)
1030
1031 def on_test_batch_end(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _batch_update_progbar(self, batch, logs)
1099 if self.verbose == 1:
1100 # Only block async when verbose = 1.
-> 1101 logs = tf_utils.sync_to_numpy_or_python_type(logs)
1102 self.progbar.update(self.seen, list(logs.items()), finalize=False)
1103
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/utils/tf_utils.py in sync_to_numpy_or_python_type(tensors)
517 return t # Don't turn ragged or sparse tensors to NumPy.
518
--> 519 return nest.map_structure(_to_single_numpy_or_python_type, tensors)
520
521
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/utils/tf_utils.py in _to_single_numpy_or_python_type(t)
513 def _to_single_numpy_or_python_type(t):
514 if isinstance(t, ops.Tensor):
--> 515 x = t.numpy()
516 return x.item() if np.ndim(x) == 0 else x
517 return t # Don't turn ragged or sparse tensors to NumPy.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in numpy(self)
1092 """
1093 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
-> 1094 maybe_arr = self._numpy() # pylint: disable=protected-access
1095 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
1096
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1060 return self._numpy_internal()
1061 except core._NotOkStatusException as e: # pylint: disable=protected-access
-> 1062 six.raise_from(core._status_to_exception(e.code, e.message), None) # pylint: disable=protected-access
1063
1064 #property
/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)
UnimplementedError: 9 root error(s) found.
(0) Unimplemented: {{function_node __inference_train_function_88574}} Asked to propagate a dynamic dimension from hlo convolution.24975#{}#2 to hlo %all-reduce.24980 = f32[3,3,<=3,32]{3,2,1,0} all-reduce(f32[3,3,<=3,32]{3,2,1,0} %convolution.24975), replica_groups={{0,1,2,3,4,5,6,7}}, to_apply=%sum.24976, metadata={op_type="CrossReplicaSum" op_name="while/body/_1/while/Adam/CrossReplicaSum"}, which is not implemented.
[[{{node TPUReplicate/_compile/_18168620323984915962/_4}}]]
[[while/body/_1/while/strided_slice_1/_253]]
(1) Unimplemented: {{function_node __inference_train_function_88574}} Asked to propagate a dynamic dimension from hlo convolution.24975#{}#2 to hlo %all-reduce.24980 = f32[3,3,<=3,32]{3,2,1,0} all-reduce(f32[3,3,<=3,32]{3,2,1,0} %convolution.24975), replica_groups={{0,1,2,3,4,5,6,7}}, to_apply=%sum.24976, metadata={op_type="CrossReplicaSum" op_name="while/body/_1/while/Adam/CrossReplicaSum"}, which is not implemented.
[[{{node TPUReplicate/_compile/_18168620323984915962/_4}}]]
[[TPUReplicate/_compile/_18168620323984915962/_4/_243]]
(2) Unimplemented: {{function_node __inference_train_function_88574}} Asked to propagate a dynamic dimension from hlo convolution.24975#{}#2 to hlo %all-reduce.24980 = f32[3,3,<=3,32]{3,2,1,0} all-reduce(f32[3,3,<=3,32]{3,2,1,0} %convolution.24975), replica_groups={{0,1,2,3,4,5,6,7}}, to_apply=%sum.24976, metadata={op_type="CrossReplicaSum" op_name="while/body/_1/while/Adam/CrossReplicaSum"}, which is not implemented.[truncated]
Things that I have checked:
My data is in a GCS bucket and can be retrieved using the dataset object I created.
My model definition:
with strategy.scope():
    base_model = efn.EfficientNetB0(include_top=False)
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(config.IMG_SIZE, config.IMG_SIZE, 3)),
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(5, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=config.LR),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
                  steps_per_execution=32)
Any idea why this is happening? The error says a dynamic dimension was asked to propagate, but I don't think that should be the case, considering the model trained fine on a GPU (with the data present in the current session).
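One note on reading the error: the <=3 in the HLO shape f32[3,3,<=3,32] means XLA sees one of the dimensions as dynamic (only bounded, not fixed), and on TPU that usually traces back to an input pipeline that does not produce fully static shapes. Below is a minimal sketch of a pipeline that pins the shapes, assuming the images are decoded inside a tf.data map function; raw_dataset, decode_image and the label handling are placeholders, not code from the question:
def decode_image(image_bytes, label):
    image = tf.io.decode_jpeg(image_bytes, channels=3)
    image = tf.image.resize(image, [config.IMG_SIZE, config.IMG_SIZE])
    image = tf.reshape(image, [config.IMG_SIZE, config.IMG_SIZE, 3])  # give the tensor a fully static shape
    return tf.cast(image, tf.float32) / 255.0, label

train_dataset = (raw_dataset  # placeholder: a dataset of (jpeg_bytes, label) pairs
                 .map(decode_image, num_parallel_calls=tf.data.AUTOTUNE)
                 .batch(config.BATCH_SIZE, drop_remainder=True)  # keep the batch dimension static as well
                 .prefetch(tf.data.AUTOTUNE))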

Related

ResourceExhaustedError: OOM when allocating tensor with shape[64,64,224,224]

Got a ResourceExhaustedError while training a deep learning model on an NVIDIA GeForce RTX 3050 Ti Laptop GPU using TensorFlow, with memory_limit: 1721342363.
I am training on 2870 images and it works well on the CPU, but on the GPU it seems to be restricted by the memory limit. Have I turned a memory limit on for my GPU, or do I have no option but to use my CPU?
It took 70 minutes on my CPU, which is why I chose to run on my GPU.
But while training with
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(training_set, validation_data=test_set, epochs=20, batch_size=32)
I got this error:
Epoch 1/20
1/45 [..............................] - ETA: 9:46 - loss: 1.8638 - accuracy: 0.1667
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
Cell In [4], line 3
1 #Compile the model
2 model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
----> 3 history = model.fit(training_set, validation_data=test_set, epochs=20, batch_size=32)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\keras\engine\training.py:1184, in Model.fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1177 with tf.profiler.experimental.Trace(
1178 'train',
1179 epoch_num=epoch,
1180 step_num=step,
1181 batch_size=batch_size,
1182 _r=1):
1183 callbacks.on_train_batch_begin(step)
-> 1184 tmp_logs = self.train_function(iterator)
1185 if data_handler.should_sync:
1186 context.async_wait()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\def_function.py:885, in Function.__call__(self, *args, **kwds)
882 compiler = "xla" if self._jit_compile else "nonXla"
884 with OptionalXlaContext(self._jit_compile):
--> 885 result = self._call(*args, **kwds)
887 new_tracing_count = self.experimental_get_tracing_count()
888 without_tracing = (tracing_count == new_tracing_count)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\def_function.py:917, in Function._call(self, *args, **kwds)
914 self._lock.release()
915 # In this case we have created variables on the first call, so we run the
916 # defunned version which is guaranteed to never create variables.
--> 917 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
918 elif self._stateful_fn is not None:
919 # Release the lock early so that multiple threads can perform the call
920 # in parallel.
921 self._lock.release()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:3039, in Function.__call__(self, *args, **kwargs)
3036 with self._lock:
3037 (graph_function,
3038 filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 3039 return graph_function._call_flat(
3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:1963, in ConcreteFunction._call_flat(self, args, captured_inputs, cancellation_manager)
1959 possible_gradient_type = gradients_util.PossibleTapeGradientTypes(args)
1960 if (possible_gradient_type == gradients_util.POSSIBLE_GRADIENT_TYPES_NONE
1961 and executing_eagerly):
1962 # No tape is watching; skip to running the function.
-> 1963 return self._build_call_outputs(self._inference_function.call(
1964 ctx, args, cancellation_manager=cancellation_manager))
1965 forward_backward = self._select_forward_and_backward_functions(
1966 args,
1967 possible_gradient_type,
1968 executing_eagerly)
1969 forward_function, args_with_tangents = forward_backward.forward()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:591, in _EagerDefinedFunction.call(self, ctx, args, cancellation_manager)
589 with _InterpolateFunctionError(self):
590 if cancellation_manager is None:
--> 591 outputs = execute.execute(
592 str(self.signature.name),
593 num_outputs=self._num_outputs,
594 inputs=args,
595 attrs=attrs,
596 ctx=ctx)
597 else:
598 outputs = execute.execute_with_cancellation(
599 str(self.signature.name),
600 num_outputs=self._num_outputs,
(...)
603 ctx=ctx,
604 cancellation_manager=cancellation_manager)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\execute.py:59, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
ResourceExhaustedError: OOM when allocating tensor with shape[64,64,224,224] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[node model/block1_conv2/Relu (defined at \AppData\Local\Temp\ipykernel_11956\3538519329.py:3) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_1205]
Function call stack:
train_function
I might not be entirely right on the details, but the training process looks like this from the GPU-memory perspective:
the network is loaded into the GPU memory
a batch is loaded into the GPU memory
trainable parameters are computed on the GPU
So there are several ways to reduce GPU memory usage:
Reduce the number of trainable parameters so each training step requires less computation. You can do this either by reducing the size of the net or by making some of the layers non-trainable if the net is pretrained (tf.keras.applications has a lot of networks pretrained on ImageNet).
To make layers of a Functional net (which the above-mentioned nets are) untrainable, you switch their .trainable attribute to False.
Say, to freeze the first 100 layers you need to:
ind_to_freeze = 100
for layer in net.layers[:ind_to_freeze]:
    layer.trainable = False
Reduce the image shapes so each batch requires less memory. Generally, this will lower the accuracy.
Reduce the batch size so each batch requires less memory.
Also, I see you've loaded all 2870 images into (CPU) memory at once. I recommend using a generator that takes filenames and reads the images on the fly instead; you can read about building native tf.data.Datasets here, and there is a rough sketch below.
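A minimal sketch of such a pipeline, assuming JPEG files and a parallel list of integer labels; image_paths, labels and IMG_SIZE are placeholders rather than names from your code:
def load_image(path, label):
    image = tf.io.read_file(path)                      # read one file per example, on demand
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    return image, label

dataset = (tf.data.Dataset.from_tensor_slices((image_paths, labels))
           .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
           .batch(32)
           .prefetch(tf.data.AUTOTUNE))
This way only one batch of decoded images is held in memory at a time instead of all 2870.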

(0) Unavailable: {{function_node __inference_train_function_53748}}

I am working on Colab and using a TPU, but unfortunately it does not work properly and the model fails while fitting.
Here is my code:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
strategy = tf.distribute.TPUStrategy(resolver)
with strategy.scope():
    model = create_model(input_shape=(HEIGHT, WIDTH, CANAL), n_out=N_CLASSES)
    for layer in model.layers:
        layer.trainable = False
    for i in range(-5, 0):
        model.layers[i].trainable = True

    es = EarlyStopping(monitor='val_loss', mode='min', patience=ES_PATIENCE,
                       restore_best_weights=True, verbose=1)
    rlrop = ReduceLROnPlateau(monitor='val_loss', mode='min', patience=RLROP_PATIENCE,
                              factor=DECAY_DROP, min_lr=1e-6, verbose=1)
    callback_list = [es, rlrop]

    optimizer = optimizers.Adam(lr=LEARNING_RATE)
    model.compile(optimizer=optimizers.Adam(lr=WARMUP_LEARNING_RATE),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()

STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

history_finetunning = model.fit_generator(generator=train_generator,
                                          steps_per_epoch=STEP_SIZE_TRAIN,
                                          epochs=EPOCHS,
                                          validation_data=valid_generator,
                                          validation_steps=STEP_SIZE_VALID,
                                          verbose=1)
And this is the error:
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py:1915: UserWarning: `Model.fit_generator` is deprecated and will be removed in a future version. Please use `Model.fit`, which supports generators.
warnings.warn('`Model.fit_generator` is deprecated and '
Epoch 1/40
---------------------------------------------------------------------------
UnavailableError Traceback (most recent call last)
<ipython-input-41-1c157bad2449> in <module>()
4 validation_data=valid_generator,
5 validation_steps=STEP_SIZE_VALID,
----> 6 verbose =1)
14 frames
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1930 use_multiprocessing=use_multiprocessing,
1931 shuffle=shuffle,
-> 1932 initial_epoch=initial_epoch)
1933
1934 def evaluate_generator(self,
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1161 logs = tmp_logs # No error, now safe to assign to logs.
1162 end_step = step + data_handler.step_increment
-> 1163 callbacks.on_train_batch_end(end_step, logs)
1164 if self.stop_training:
1165 break
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in on_train_batch_end(self, batch, logs)
434 """
435 if self._should_call_train_batch_hooks:
--> 436 self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
437
438 def on_test_batch_begin(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_hook(self, mode, hook, batch, logs)
276 self._call_batch_begin_hook(mode, batch, logs)
277 elif hook == 'end':
--> 278 self._call_batch_end_hook(mode, batch, logs)
279 else:
280 raise ValueError('Unrecognized hook: {}'.format(hook))
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_end_hook(self, mode, batch, logs)
296 self._batch_times.append(batch_time)
297
--> 298 self._call_batch_hook_helper(hook_name, batch, logs)
299
300 if len(self._batch_times) >= self._num_batches_for_timing_check:
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_hook_helper(self, hook_name, batch, logs)
336 hook = getattr(callback, hook_name)
337 if getattr(callback, '_supports_tf_logs', False):
--> 338 hook(batch, logs)
339 else:
340 if numpy_logs is None: # Only convert once.
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in on_train_batch_end(self, batch, logs)
1042
1043 def on_train_batch_end(self, batch, logs=None):
-> 1044 self._batch_update_progbar(batch, logs)
1045
1046 def on_test_batch_end(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _batch_update_progbar(self, batch, logs)
1106 if self.verbose == 1:
1107 # Only block async when verbose = 1.
-> 1108 logs = tf_utils.sync_to_numpy_or_python_type(logs)
1109 self.progbar.update(self.seen, list(logs.items()), finalize=False)
1110
/usr/local/lib/python3.7/dist-packages/keras/utils/tf_utils.py in sync_to_numpy_or_python_type(tensors)
505 return t # Don't turn ragged or sparse tensors to NumPy.
506
--> 507 return tf.nest.map_structure(_to_single_numpy_or_python_type, tensors)
508
509
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/keras/utils/tf_utils.py in _to_single_numpy_or_python_type(t)
501 def _to_single_numpy_or_python_type(t):
502 if isinstance(t, tf.Tensor):
--> 503 x = t.numpy()
504 return x.item() if np.ndim(x) == 0 else x
505 return t # Don't turn ragged or sparse tensors to NumPy.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in numpy(self)
1092 """
1093 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
-> 1094 maybe_arr = self._numpy() # pylint: disable=protected-access
1095 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
1096
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1060 return self._numpy_internal()
1061 except core._NotOkStatusException as e: # pylint: disable=protected-access
-> 1062 six.raise_from(core._status_to_exception(e.code, e.message), None) # pylint: disable=protected-access
1063
1064 #property
/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)
UnavailableError: 3 root error(s) found.
(0) Unavailable: {{function_node __inference_train_function_53748}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"#1626347736.544045826","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":5420,"referenced_errors":[{"created":"#1626347735.785465323","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNextAsOptional]]
[[cond_11/switch_pred/_107/_76]]
(1) Unavailable: {{function_node __inference_train_function_53748}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"#1626347736.544045826","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":5420,"referenced_errors":[{"created":"#1626347735.785465323","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNextAsOptional]]
[[cluster_train_function/_execute_2_0/_333]]
(2) Unavailable: {{function_node __inference_train_function_53748}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"#1626347736.544045826","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":5420,"referenced_errors":[{"created":"#1626347735.785465323","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNextAsOptional]]
0 successful operations.
6 derived errors ignored.
Using the same TPU configuration code with tfds from tensorflow_datasets and fitting the model with the model.fit method does not raise any error.
Model.fit_generator is deprecated and doesn't work on TPUs.
Try using tf.keras.preprocessing.image_dataset_from_directory or a tf.data.Dataset and combining it with Keras preprocessing layers; a sketch is shown below.
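A minimal sketch, assuming the images are organized one class per sub-directory; data_dir, HEIGHT, WIDTH and BATCH_SIZE are placeholders for your own values:
# builds tf.data.Dataset objects of (image, label) batches straight from the folder structure
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="training",
    seed=42,
    image_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="validation",
    seed=42,
    image_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE)

history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)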

Problem with Logits using Densenet121 in Tensorflow 2.4

I was trying to reproduce the Transfer learning and fine-tuning examples, adapting them to my problem, in my Colab with GPU. When I use a softmax in the last Dense layer, this error does not occur, but with 'from_logits=True' it does. My images are JPGs and they are divided into folders:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-85-7ea61d5df8ec> in <module>()
----> 1 loss0, accuracy0, auc0, precision0, recall0 = model.evaluate(val_ds)
5 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in evaluate(self, x, y, batch_size, verbose, sample_weight, steps, callbacks, max_queue_size, workers, use_multiprocessing, return_dict)
1387 with trace.Trace('test', step_num=step, _r=1):
1388 callbacks.on_test_batch_begin(step)
-> 1389 tmp_logs = self.test_function(iterator)
1390 if data_handler.should_sync:
1391 context.async_wait()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
826 tracing_count = self.experimental_get_tracing_count()
827 with trace.Trace(self._name) as tm:
--> 828 result = self._call(*args, **kwds)
829 compiler = "xla" if self._experimental_compile else "nonXla"
830 new_tracing_count = self.experimental_get_tracing_count()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
893 # If we did not create any variables the trace we have is good enough.
894 return self._concrete_stateful_fn._call_flat(
--> 895 filtered_flat_args, self._concrete_stateful_fn.captured_inputs) # pylint: disable=protected-access
896
897 def fn_with_cond(inner_args, inner_kwds, inner_filtered_flat_args):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1917 # No tape is watching; skip to running the function.
1918 return self._build_call_outputs(self._inference_function.call(
-> 1919 ctx, args, cancellation_manager=cancellation_manager))
1920 forward_backward = self._select_forward_and_backward_functions(
1921 args,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
558 inputs=args,
559 attrs=attrs,
--> 560 ctx=ctx)
561 else:
562 outputs = execute.execute_with_cancellation(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (Dense121/dense_1/BiasAdd:0) = ] [[0.853173912 1.97515857 0.608713508...]...] [y (Cast_4/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
(1) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (Dense121/dense_1/BiasAdd:0) = ] [[0.853173912 1.97515857 0.608713508...]...] [y (Cast_4/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
[[assert_greater_equal_2/Assert/AssertGuard/branch_executed/_65/_167]]
0 successful operations.
0 derived errors ignored. [Op:__inference_test_function_61870]
Function call stack:
test_function -> test_function
I tried several things to solve it, but nothing worked.

ResNet model in Tensorflow Federated

I tried to customize the model in the "Image classification" tutorial in TensorFlow Federated (it originally used a sequential model).
I use Keras ResNet50, but when it begins to train there is always an "Incompatible shapes" error.
Here is my code:
NUM_CLIENTS = 4
NUM_EPOCHS = 10
BATCH_SIZE = 2
SHUFFLE_BUFFER = 5
def create_compiled_keras_model():
    model = tf.keras.applications.resnet.ResNet50(
        include_top=False, weights='imagenet',
        input_tensor=tf.keras.layers.Input(shape=(100, 300, 3)), pooling=None)
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.02),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    return model

def model_fn():
    keras_model = create_compiled_keras_model()
    return tff.learning.from_compiled_keras_model(keras_model, sample_batch)

iterative_process = tff.learning.build_federated_averaging_process(model_fn)
Error information:
[screenshot of the error]
I feel that the shapes are incompatible because the epoch and client information were somehow missing. I would be very thankful if someone could give me a hint.
Updates:
The AssertionError happened during tff.learning.build_federated_averaging_process:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-164-dac26193d9d8> in <module>()
----> 1 iterative_process = tff.learning.build_federated_averaging_process(model_fn)
2
3 # iterative_process = build_federated_averaging_process(model_fn)
13 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/learning/federated_averaging.py in build_federated_averaging_process(model_fn, server_optimizer_fn, client_weight_fn, stateful_delta_aggregate_fn, stateful_model_broadcast_fn)
165 return optimizer_utils.build_model_delta_optimizer_process(
166 model_fn, client_fed_avg, server_optimizer_fn,
--> 167 stateful_delta_aggregate_fn, stateful_model_broadcast_fn)
/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/learning/framework/optimizer_utils.py in build_model_delta_optimizer_process(model_fn, model_to_client_delta_fn, server_optimizer_fn, stateful_delta_aggregate_fn, stateful_model_broadcast_fn)
349 # still need this.
350 with tf.Graph().as_default():
--> 351 dummy_model_for_metadata = model_utils.enhance(model_fn())
352
353 # ===========================================================================
<ipython-input-159-b2763ace8e5b> in model_fn()
1 def model_fn():
2 keras_model = model
----> 3 return tff.learning.from_compiled_keras_model(keras_model, sample_batch)
/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/learning/keras_utils.py in from_compiled_keras_model(keras_model, dummy_batch)
211 # Model.test_on_batch() once before asking for metrics.
212 if isinstance(dummy_tensors, collections.Mapping):
--> 213 keras_model.test_on_batch(**dummy_tensors)
214 else:
215 keras_model.test_on_batch(*dummy_tensors)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in test_on_batch(self, x, y, sample_weight, reset_metrics)
1007 sample_weight=sample_weight,
1008 reset_metrics=reset_metrics,
-> 1009 standalone=True)
1010 outputs = (
1011 outputs['total_loss'] + outputs['output_losses'] + outputs['metrics'])
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in test_on_batch(model, x, y, sample_weight, reset_metrics, standalone)
503 y,
504 sample_weights=sample_weights,
--> 505 output_loss_metrics=model._output_loss_metrics)
506
507 if reset_metrics:
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
568 xla_context.Exit()
569 else:
--> 570 result = self._call(*args, **kwds)
571
572 if tracing_count == self._get_tracing_count():
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
606 # In this case we have not created variables on the first call. So we can
607 # run the first trace but we should fail if variables are created.
--> 608 results = self._stateful_fn(*args, **kwds)
609 if self._created_variables:
610 raise ValueError("Creating variables on a non-first call to a function"
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
2407 """Calls a graph function specialized to the inputs."""
2408 with self._lock:
-> 2409 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
2410 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2411
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2765
2766 self._function_cache.missed.add(call_context_key)
-> 2767 graph_function = self._create_graph_function(args, kwargs)
2768 self._function_cache.primary[cache_key] = graph_function
2769 return graph_function, args, kwargs
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2655 arg_names=arg_names,
2656 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2657 capture_by_value=self._capture_by_value),
2658 self._function_attributes,
2659 # Tell the ConcreteFunction to clean up its graph once it goes out of
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
979 _, original_func = tf_decorator.unwrap(python_func)
980
--> 981 func_outputs = python_func(*func_args, **func_kwargs)
982
983 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in wrapped_fn(*args, **kwds)
437 # __wrapped__ allows AutoGraph to swap in a converted function. We give
438 # the function a weak reference to itself to avoid a reference cycle.
--> 439 return weak_wrapped_fn().__wrapped__(*args, **kwds)
440 weak_wrapped_fn = weakref.ref(wrapped_fn)
441
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py in wrapper(*args, **kwargs)
966 except Exception as e: # pylint:disable=broad-except
967 if hasattr(e, "ag_error_metadata"):
--> 968 raise e.ag_error_metadata.to_exception(e)
969 else:
970 raise
AssertionError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_eager.py:345 test_on_batch *
with backend.eager_learning_phase_scope(0):
/usr/lib/python3.6/contextlib.py:81 __enter__
return next(self.gen)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py:425 eager_learning_phase_scope
assert ops.executing_eagerly_outside_functions()
AssertionError:
Ah, I believe this issue is coming from mismatched expectations on sample_batch. TFF passes sample_batch to Keras, which calls a forward pass with this sample batch to initialize various attributes of the keras model. sample_batch should be either a sample from the literal data you are going to be feeding the model as on the server side, or a batch of fake data which matches the shape and type of the data you will be passing in.
An example of the former can be found here (this uses tf.data.Dataset), and there are several examples of the latter in test code, like here.
From what I see of the definition of the model, likely the x element of your sample_batch should be an ndarray of shape [2, 100, 300, 3] (where 2 is for the batch size, but technically this can be any nonzero dimension), and the y element should also match the expected y structure in the data you are using.
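A minimal sketch of the second option, a batch of fake data matching the model definition above; the dtypes and especially the y shape are assumptions here and have to match whatever your real federated data provides:
import collections
import numpy as np

sample_batch = collections.OrderedDict(
    x=np.zeros([2, 100, 300, 3], dtype=np.float32),  # batch of 2 images matching the Input shape
    y=np.zeros([2, 1], dtype=np.int64))              # placeholder labels; match your real label structure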
I hope this helps, just ping back if there are any problems!
One thing to note, that may be helpful in thinking about TFF--TFF is building a syntax tree representing the distributed computation you are defining via build_federated_averaging_process. This error actually occurs during construction of this object. TFF must trace the computation you pass it in order to know what structure to generate, and this is what is raising here. Actual training of the model happens when you call next on the returned IterativeProcess.
I have the same problem:
if I execute these lines
state, metrics = iterative_process.next(state, federated_train_data)
print('round 1, metrics={}'.format(metrics))
I get this error:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Default MaxPoolingOp only supports NHWC on device type CPU
[[{{node StatefulPartitionedCall/StatefulPartitionedCall/sequential/vgg16/block1_pool/MaxPool}}]]
[[subcomputation/StatefulPartitionedCall_1/ReduceDataset]]
[[subcomputation/StatefulPartitionedCall_1/ReduceDataset/_140]]
(1) Invalid argument: Default MaxPoolingOp only supports NHWC on device type CPU
[[{{node StatefulPartitionedCall/StatefulPartitionedCall/sequential/vgg16/block1_pool/MaxPool}}]]
[[subcomputation/StatefulPartitionedCall_1/ReduceDataset]]
0 successful operations.
0 derived errors ignored.
Note that I am using VGG16.
Do you have any idea about this type of error?

TensorFlow beginner use estimator for prediction after running experiment

I am following this guide by Google (https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/courses/machine_learning/tensorflow/d_experiment.ipynb) to build a simple linear regression model.
In the notebook it used the Experiment class and learn_runner (a class for which I cannot find any documentation) to train the model. I am now trying to use the model for prediction. I tried the following but got an error. Would you please let me know the correct way to do it? Thanks.
Code added to the bottom:
# load the saved model
estimator = tflearn.LinearRegressor(feature_columns=feature_cols, model_dir='taxi_trained')
estimator.predict(input_fn=get_test)
The error I got:
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_is_chief': True, '_model_dir': None, '_save_checkpoints_secs': 600, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000218611630F0>, '_master': '', '_task_id': 0, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_environment': 'local', '_num_worker_replicas': 0, '_tf_random_seed': None, '_tf_config': gpu_options {
per_process_gpu_memory_fraction: 1
}
, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_task_type': None, '_num_ps_replicas': 0, '_save_summary_steps': 100}
WARNING:tensorflow:From c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\util\deprecation.py:335: calling LinearRegressor.predict (from tensorflow.contrib.learn.python.learn.estimators.linear) with outputs=None is deprecated and will be removed after 2017-03-01.
Instructions for updating:
Please switch to predict_scores, or set `outputs` argument.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-5-7f1903437174> in <module>()
1 with tf.Session() as sess:
2 estimator = tflearn.LinearRegressor(feature_columns=feature_cols, model_dir='taxi_trained')
----> 3 estimator.predict(input_fn=get_test)
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\util\deprecation.py in new_func(*args, **kwargs)
333 _call_location(), decorator_utils.get_qualified_name(func),
334 func.__module__, arg_name, arg_value, date, instructions)
--> 335 return func(*args, **kwargs)
336 new_func.__doc__ = _add_deprecated_arg_notice_to_docstring(
337 func.__doc__, date, instructions)
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\util\deprecation.py in new_func(*args, **kwargs)
333 _call_location(), decorator_utils.get_qualified_name(func),
334 func.__module__, arg_name, arg_value, date, instructions)
--> 335 return func(*args, **kwargs)
336 new_func.__doc__ = _add_deprecated_arg_notice_to_docstring(
337 func.__doc__, date, instructions)
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\linear.py in predict(self, x, input_fn, batch_size, outputs, as_iterable)
755 input_fn=input_fn,
756 batch_size=batch_size,
--> 757 as_iterable=as_iterable)
758 return super(LinearRegressor, self).predict(
759 x=x,
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\util\deprecation.py in new_func(*args, **kwargs)
333 _call_location(), decorator_utils.get_qualified_name(func),
334 func.__module__, arg_name, arg_value, date, instructions)
--> 335 return func(*args, **kwargs)
336 new_func.__doc__ = _add_deprecated_arg_notice_to_docstring(
337 func.__doc__, date, instructions)
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\linear.py in predict_scores(self, x, input_fn, batch_size, as_iterable)
790 batch_size=batch_size,
791 outputs=[key],
--> 792 as_iterable=as_iterable)
793 if as_iterable:
794 return _as_iterable(preds, output=key)
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\util\deprecation.py in new_func(*args, **kwargs)
279 _call_location(), decorator_utils.get_qualified_name(func),
280 func.__module__, arg_name, date, instructions)
--> 281 return func(*args, **kwargs)
282 new_func.__doc__ = _add_deprecated_arg_notice_to_docstring(
283 func.__doc__, date, instructions)
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py in predict(self, x, input_fn, batch_size, outputs, as_iterable)
563 feed_fn=feed_fn,
564 outputs=outputs,
--> 565 as_iterable=as_iterable)
566
567 def get_variable_value(self, name):
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py in _infer_model(self, input_fn, feed_fn, outputs, as_iterable, iterate_batches)
855 contrib_framework.create_global_step(g)
856 features = self._get_features_from_input_fn(input_fn)
--> 857 infer_ops = self._get_predict_ops(features)
858 predictions = self._filter_predictions(infer_ops.predictions, outputs)
859 mon_sess = monitored_session.MonitoredSession(
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py in _get_predict_ops(self, features)
1186 labels = tensor_signature.create_placeholders_from_signatures(
1187 self._labels_info)
-> 1188 return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.INFER)
1189
1190 def export_savedmodel(
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py in _call_model_fn(self, features, labels, mode)
1101 if 'model_dir' in model_fn_args:
1102 kwargs['model_dir'] = self.model_dir
-> 1103 model_fn_results = self._model_fn(features, labels, **kwargs)
1104
1105 if isinstance(model_fn_results, model_fn_lib.ModelFnOps):
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\linear.py in _linear_model_fn(features, labels, mode, params, config)
159 num_outputs=head.logits_dimension,
160 weight_collections=[parent_scope],
--> 161 scope=scope)
162
163 def _train_op_fn(loss):
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\layers\python\layers\feature_column_ops.py in weighted_sum_from_feature_columns(columns_to_tensors, feature_columns, num_outputs, weight_collections, trainable, scope)
529 # pylint: disable=protected-access
530 for column in sorted(set(feature_columns), key=lambda x: x.key):
--> 531 transformed_tensor = transformer.transform(column)
532 try:
533 embedding_lookup_arguments = column._wide_embedding_lookup_arguments(
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\layers\python\layers\feature_column_ops.py in transform(self, feature_column)
880 return self._columns_to_tensors[feature_column]
881
--> 882 feature_column.insert_transformed_feature(self._columns_to_tensors)
883
884 if feature_column not in self._columns_to_tensors:
c:\users\tommy\appdata\local\programs\python\python35\lib\site-packages\tensorflow\contrib\layers\python\layers\feature_column.py in insert_transformed_feature(self, columns_to_tensors)
1406 """
1407 # Transform the input tensor according to the normalizer function.
-> 1408 input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
1409 columns_to_tensors[self] = math_ops.to_float(input_tensor)
1410
KeyError: 'dropofflat'
I am using TensorFlow 1.1 with Python 3.5 on Windows 10. GPU enabled.