I am working on Colab with a TPU, but unfortunately it does not work properly: the model runs into an error while fitting.
Here is my code:
# Resolve the TPU at tpu_address, connect to it and build a TPUStrategy.
# NOTE(review): tf.tpu.experimental.initialize_tpu_system(resolver) is not
# called in this snippet -- confirm it is done elsewhere.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# NOTE(review): the original indentation was lost; this layout assumes only
# the model creation sits inside the strategy scope -- confirm.
with strategy.scope():
    model = create_model(input_shape=(HEIGHT, WIDTH, CANAL), n_out=N_CLASSES)

# Freeze every layer, then re-enable training for the last five layers only.
for layer in model.layers:
    layer.trainable = False
for i in range(-5, 0):
    model.layers[i].trainable = True

# Callbacks monitoring val_loss: early stopping (restoring the best weights)
# and learning-rate reduction on plateau.
# NOTE(review): callback_list is built here but is not passed to
# fit_generator below, so neither callback is used in this snippet.
es = EarlyStopping(monitor='val_loss', mode='min', patience=ES_PATIENCE, restore_best_weights=True, verbose=1)
rlrop = ReduceLROnPlateau(monitor='val_loss', mode='min', patience=RLROP_PATIENCE, factor=DECAY_DROP, min_lr=1e-6, verbose=1)
callback_list = [es, rlrop]

# NOTE(review): `optimizer` is not used below; compile() creates its own Adam
# instance with WARMUP_LEARNING_RATE instead.
optimizer = optimizers.Adam(lr=LEARNING_RATE)
model.compile(optimizer = optimizers.Adam(lr=WARMUP_LEARNING_RATE),
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])
model.summary()

# Number of whole batches per epoch for the training and validation generators.
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size

# Train from the Python generators for EPOCHS epochs.
history_finetunning = model.fit_generator(generator=train_generator,
                                          steps_per_epoch=STEP_SIZE_TRAIN,
                                          epochs=EPOCHS,
                                          validation_data=valid_generator,
                                          validation_steps=STEP_SIZE_VALID,
                                          verbose =1)
And this is the error:
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py:1915: UserWarning: `Model.fit_generator` is deprecated and will be removed in a future version. Please use `Model.fit`, which supports generators.
warnings.warn('`Model.fit_generator` is deprecated and '
Epoch 1/40
---------------------------------------------------------------------------
UnavailableError Traceback (most recent call last)
<ipython-input-41-1c157bad2449> in <module>()
4 validation_data=valid_generator,
5 validation_steps=STEP_SIZE_VALID,
----> 6 verbose =1)
14 frames
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1930 use_multiprocessing=use_multiprocessing,
1931 shuffle=shuffle,
-> 1932 initial_epoch=initial_epoch)
1933
1934 def evaluate_generator(self,
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1161 logs = tmp_logs # No error, now safe to assign to logs.
1162 end_step = step + data_handler.step_increment
-> 1163 callbacks.on_train_batch_end(end_step, logs)
1164 if self.stop_training:
1165 break
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in on_train_batch_end(self, batch, logs)
434 """
435 if self._should_call_train_batch_hooks:
--> 436 self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
437
438 def on_test_batch_begin(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_hook(self, mode, hook, batch, logs)
276 self._call_batch_begin_hook(mode, batch, logs)
277 elif hook == 'end':
--> 278 self._call_batch_end_hook(mode, batch, logs)
279 else:
280 raise ValueError('Unrecognized hook: {}'.format(hook))
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_end_hook(self, mode, batch, logs)
296 self._batch_times.append(batch_time)
297
--> 298 self._call_batch_hook_helper(hook_name, batch, logs)
299
300 if len(self._batch_times) >= self._num_batches_for_timing_check:
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_hook_helper(self, hook_name, batch, logs)
336 hook = getattr(callback, hook_name)
337 if getattr(callback, '_supports_tf_logs', False):
--> 338 hook(batch, logs)
339 else:
340 if numpy_logs is None: # Only convert once.
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in on_train_batch_end(self, batch, logs)
1042
1043 def on_train_batch_end(self, batch, logs=None):
-> 1044 self._batch_update_progbar(batch, logs)
1045
1046 def on_test_batch_end(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _batch_update_progbar(self, batch, logs)
1106 if self.verbose == 1:
1107 # Only block async when verbose = 1.
-> 1108 logs = tf_utils.sync_to_numpy_or_python_type(logs)
1109 self.progbar.update(self.seen, list(logs.items()), finalize=False)
1110
/usr/local/lib/python3.7/dist-packages/keras/utils/tf_utils.py in sync_to_numpy_or_python_type(tensors)
505 return t # Don't turn ragged or sparse tensors to NumPy.
506
--> 507 return tf.nest.map_structure(_to_single_numpy_or_python_type, tensors)
508
509
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/keras/utils/tf_utils.py in _to_single_numpy_or_python_type(t)
501 def _to_single_numpy_or_python_type(t):
502 if isinstance(t, tf.Tensor):
--> 503 x = t.numpy()
504 return x.item() if np.ndim(x) == 0 else x
505 return t # Don't turn ragged or sparse tensors to NumPy.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in numpy(self)
1092 """
1093 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
-> 1094 maybe_arr = self._numpy() # pylint: disable=protected-access
1095 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
1096
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1060 return self._numpy_internal()
1061 except core._NotOkStatusException as e: # pylint: disable=protected-access
-> 1062 six.raise_from(core._status_to_exception(e.code, e.message), None) # pylint: disable=protected-access
1063
1064 #property
/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)
UnavailableError: 3 root error(s) found.
(0) Unavailable: {{function_node __inference_train_function_53748}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"#1626347736.544045826","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":5420,"referenced_errors":[{"created":"#1626347735.785465323","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNextAsOptional]]
[[cond_11/switch_pred/_107/_76]]
(1) Unavailable: {{function_node __inference_train_function_53748}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"#1626347736.544045826","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":5420,"referenced_errors":[{"created":"#1626347735.785465323","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNextAsOptional]]
[[cluster_train_function/_execute_2_0/_333]]
(2) Unavailable: {{function_node __inference_train_function_53748}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"#1626347736.544045826","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":5420,"referenced_errors":[{"created":"#1626347735.785465323","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNextAsOptional]]
0 successful operations.
6 derived errors ignored.
Using the same TPU configuration code with a dataset from tensorflow_datasets (tfds) and fitting the model with the model.fit method does not raise any error.
Model.fit_generator is deprecated and doesn't work on TPUs.
Try using tf.keras.preprocessing.image_dataset_from_directory or tf.data.Dataset, and combine it with Keras preprocessing layers.
Related
Got ResourceExhaustedError while training deep learning algorithm on NVIDIA GeForce RTX 3050 Ti Laptop GPU using tensorflow with memory_limit: 1721342363
I am training on 2870 images and it's working well on the CPU, but on the GPU it seems to be restricted by a memory limit. Have I turned on a memory limit on my GPU, or do I have no option but to use my CPU?
It took me 70 mins on my CPU and that is why I chose to run on my GPU.
But while training with:
# Configure training: categorical cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for 20 epochs, using test_set as the validation data.
history = model.fit(training_set, validation_data=test_set, epochs=20, batch_size=32)
Got this error:
Epoch 1/20
1/45 [..............................] - ETA: 9:46 - loss: 1.8638 - accuracy: 0.1667
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
Cell In [4], line 3
1 #Compile the model
2 model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
----> 3 history = model.fit(training_set, validation_data=test_set, epochs=20, batch_size=32)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\keras\engine\training.py:1184, in Model.fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1177 with tf.profiler.experimental.Trace(
1178 'train',
1179 epoch_num=epoch,
1180 step_num=step,
1181 batch_size=batch_size,
1182 _r=1):
1183 callbacks.on_train_batch_begin(step)
-> 1184 tmp_logs = self.train_function(iterator)
1185 if data_handler.should_sync:
1186 context.async_wait()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\def_function.py:885, in Function.__call__(self, *args, **kwds)
882 compiler = "xla" if self._jit_compile else "nonXla"
884 with OptionalXlaContext(self._jit_compile):
--> 885 result = self._call(*args, **kwds)
887 new_tracing_count = self.experimental_get_tracing_count()
888 without_tracing = (tracing_count == new_tracing_count)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\def_function.py:917, in Function._call(self, *args, **kwds)
914 self._lock.release()
915 # In this case we have created variables on the first call, so we run the
916 # defunned version which is guaranteed to never create variables.
--> 917 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
918 elif self._stateful_fn is not None:
919 # Release the lock early so that multiple threads can perform the call
920 # in parallel.
921 self._lock.release()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:3039, in Function.__call__(self, *args, **kwargs)
3036 with self._lock:
3037 (graph_function,
3038 filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 3039 return graph_function._call_flat(
3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:1963, in ConcreteFunction._call_flat(self, args, captured_inputs, cancellation_manager)
1959 possible_gradient_type = gradients_util.PossibleTapeGradientTypes(args)
1960 if (possible_gradient_type == gradients_util.POSSIBLE_GRADIENT_TYPES_NONE
1961 and executing_eagerly):
1962 # No tape is watching; skip to running the function.
-> 1963 return self._build_call_outputs(self._inference_function.call(
1964 ctx, args, cancellation_manager=cancellation_manager))
1965 forward_backward = self._select_forward_and_backward_functions(
1966 args,
1967 possible_gradient_type,
1968 executing_eagerly)
1969 forward_function, args_with_tangents = forward_backward.forward()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:591, in _EagerDefinedFunction.call(self, ctx, args, cancellation_manager)
589 with _InterpolateFunctionError(self):
590 if cancellation_manager is None:
--> 591 outputs = execute.execute(
592 str(self.signature.name),
593 num_outputs=self._num_outputs,
594 inputs=args,
595 attrs=attrs,
596 ctx=ctx)
597 else:
598 outputs = execute.execute_with_cancellation(
599 str(self.signature.name),
600 num_outputs=self._num_outputs,
(...)
603 ctx=ctx,
604 cancellation_manager=cancellation_manager)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\execute.py:59, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
ResourceExhaustedError: OOM when allocating tensor with shape[64,64,224,224] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[node model/block1_conv2/Relu (defined at \AppData\Local\Temp\ipykernel_11956\3538519329.py:3) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_1205]
Function call stack:
train_function
I might not be entirely right on the details, but the training process looks like this from the GPU memory perspective:
the network is loaded into the GPU memory
a batch is loaded into the GPU memory
trainable parameters are computed on the GPU
So there are several ways to reduce GPU memory usage:
Reduce the number of trainable parameters so each training step requires less computations. You can do this by either reducing the size of the net or by making some of the later layers non-trainable if the net is pretrained (tf.keras.applications has a lot of networks pretrained on the imagenet).
To make layers of the Functional net (in which the abovementioned nets are written) untrainable, you should switch their .trainable attribute to False:
Say, to freeze the first 100 layers (every layer before index 100), you need to:
# Number of leading layers to freeze: every layer before this index becomes
# non-trainable, the remaining layers are left as they are.
ind_to_freeze = 100
# Use the variable in the slice (instead of repeating the literal 100) so that
# changing ind_to_freeze actually changes which layers are frozen.
for layer in net.layers[:ind_to_freeze]:
    layer.trainable = False
Reduce the image shapes so each batch requires less memory. Generally, this will lower the accuracy.
Reduce the batch size so each batch requires less memory.
Also, I see you've loaded all 2870 images into the (CPU) memory at once. I recommend using a generator instead, one that takes filenames and reads the images on demand. You can read about building native tf.data.Datasets here.
When trying to train my model on a TPU in Colab with:
# Train on train_dataset and validate on valid_dataset; steps_per_epoch is the
# number of whole batches of config.BATCH_SIZE in df_train.
model.fit(train_dataset,
          steps_per_epoch = len(df_train) // config.BATCH_SIZE,
          validation_data = valid_dataset,
          epochs = config.EPOCHS)
I got this error with whole traceback:
UnimplementedError Traceback (most recent call last)
<ipython-input-37-92afbe2b5ae5> in <module>()
2 steps_per_epoch = len(df_train) // config.BATCH_SIZE,
3 validation_data = valid_dataset,
----> 4 epochs = config.EPOCHS)
13 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1186 logs = tmp_logs # No error, now safe to assign to logs.
1187 end_step = step + data_handler.step_increment
-> 1188 callbacks.on_train_batch_end(end_step, logs)
1189 if self.stop_training:
1190 break
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in on_train_batch_end(self, batch, logs)
455 """
456 if self._should_call_train_batch_hooks:
--> 457 self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
458
459 def on_test_batch_begin(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _call_batch_hook(self, mode, hook, batch, logs)
315 self._call_batch_begin_hook(mode, batch, logs)
316 elif hook == 'end':
--> 317 self._call_batch_end_hook(mode, batch, logs)
318 else:
319 raise ValueError('Unrecognized hook: {}'.format(hook))
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _call_batch_end_hook(self, mode, batch, logs)
335 self._batch_times.append(batch_time)
336
--> 337 self._call_batch_hook_helper(hook_name, batch, logs)
338
339 if len(self._batch_times) >= self._num_batches_for_timing_check:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _call_batch_hook_helper(self, hook_name, batch, logs)
373 for callback in self.callbacks:
374 hook = getattr(callback, hook_name)
--> 375 hook(batch, logs)
376
377 if self._check_timing:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in on_train_batch_end(self, batch, logs)
1027
1028 def on_train_batch_end(self, batch, logs=None):
-> 1029 self._batch_update_progbar(batch, logs)
1030
1031 def on_test_batch_end(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/callbacks.py in _batch_update_progbar(self, batch, logs)
1099 if self.verbose == 1:
1100 # Only block async when verbose = 1.
-> 1101 logs = tf_utils.sync_to_numpy_or_python_type(logs)
1102 self.progbar.update(self.seen, list(logs.items()), finalize=False)
1103
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/utils/tf_utils.py in sync_to_numpy_or_python_type(tensors)
517 return t # Don't turn ragged or sparse tensors to NumPy.
518
--> 519 return nest.map_structure(_to_single_numpy_or_python_type, tensors)
520
521
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
865
866 return pack_sequence_as(
--> 867 structure[0], [func(*x) for x in entries],
868 expand_composites=expand_composites)
869
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/utils/tf_utils.py in _to_single_numpy_or_python_type(t)
513 def _to_single_numpy_or_python_type(t):
514 if isinstance(t, ops.Tensor):
--> 515 x = t.numpy()
516 return x.item() if np.ndim(x) == 0 else x
517 return t # Don't turn ragged or sparse tensors to NumPy.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in numpy(self)
1092 """
1093 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
-> 1094 maybe_arr = self._numpy() # pylint: disable=protected-access
1095 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
1096
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1060 return self._numpy_internal()
1061 except core._NotOkStatusException as e: # pylint: disable=protected-access
-> 1062 six.raise_from(core._status_to_exception(e.code, e.message), None) # pylint: disable=protected-access
1063
1064 #property
/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)
UnimplementedError: 9 root error(s) found.
(0) Unimplemented: {{function_node __inference_train_function_88574}} Asked to propagate a dynamic dimension from hlo convolution.24975#{}#2 to hlo %all-reduce.24980 = f32[3,3,<=3,32]{3,2,1,0} all-reduce(f32[3,3,<=3,32]{3,2,1,0} %convolution.24975), replica_groups={{0,1,2,3,4,5,6,7}}, to_apply=%sum.24976, metadata={op_type="CrossReplicaSum" op_name="while/body/_1/while/Adam/CrossReplicaSum"}, which is not implemented.
[[{{node TPUReplicate/_compile/_18168620323984915962/_4}}]]
[[while/body/_1/while/strided_slice_1/_253]]
(1) Unimplemented: {{function_node __inference_train_function_88574}} Asked to propagate a dynamic dimension from hlo convolution.24975#{}#2 to hlo %all-reduce.24980 = f32[3,3,<=3,32]{3,2,1,0} all-reduce(f32[3,3,<=3,32]{3,2,1,0} %convolution.24975), replica_groups={{0,1,2,3,4,5,6,7}}, to_apply=%sum.24976, metadata={op_type="CrossReplicaSum" op_name="while/body/_1/while/Adam/CrossReplicaSum"}, which is not implemented.
[[{{node TPUReplicate/_compile/_18168620323984915962/_4}}]]
[[TPUReplicate/_compile/_18168620323984915962/_4/_243]]
(2) Unimplemented: {{function_node __inference_train_function_88574}} Asked to propagate a dynamic dimension from hlo convolution.24975#{}#2 to hlo %all-reduce.24980 = f32[3,3,<=3,32]{3,2,1,0} all-reduce(f32[3,3,<=3,32]{3,2,1,0} %convolution.24975), replica_groups={{0,1,2,3,4,5,6,7}}, to_apply=%sum.24976, metadata={op_type="CrossReplicaSum" op_name="while/body/_1/while/Adam/CrossReplicaSum"}, which is not implemented.[truncated]
Things That I have checked:
My data is in a GCS bucket and can be retrieved using the dataset object I created.
My model definition:
# Build and compile the classifier under the distribution strategy.
# NOTE(review): the original indentation was lost; this layout assumes both the
# model definition and compile() sit inside strategy.scope() -- confirm.
with strategy.scope():
    # EfficientNetB0 backbone without its classification top.
    # NOTE(review): no input shape is passed to the backbone here; presumably
    # relevant to the "dynamic dimension" error on TPU -- confirm.
    base_model = efn.EfficientNetB0(include_top=False)
    # Input -> backbone -> global average pooling -> 5-unit softmax output.
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(config.IMG_SIZE, config.IMG_SIZE, 3)),
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(5, activation='softmax')
    ])
    # Adam optimizer, sparse categorical cross-entropy loss and accuracy metric;
    # steps_per_execution is set to 32.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=config.LR),
                  loss = tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics = [tf.keras.metrics.SparseCategoricalAccuracy()],
                  steps_per_execution = 32)
Any idea why this is happening? It says that a dynamic dimension was asked to propagate, but I don't think this should be the case, considering the model worked in GPU settings (with data present in the current session).
I was trying to reproduce the "Transfer learning and fine-tuning" example, adapting it to my problem (my Colab runs on a GPU). When I use a softmax in the last Dense layer, this error does not occur. But with 'from_logits=True', the error occurs. My images are JPGs and they are divided into folders:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-85-7ea61d5df8ec> in <module>()
----> 1 loss0, accuracy0, auc0, precision0, recall0 = model.evaluate(val_ds)
5 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in evaluate(self, x, y, batch_size, verbose, sample_weight, steps, callbacks, max_queue_size, workers, use_multiprocessing, return_dict)
1387 with trace.Trace('test', step_num=step, _r=1):
1388 callbacks.on_test_batch_begin(step)
-> 1389 tmp_logs = self.test_function(iterator)
1390 if data_handler.should_sync:
1391 context.async_wait()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
826 tracing_count = self.experimental_get_tracing_count()
827 with trace.Trace(self._name) as tm:
--> 828 result = self._call(*args, **kwds)
829 compiler = "xla" if self._experimental_compile else "nonXla"
830 new_tracing_count = self.experimental_get_tracing_count()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
893 # If we did not create any variables the trace we have is good enough.
894 return self._concrete_stateful_fn._call_flat(
--> 895 filtered_flat_args, self._concrete_stateful_fn.captured_inputs) # pylint: disable=protected-access
896
897 def fn_with_cond(inner_args, inner_kwds, inner_filtered_flat_args):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1917 # No tape is watching; skip to running the function.
1918 return self._build_call_outputs(self._inference_function.call(
-> 1919 ctx, args, cancellation_manager=cancellation_manager))
1920 forward_backward = self._select_forward_and_backward_functions(
1921 args,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
558 inputs=args,
559 attrs=attrs,
--> 560 ctx=ctx)
561 else:
562 outputs = execute.execute_with_cancellation(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (Dense121/dense_1/BiasAdd:0) = ] [[0.853173912 1.97515857 0.608713508...]...] [y (Cast_4/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
(1) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (Dense121/dense_1/BiasAdd:0) = ] [[0.853173912 1.97515857 0.608713508...]...] [y (Cast_4/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
[[assert_greater_equal_2/Assert/AssertGuard/branch_executed/_65/_167]]
0 successful operations.
0 derived errors ignored. [Op:__inference_test_function_61870]
Function call stack:
test_function -> test_function
I tried several things to solve it, and nothing worked.
I'm getting the following error when I call model.fit. I've described my model and other variables after this error.
--------------------------------------------------------------------------- TypeError Traceback (most recent call
last) in
----> 1 results = model.fit({'title_input': sequences_matrix_train_t, 'body_input': sequences_matrix_train_b},
2 {'main_output': y_train, 'aux_output': y_train},
3 validation_data=[{'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b},
4 {'main_output': y_test, 'aux_output': y_test}],
5 epochs=5, batch_size=800)
~/Library/Python/3.8/lib/python/site-packages/tensorflow/python/keras/engine/training.py
in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside run_distribute_coordinator already.
~/Library/Python/3.8/lib/python/site-packages/tensorflow/python/keras/engine/training.py
in fit(self, x, y, batch_size, epochs, verbose, callbacks,
validation_split, validation_data, shuffle, class_weight,
sample_weight, initial_epoch, steps_per_epoch, validation_steps,
validation_batch_size, validation_freq, max_queue_size, workers,
use_multiprocessing) 1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator) 1099 if data_handler.should_sync: 1100
context.async_wait()
~/Library/Python/3.8/lib/python/site-packages/tensorflow/python/eager/def_function.py
in call(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~/Library/Python/3.8/lib/python/site-packages/tensorflow/python/eager/def_function.py
in _call(self, *args, **kwds)
805 # In this case we have created variables on the first call, so we run the
806 # defunned version which is guaranteed to never create variables.
--> 807 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
808 elif self._stateful_fn is not None:
809 # Release the lock early so that multiple threads can perform the call
TypeError: 'NoneType' object is not callable
Getting the above error when I fit my model. The model is described below:-
def RNN():
    """Build the two-input, two-output GRU model for title and body text.

    Each input sequence is embedded (with ``mask_zero=True``) and encoded by
    its own GRU. The auxiliary output is a 10-unit sigmoid layer on the title
    encoding alone; the main output is a 10-unit sigmoid layer on top of a
    dense stack fed with the concatenated title and body encodings.

    Returns:
        A Keras ``Model`` with inputs ``[title_input, body_input]`` and
        outputs ``[main_output, aux_output]``. The model is not compiled here.
    """
    # Title branch: masked embedding followed by a 300-unit GRU.
    title_tokens = Input(name='title_input', shape=[max_len_of_title_tokens])
    title_vectors = Embedding(
        vocab_len_t + 1,
        2000,
        input_length=max_len_of_title_tokens,
        mask_zero=True,
        name='title_Embed',
    )(title_tokens)
    title_state = GRU(300)(title_vectors)

    # Auxiliary head attached directly to the title encoding.
    aux_head = Dense(10, activation='sigmoid', name='aux_output')(title_state)

    # Body branch: masked embedding followed by a 200-unit GRU.
    body_tokens = Input(name='body_input', shape=[max_len_of_body_tokens])
    body_vectors = Embedding(
        vocab_len_b + 1,
        170,
        input_length=max_len_of_body_tokens,
        mask_zero=True,
        name='body_Embed',
    )(body_tokens)
    body_state = GRU(200)(body_vectors)

    # Main head: merge both encodings, then dense -> dropout -> batch norm -> dense.
    hidden = concatenate([title_state, body_state])
    hidden = Dense(400, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dense(150, activation='relu')(hidden)
    main_head = Dense(10, activation='sigmoid', name='main_output')(hidden)

    return Model(
        inputs=[title_tokens, body_tokens],
        outputs=[main_head, aux_head],
    )
# End of the RNN() function (its indentation was lost when posting).
# The code below runs outside RNN().
model = RNN()  # this `model` is the object that `fit` is called on
# Fit the model: the same labels are used as targets for both outputs, and the
# test split is passed as validation data.
# NOTE(review): validation_data is passed as a list here; the Keras docs
# describe it as a tuple (x_val, y_val) -- confirm whether this matters.
results = model.fit({'title_input': sequences_matrix_train_t, 'body_input': sequences_matrix_train_b},
                    {'main_output': y_train, 'aux_output': y_train},
                    validation_data=[{'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b},
                                     {'main_output': y_test, 'aux_output': y_test}],
                    epochs=5, batch_size=800)
Can someone please look over my error and help me figure out what is causing it?
Here's my jupyter notebook for more info – https://github.com/datares/teamRed/blob/main/post_classification.ipynb
I tried to customize the model in the "Image classification" tutorial in TensorFlow Federated. (It originally used a sequential model.)
I use the Keras ResNet50, but when it begins to train, there is always an "Incompatible shapes" error.
Here is my code:
# Experiment settings. How they are consumed is not shown in this snippet;
# the names suggest client count, epoch count, batch size and shuffle-buffer
# size -- confirm against the rest of the notebook.
NUM_CLIENTS = 4
NUM_EPOCHS = 10
BATCH_SIZE = 2
SHUFFLE_BUFFER = 5
def create_compiled_keras_model():
    """Return an ImageNet-pretrained ResNet50 without its top, compiled for training.

    The network takes a fixed (100, 300, 3) input, is built with
    ``include_top=False`` and ``pooling=None``, and is compiled with sparse
    categorical cross-entropy, SGD (learning rate 0.02) and a sparse
    categorical accuracy metric.

    Returns:
        The compiled Keras model.
    """
    # NOTE(review): with include_top=False and pooling=None the Keras docs
    # describe the output as the 4-D feature map of the last convolutional
    # block; a sparse categorical loss may not accept that -- possibly the
    # source of the reported "Incompatible shapes" error; confirm.
    image_input = tf.keras.layers.Input(shape=(100, 300, 3))
    backbone = tf.keras.applications.resnet.ResNet50(
        include_top=False,
        weights='imagenet',
        input_tensor=image_input,
        pooling=None,
    )
    backbone.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.02),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )
    return backbone
def model_fn():
    """Return a TFF model wrapping a freshly built, compiled Keras model.

    Each call builds a new model with ``create_compiled_keras_model`` and
    passes it, together with the ``sample_batch`` defined outside this
    function, to ``tff.learning.from_compiled_keras_model``.
    """
    return tff.learning.from_compiled_keras_model(
        create_compiled_keras_model(), sample_batch)
# Build the federated averaging process from model_fn. The traceback in this
# post shows model_fn() being invoked while this object is constructed, so
# errors raised inside model_fn surface here rather than during training.
iterative_process = tff.learning.build_federated_averaging_process(model_fn)
Error information:
enter image description here
I suspect the shapes are incompatible because the epoch and client information is somehow missing. I would be very thankful if someone could give me a hint.
Updates:
The AssertionError happened during the call to tff.learning.build_federated_averaging_process:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-164-dac26193d9d8> in <module>()
----> 1 iterative_process = tff.learning.build_federated_averaging_process(model_fn)
2
3 # iterative_process = build_federated_averaging_process(model_fn)
13 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/learning/federated_averaging.py in build_federated_averaging_process(model_fn, server_optimizer_fn, client_weight_fn, stateful_delta_aggregate_fn, stateful_model_broadcast_fn)
165 return optimizer_utils.build_model_delta_optimizer_process(
166 model_fn, client_fed_avg, server_optimizer_fn,
--> 167 stateful_delta_aggregate_fn, stateful_model_broadcast_fn)
/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/learning/framework/optimizer_utils.py in build_model_delta_optimizer_process(model_fn, model_to_client_delta_fn, server_optimizer_fn, stateful_delta_aggregate_fn, stateful_model_broadcast_fn)
349 # still need this.
350 with tf.Graph().as_default():
--> 351 dummy_model_for_metadata = model_utils.enhance(model_fn())
352
353 # ===========================================================================
<ipython-input-159-b2763ace8e5b> in model_fn()
1 def model_fn():
2 keras_model = model
----> 3 return tff.learning.from_compiled_keras_model(keras_model, sample_batch)
/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/learning/keras_utils.py in from_compiled_keras_model(keras_model, dummy_batch)
211 # Model.test_on_batch() once before asking for metrics.
212 if isinstance(dummy_tensors, collections.Mapping):
--> 213 keras_model.test_on_batch(**dummy_tensors)
214 else:
215 keras_model.test_on_batch(*dummy_tensors)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in test_on_batch(self, x, y, sample_weight, reset_metrics)
1007 sample_weight=sample_weight,
1008 reset_metrics=reset_metrics,
-> 1009 standalone=True)
1010 outputs = (
1011 outputs['total_loss'] + outputs['output_losses'] + outputs['metrics'])
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in test_on_batch(model, x, y, sample_weight, reset_metrics, standalone)
503 y,
504 sample_weights=sample_weights,
--> 505 output_loss_metrics=model._output_loss_metrics)
506
507 if reset_metrics:
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
568 xla_context.Exit()
569 else:
--> 570 result = self._call(*args, **kwds)
571
572 if tracing_count == self._get_tracing_count():
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
606 # In this case we have not created variables on the first call. So we can
607 # run the first trace but we should fail if variables are created.
--> 608 results = self._stateful_fn(*args, **kwds)
609 if self._created_variables:
610 raise ValueError("Creating variables on a non-first call to a function"
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
2407 """Calls a graph function specialized to the inputs."""
2408 with self._lock:
-> 2409 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
2410 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2411
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2765
2766 self._function_cache.missed.add(call_context_key)
-> 2767 graph_function = self._create_graph_function(args, kwargs)
2768 self._function_cache.primary[cache_key] = graph_function
2769 return graph_function, args, kwargs
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2655 arg_names=arg_names,
2656 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2657 capture_by_value=self._capture_by_value),
2658 self._function_attributes,
2659 # Tell the ConcreteFunction to clean up its graph once it goes out of
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
979 _, original_func = tf_decorator.unwrap(python_func)
980
--> 981 func_outputs = python_func(*func_args, **func_kwargs)
982
983 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in wrapped_fn(*args, **kwds)
437 # __wrapped__ allows AutoGraph to swap in a converted function. We give
438 # the function a weak reference to itself to avoid a reference cycle.
--> 439 return weak_wrapped_fn().__wrapped__(*args, **kwds)
440 weak_wrapped_fn = weakref.ref(wrapped_fn)
441
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py in wrapper(*args, **kwargs)
966 except Exception as e: # pylint:disable=broad-except
967 if hasattr(e, "ag_error_metadata"):
--> 968 raise e.ag_error_metadata.to_exception(e)
969 else:
970 raise
AssertionError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_eager.py:345 test_on_batch *
with backend.eager_learning_phase_scope(0):
/usr/lib/python3.6/contextlib.py:81 __enter__
return next(self.gen)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py:425 eager_learning_phase_scope
assert ops.executing_eagerly_outside_functions()
AssertionError:
Ah, I believe this issue is coming from mismatched expectations on sample_batch. TFF passes sample_batch to Keras, which calls a forward pass with this sample batch to initialize various attributes of the keras model. sample_batch should be either a sample from the literal data you are going to be feeding the model as on the server side, or a batch of fake data which matches the shape and type of the data you will be passing in.
An example of the former can be found here (this uses tf.data.Dataset), and there are several examples of the latter in test code, like here.
From what I see of the definition of the model, likely the x element of your sample_batch should be an ndarray of shape [2, 100, 300, 3] (where 2 is for the batch size, but technically this can be any nonzero dimension), and the y element should also match the expected y structure in the data you are using.
I hope this helps, just ping back if there are any problems!
One thing to note, that may be helpful in thinking about TFF--TFF is building a syntax tree representing the distributed computation you are defining via build_federated_averaging_process. This error actually occurs during construction of this object. TFF must trace the computation you pass it in order to know what structure to generate, and this is what is raising here. Actual training of the model happens when you call next on the returned IterativeProcess.
I have the same problem:
If I execute these lines
# Call iterative_process.next with the current state and the federated
# training data; the result is unpacked into the new state and the metrics,
# which are then printed labelled as round 1.
state, metrics = iterative_process.next(state, federated_train_data)
print('round 1, metrics={}'.format(metrics))
I get this error:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Default MaxPoolingOp only supports NHWC on device type CPU
[[{{node StatefulPartitionedCall/StatefulPartitionedCall/sequential/vgg16/block1_pool/MaxPool}}]]
[[subcomputation/StatefulPartitionedCall_1/ReduceDataset]]
[[subcomputation/StatefulPartitionedCall_1/ReduceDataset/_140]]
(1) Invalid argument: Default MaxPoolingOp only supports NHWC on device type CPU
[[{{node StatefulPartitionedCall/StatefulPartitionedCall/sequential/vgg16/block1_pool/MaxPool}}]]
[[subcomputation/StatefulPartitionedCall_1/ReduceDataset]]
0 successful operations.
0 derived errors ignored.
Note that I am using VGG16.
Do you have any idea what causes this type of error?