How to make a custom metric available to TFMA/Beam? - tensorflow

I have created a custom Keras metric, similar to the demo implementation below:
import tensorflow as tf
class MyMetric(tf.keras.metrics.Mean):

    def __init__(self, name='my_metric', dtype=None):
        super(MyMetric, self).__init__(name=name, dtype=dtype)

    def update_state(self, y_true, y_pred, sample_weight=None):
        return super(MyMetric, self).update_state(
            y_pred, sample_weight=sample_weight)
I have turned the implementation into a Python module with the init/main files and added the path to the system's PYTHONPATH.
I can use the metric when I train the Keras model.
Unfortunately, I haven't found a way to make the custom metric available to TensorFlow Model Analysis (TFMA).
In my interactive context notebook, I can load the metric when I create the eval_config.
import tensorflow as tf
import tensorflow_model_analysis as tfma
from mymetric.metric import MyMetric
metrics = [MyMetric()]
metrics_specs = tfma.metrics.specs_from_metrics(metrics)
eval_config = tfma.EvalConfig(
    model_specs=[tfma.ModelSpec(label_key='label_xf')],
    metrics_specs=metrics_specs,
    slicing_specs=[tfma.SlicingSpec()]
)
evaluator = Evaluator(
    examples=example_gen.outputs['examples'],
    model=trainer.outputs['model'],
    baseline_model=model_resolver.outputs['model'],
    eval_config=eval_config)
When I try to execute the evaluator, the metric is listed in the metric specifications
metrics_specs {
  metrics {
    class_name: "MyMetric"
    config: "{\"dtype\": \"float32\", \"name\": \"my_metric\"}"
    threshold {
    }
  }
}
but the execution fails with the error
ValueError: Unknown metric function: MyMetric
Since the metric calculation is executed via Apache Beam's executor.Do function, I assume that Beam can't find the module (even though it is on the PYTHONPATH). If that is the case, how can I make the module available to Apache Beam beyond the PYTHONPATH configuration?
Traceback:
/usr/local/lib/python3.6/dist-packages/tensorflow_model_analysis/metrics/metric_specs.py in _deserialize_tf_metric(metric_config, custom_objects)
741 cls_name, cfg = _tf_class_and_config(metric_config)
742 with tf.keras.utils.custom_object_scope(custom_objects):
--> 743 return tf.keras.metrics.deserialize({'class_name': cls_name, 'config': cfg})
744
745
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/metrics.py in deserialize(config, custom_objects)
3441 module_objects=globals(),
3442 custom_objects=custom_objects,
-> 3443 printable_module_name='metric function')
3444
3445
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/utils/generic_utils.py in deserialize_keras_object(identifier, module_objects, custom_objects, printable_module_name)
345 config = identifier
346 (cls, cls_config) = class_and_config_for_serialized_keras_object(
--> 347 config, module_objects, custom_objects, printable_module_name)
348
349 if hasattr(cls, 'from_config'):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/utils/generic_utils.py in class_and_config_for_serialized_keras_object(config, module_objects, custom_objects, printable_module_name)
294 cls = get_registered_object(class_name, custom_objects, module_objects)
295 if cls is None:
--> 296 raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
297
298 cls_config = config['config']
ValueError: Unknown metric function: MyMetric

You need to specify the module so that TFX knows where to find your MyMetric class. One way of doing this is to specify it as part of the metric specs:
from tensorflow_model_analysis import config
metric_config = [config.MetricConfig(class_name='MyMetric', module='mymodule.mymetric')]
metrics_specs = [config.MetricsSpec(metrics=metric_config)]
You will also need to create a module called mymodule and put your MyMetric class in mymetric.py for this to work. Also make sure that the module is accessible from where you are executing the code (which should be the case if you have added it to your PYTHONPATH).
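For completeness, here is a minimal sketch of how those specs plug into the EvalConfig from the question; the mymodule.mymetric path is just a placeholder for wherever your metric actually lives:
import tensorflow_model_analysis as tfma
from tensorflow_model_analysis import config

# Tell TFMA which module defines MyMetric so the workers can import it.
metric_config = [config.MetricConfig(class_name='MyMetric', module='mymodule.mymetric')]
metrics_specs = [config.MetricsSpec(metrics=metric_config)]

eval_config = tfma.EvalConfig(
    model_specs=[tfma.ModelSpec(label_key='label_xf')],
    metrics_specs=metrics_specs,
    slicing_specs=[tfma.SlicingSpec()]
)
Because the module name is stored alongside the class name, TFMA can import MyMetric itself and pass it as a custom object during deserialization, which is the lookup that fails in the traceback above.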

Related

Tensorflow how to change hub.Module() to local folder

how can I change:
BERT_MODEL = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()
So that I can load a local BERT model without the hub.Module() call, since it doesn't work with a local path.
I downloaded a different TF1 pre-trained model from a different website, unzipped it and stored it in /test/module/.
If I change the above to BERT_MODEL = "/test/module", how would I need to change the rest? I now get string errors because tokenization_info = bert_module(signature="tokenization_info", as_dict=True) doesn't work.
Please help, I am new to TF. Note that I need to use TF1, not TF2.
Note: with the suggestion below I get:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-11-a98e44536f87> in <module>()
9 return vocab_file, do_lower_case
10
---> 11 print(get_bert_tokenizer_info("/tmp/local_copy"))
12 # Will print: (b'/tmp/local_copy/assets/vocab.txt', False)
4 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_hub/registry.py in __call__(self, *args, **kwargs)
43 raise RuntimeError(
44 "Missing implementation that supports: %s(*%r, **%r)" % (
---> 45 self._name, args, kwargs))
46
47
RuntimeError: Missing implementation that supports: loader(*('/tmp/local_copy',), **{})
hub.Module works with local uncompressed paths, so you can change BERT_MODEL to another path and reuse the same code.
Example:
Create local copy of the module:
mkdir /tmp/local_copy
wget "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1?tf-hub-format=compressed" -O "module.tar.gz"
tar -C /tmp/local_copy -xzvf module.tar.gz
Use the local copy of the module:
import tensorflow as tf
import tensorflow_hub as hub
def get_bert_tokenizer_info(bert_module):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_module)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])
    return vocab_file, do_lower_case

print(get_bert_tokenizer_info("/tmp/local_copy"))
# Will print: (b'/tmp/local_copy/assets/vocab.txt', False)
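If you then want the same tokenizer as in the original snippet, the two returned values can be fed straight into FullTokenizer; this is just a sketch and assumes the bert package from the question is importable:
import bert

vocab_file, do_lower_case = get_bert_tokenizer_info("/tmp/local_copy")
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=vocab_file, do_lower_case=do_lower_case)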

Using Sagemaker predictor in a Spark UDF function

I am trying to run inference on a Tensorflow model deployed on SageMaker from a Python Spark job.
I am running a (Databricks) notebook which has the following cell:
def call_predict():
    batch_size = 1
    data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[batch_size, len(data[0])], dtype=tf.float32)
    prediction = predictor.predict(tensor_proto)
    print("Process time: {}".format((time.clock() - start)))
    return prediction
If I just call call_predict() it works fine:
call_predict()
and I get the output:
Process time: 65.261396
Out[61]: {'model_spec': {'name': u'generic_model',
'signature_name': u'serving_default',
'version': {'value': 1578909324L}},
'outputs': {u'ages': {'dtype': 1,
'float_val': [5.680944442749023],
'tensor_shape': {'dim': [{'size': 1L}]}}}}
but when I try to call from a Spark context (in a UDF) I get a serialization error.
The code I'm trying to run is:
dataRange = range(1, 10001)
rangeRDD = sc.parallelize(dataRange, 8)
new_data = rangeRDD.map(lambda x : call_predict())
new_data.count()
and the error I get is:
---------------------------------------------------------------------------
PicklingError Traceback (most recent call last)
<command-2282434> in <module>()
2 rangeRDD = sc.parallelize(dataRange, 8)
3 new_data = rangeRDD.map(lambda x : call_predict())
----> 4 new_data.count()
5
/databricks/spark/python/pyspark/rdd.pyc in count(self)
1094 3
1095 """
-> 1096 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1097
1098 def stats(self):
/databricks/spark/python/pyspark/rdd.pyc in sum(self)
1085 6.0
1086 """
-> 1087 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
1088
1089 def count(self):
/databricks/spark/python/pyspark/rdd.pyc in fold(self, zeroValue, op)
956 # zeroValue provided to each partition is unique from the one provided
957 # to the final reduce call
--> 958 vals = self.mapPartitions(func).collect()
959 return reduce(op, vals, zeroValue)
960
/databricks/spark/python/pyspark/rdd.pyc in collect(self)
829 # Default path used in OSS Spark / for non-credential passthrough clusters:
830 with SCCallSiteSync(self.context) as css:
--> 831 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
832 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
833
/databricks/spark/python/pyspark/rdd.pyc in _jrdd(self)
2573
2574 wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
-> 2575 self._jrdd_deserializer, profiler)
2576 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
2577 self.preservesPartitioning, self.is_barrier)
/databricks/spark/python/pyspark/rdd.pyc in _wrap_function(sc, func, deserializer, serializer, profiler)
2475 assert serializer, "serializer should not be empty"
2476 command = (func, profiler, deserializer, serializer)
-> 2477 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
2478 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
2479 sc.pythonVer, broadcast_vars, sc._javaAccumulator)
/databricks/spark/python/pyspark/rdd.pyc in _prepare_for_python_RDD(sc, command)
2461 # the serialized command will be compressed by broadcast
2462 ser = CloudPickleSerializer()
-> 2463 pickled_command = ser.dumps(command)
2464 if len(pickled_command) > sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc): # Default 1M
2465 # The broadcast will have same life cycle as created PythonRDD
/databricks/spark/python/pyspark/serializers.pyc in dumps(self, obj)
709 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
710 cloudpickle.print_exec(sys.stderr)
--> 711 raise pickle.PicklingError(msg)
712
713
PicklingError: Could not serialize object: TypeError: can't pickle _ssl._SSLSocket objects
I am not sure what this serialization error means. Is it complaining about failing to pickle the Predictor?
My notebook has a cell which was called prior to the above cells with the following imports:
import sagemaker
import boto3
from sagemaker.tensorflow.model import TensorFlowPredictor
import tensorflow as tf
import numpy as np
import time
The Predictor was created with the following code:
sagemaker_client = boto3.client('sagemaker', aws_access_key_id=ACCESS_KEY,
                                aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
sagemaker_runtime_client = boto3.client('sagemaker-runtime', aws_access_key_id=ACCESS_KEY,
                                        aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session, sagemaker_client=sagemaker_client, sagemaker_runtime_client=sagemaker_runtime_client)
predictor = TensorFlowPredictor('endpoint-poc', sagemaker_session)
The udf will be executed by multiple Spark tasks in parallel. Those tasks run in completely isolated Python processes and are scheduled on physically different machines. Hence any data the function references must be available on the same node. This is the case for everything created within the udf itself.
Whenever you reference an object defined outside of the udf, that data structure needs to be serialised (pickled) and sent to each executor. Some object state, like an open socket connection, cannot be pickled.
You need to make sure that connections are opened lazily on each executor, i.e. only on the first function call on that executor. Connection pooling is covered in the docs, though only in the Spark Streaming guide (it applies to normal batch jobs as well).
Normally one would use the singleton pattern for this, but in Python the Borg pattern is the usual alternative.
class Env:
    _shared_state = {
        "sagemaker_client": None,
        "sagemaker_runtime_client": None,
        "boto_session": None,
        "sagemaker_session": None,
        "predictor": None,
    }

    def __init__(self):
        self.__dict__ = self._shared_state
        if not self.predictor:
            self.sagemaker_client = boto3.client('sagemaker', aws_access_key_id=ACCESS_KEY,
                                                 aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
            self.sagemaker_runtime_client = boto3.client('sagemaker-runtime', aws_access_key_id=ACCESS_KEY,
                                                         aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
            self.boto_session = boto3.Session(region_name='us-east-1')
            self.sagemaker_session = sagemaker.Session(self.boto_session, sagemaker_client=self.sagemaker_client,
                                                       sagemaker_runtime_client=self.sagemaker_runtime_client)
            self.predictor = TensorFlowPredictor('endpoint-poc', self.sagemaker_session)
#....
def call_predict():
    env = Env()
    batch_size = 1
    data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[batch_size, len(data[0])], dtype=tf.float32)
    prediction = env.predictor.predict(tensor_proto)
    print("Process time: {}".format((time.clock() - start)))
    return prediction
new_data = rangeRDD.map(lambda x : call_predict())
The Env class is defined on the master node, and its _shared_state starts out with empty entries. When an Env object is instantiated for the first time, it shares this state with all further instances of Env created by subsequent calls to the udf. On each separate, parallel-running process this happens exactly once. This way the sessions are shared and do not need to be pickled.
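As an aside, here is a tiny, self-contained sketch of the Borg pattern used above (the names are illustrative) that shows why every instance ends up sharing the same lazily created resource:
class Borg:
    _shared_state = {"resource": None}

    def __init__(self):
        # Every instance points its attribute dict at the same shared dict.
        self.__dict__ = self._shared_state
        if self.resource is None:
            # Expensive, unpicklable setup happens only once per process.
            self.resource = object()

a, b = Borg(), Borg()
assert a.resource is b.resource  # both instances see the same resource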

When I execute nlp.begin_training() I'm getting the following error

"I'm trying to train the ner model using spacy. It works fine for CPU. But when I try executing it using GPU I'm getting the following error. Spacy version 2.1.4, CUDA version 10.1"
"I tried re-installing thinc but still I'm getting the error"
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import json
spacy.require_gpu()
nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
ner = nlp.create_pipe("ner")
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
optimizer = nlp.begin_training()
"I'm getting the following error"
"CUDARuntimeError
Traceback (most recent call last)
in
----> 1 optimizer = nlp.begin_training()
G:\Anaconda3\lib\site-packages\spacy\language.py in begin_training(self, get_gold_tuples, sgd, component_cfg, **cfg)
547 if self.vocab.vectors.data.shape[1] >= 1:
548 self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
--> 549 link_vectors_to_models(self.vocab)
550 if self.vocab.vectors.data.shape[1]:
551 cfg["pretrained_vectors"] = self.vocab.vectors.name
G:\Anaconda3\lib\site-packages\spacy_ml.py in link_vectors_to_models(vocab)
297 else:
298 word.rank = 0
--> 299 data = ops.asarray(vectors.data)
300 # Set an entry here, so that vectors are accessed by StaticVectors
301 # (unideal, I know)
ops.pyx in thinc.neural.ops.CupyOps.asarray()
G:\Anaconda3\lib\site-packages\cupy\creation\from_data.py in array(obj, dtype, copy, order, subok, ndmin)
39
40 """
---> 41 return core.array(obj, dtype, copy, order, subok, ndmin)
42
43
cupy\core\core.pyx in cupy.core.core.array()
cupy\core\core.pyx in cupy.core.core.array()
cupy\core\core.pyx in cupy.core.core.ndarray.__init__()
cupy\cuda\memory.pyx in cupy.cuda.memory.alloc()
cupy\cuda\memory.pyx in cupy.cuda.memory.MemoryPool.malloc()
cupy\cuda\memory.pyx in cupy.cuda.memory.MemoryPool.malloc()
cupy\cuda\device.pyx in cupy.cuda.device.get_device_id()
cupy\cuda\runtime.pyx in cupy.cuda.runtime.getDevice()
cupy\cuda\runtime.pyx in cupy.cuda.runtime.check_status()
CUDARuntimeError: cudaErrorUnknown: unknown error

Tensorflow Custom Layer in High level API: throws object has no attribute '_expects_mask_arg' error

I am trying to reconstruct an image based on three inputs from previous layers: normal (None, 128, 128, 3), albedo (None, 128, 128, 3) and lighting (27). The code throws an "object has no attribute '_expects_mask_arg'" error. I have presented my code below, in which I implement a custom layer using the TensorFlow v2 beta high-level API.
import math

class Reconstruction_Layer(tf.keras.layers.Layer):
    def __init__(self, input_shape):
        super(Reconstruction_Layer, self).__init__()
        #self.num_outputs = num_outputs
        #self.pixel=np.zeros((9),dtype=int)
        self.sphar=np.zeros((9),dtype=float)
        self.y=np.zeros((9),dtype=float)
        self.reconstructed_img=np.zeros((128,128,3),dtype=float)
        #self.y=tf.zeros([128,128,9])
        self.normal_light=np.zeros((128,128,9),dtype=float)
        self.y_temp=np.zeros((9),dtype=float)
        w_init = tf.random_normal_initializer()
        self.r_img = tf.Variable(initial_value=w_init(shape=input_shape),dtype='float32',trainable=True)

    def build(self, input_shape):
        super(MyLayer, self).build(input_shape)

    def call(self, input_layer):
        self.normal, self.albedo, self.light = input_layer
        for i in range(128):
            for j in range(128):
                #self.y=spherical_harmonic_calc(self.normal(i,j))
                self.pixel=self.normal[i,j,:]
                #self.normal_light(i,j)= self.y
                self.sphar[0]=(1/((4*math.pi)**0.5))
                self.sphar[1]=((3/(4*math.pi))**0.5)*self.pixel[2]
                self.sphar[3]=(((3/(4*math.pi))**0.5)*self.pixel[1])
                self.sphar[4]=((1/2)*((5/(4*math.pi))**0.5)*(3*(self.pixel[2]**2) - 1))
                self.sphar[5]=(3*((5/(12*math.pi))**0.5)*self.pixel[2]*self.pixel[0])
                self.sphar[6]=(3*((5/(12*math.pi))**0.5)*self.pixel[2]*self.pixel[1])
                self.sphar[7]=((3/2)*((5/(12*math.pi))**0.5)*((self.pixel[0]**2)-(self.pixel[1]**2)))
                self.sphar[8]=(3*((5/(12*math.pi))**0.5)*self.pixel[0]*self.pixel[1])
                self.normal_light[i,j,:]=self.sphar
        for j in range(128):
            for k in range(128):
                for i in range(3):
                    self.reconstructed_img[j,k,i]=self.albedo[j,k,i]* tf.tensordot(self.normal_light[j,k],self.light[i*9:(i+1)*9 ],axes=1)
        self.reconstructed_img=tf.convert_to_tensor(self.reconstructed_img)
        self.r_img=self.reconstructed_img
        return self.r_img
"""
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-74-06759ef5b0b5> in <module>
1 import numpy as np
----> 2 x=Reconstruction_Layer((128,128,3))(d)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
580 # explicitly take priority.
581 input_masks = self._collect_input_masks(inputs, args, kwargs)
--> 582 if (self._expects_mask_arg and input_masks is not None and
583 not self._call_arg_was_passed('mask', args, kwargs)):
584 kwargs['mask'] = input_masks
AttributeError: 'Reconstruction_Layer' object has no attribute '_expects_mask_arg'
"""
I just had the same error, and in my case it was due to forgetting to call .__init__() after super(). You did call it, but this makes me think that the error is due to a wrong initialization of the base layer you are deriving from.
I notice that in the docs example it is not necessary to call build() on the base layer, and it works for me if you remove that method (as it does nothing related to your layer). Note also that your build() refers to MyLayer while the class is named Reconstruction_Layer, so that line would raise a NameError anyway.
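For reference, a minimal custom-layer skeleton that initializes the base class correctly and does not need a super().build() call could look roughly like this (the shapes and names are illustrative, not your reconstruction logic):
import tensorflow as tf

class MinimalLayer(tf.keras.layers.Layer):
    def __init__(self, units=32, **kwargs):
        # Forwarding **kwargs keeps the base-layer attributes (masking, dtype, ...) intact.
        super(MinimalLayer, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        # Create weights once the input shape is known; no super().build() call required.
        self.w = self.add_weight(shape=(int(input_shape[-1]), self.units),
                                 initializer='random_normal', trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w)

x = MinimalLayer(8)(tf.keras.Input(shape=(16,)))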

In keras, how can you clone a model with custom objects?

I have a model with a custom activation. As a result,
model2 = keras.models.clone_model(model)
gives an error. I'm able to load saved models using custom_objects keyword, but I see no such option on clone_model. Is there a way around it besides remaking the model and transferring weights?
EDIT:
Here's example code (toy problem):
import tensorflow.keras as keras
import tensorflow.keras.backend as K
def myTanh(x):
    return K.tanh(x)
inp = keras.Input(shape=(10,10,1))
flat = keras.layers.Flatten()(inp)
out = keras.layers.Dense(20, activation=myTanh)(flat)
model = keras.Model(inp,out)
model.compile(optimizer=keras.optimizers.Adam(lr=0.001),loss='categorical_crossentropy')
model2 = keras.models.clone_model(model)
And the error dump:
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/models.py in clone_model(model, input_tensors)
269 return _clone_sequential_model(model, input_tensors=input_tensors)
270 else:
--> 271 return _clone_functional_model(model, input_tensors=input_tensors)
272
273
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/models.py in _clone_functional_model(model, input_tensors)
129 if layer not in layer_map:
130 # Clone layer.
--> 131 new_layer = layer.__class__.from_config(layer.get_config())
132 layer_map[layer] = new_layer
133 layer = new_layer
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py in from_config(cls, config)
400 A layer instance.
401 """
--> 402 return cls(**config)
403
404 def compute_output_shape(self, input_shape):
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/layers/core.py in __init__(self, units, activation, use_bias, kernel_initializer, bias_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, kernel_constraint, bias_constraint, **kwargs)
920 activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
921 self.units = int(units)
--> 922 self.activation = activations.get(activation)
923 self.use_bias = use_bias
924 self.kernel_initializer = initializers.get(kernel_initializer)
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/activations.py in get(identifier)
209 if isinstance(identifier, six.string_types):
210 identifier = str(identifier)
--> 211 return deserialize(identifier)
212 elif callable(identifier):
213 return identifier
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/activations.py in deserialize(name, custom_objects)
200 module_objects=globals(),
201 custom_objects=custom_objects,
--> 202 printable_module_name='activation function')
203
204
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/utils/generic_utils.py in deserialize_keras_object(identifier, module_objects, custom_objects, printable_module_name)
210 if fn is None:
211 raise ValueError('Unknown ' + printable_module_name + ':' +
--> 212 function_name)
213 return fn
214 else:
ValueError: Unknown activation function:myTanh
I solved the issue by calling
keras.utils.get_custom_objects().update(custom_objects)
right after the definition of the additional objects that Keras must be aware of to properly clone the model.
def lrelu(x, alpha=0.2):
    return tf.nn.relu(x) * (1 - alpha) + x * alpha

custom_objects = {
    'lrelu': lrelu,
}
keras.utils.get_custom_objects().update(custom_objects)
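Applied to the toy example from the question, the same idea would look something like this (a sketch, reusing the model and myTanh defined above):
# Register the custom activation under the name Keras stores in the layer config.
keras.utils.get_custom_objects().update({'myTanh': myTanh})

# Cloning now succeeds because deserialization can resolve 'myTanh'.
model2 = keras.models.clone_model(model)
model2.set_weights(model.get_weights())  # clone_model copies the architecture, not the weights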
This is an open bug in Keras.
The suggested workaround is to use a Lambda layer instead of an Activation layer.
x = keras.layers.Lambda(my_custom_activation_function)(x)
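For the question's toy model, the swap would look roughly like this; this is only a sketch, and whether clone_model then works without registering the function may still depend on the Keras version:
inp = keras.Input(shape=(10, 10, 1))
flat = keras.layers.Flatten()(inp)
dense = keras.layers.Dense(20)(flat)        # no custom activation on the Dense layer
out = keras.layers.Lambda(myTanh)(dense)    # apply the custom function via a Lambda layer
model = keras.Model(inp, out)

model2 = keras.models.clone_model(model)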