Using SageMaker predictor in a Spark UDF function - tensorflow

I am trying to run inference on a Tensorflow model deployed on SageMaker from a Python Spark job.
I am running a (Databricks) notebook which has the following cell:
def call_predict():
    batch_size = 1
    data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[batch_size, len(data[0])], dtype=tf.float32)
    prediction = predictor.predict(tensor_proto)
    print("Process time: {}".format((time.clock() - start)))
    return prediction
If I just call call_predict() it works fine:
call_predict()
and I get the output:
Process time: 65.261396
Out[61]: {'model_spec': {'name': u'generic_model',
'signature_name': u'serving_default',
'version': {'value': 1578909324L}},
'outputs': {u'ages': {'dtype': 1,
'float_val': [5.680944442749023],
'tensor_shape': {'dim': [{'size': 1L}]}}}}
but when I try to call from a Spark context (in a UDF) I get a serialization error.
The code I'm trying to run is:
dataRange = range(1, 10001)
rangeRDD = sc.parallelize(dataRange, 8)
new_data = rangeRDD.map(lambda x : call_predict())
new_data.count()
and the error I get is:
---------------------------------------------------------------------------
PicklingError Traceback (most recent call last)
<command-2282434> in <module>()
2 rangeRDD = sc.parallelize(dataRange, 8)
3 new_data = rangeRDD.map(lambda x : call_predict())
----> 4 new_data.count()
5
/databricks/spark/python/pyspark/rdd.pyc in count(self)
1094 3
1095 """
-> 1096 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1097
1098 def stats(self):
/databricks/spark/python/pyspark/rdd.pyc in sum(self)
1085 6.0
1086 """
-> 1087 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
1088
1089 def count(self):
/databricks/spark/python/pyspark/rdd.pyc in fold(self, zeroValue, op)
956 # zeroValue provided to each partition is unique from the one provided
957 # to the final reduce call
--> 958 vals = self.mapPartitions(func).collect()
959 return reduce(op, vals, zeroValue)
960
/databricks/spark/python/pyspark/rdd.pyc in collect(self)
829 # Default path used in OSS Spark / for non-credential passthrough clusters:
830 with SCCallSiteSync(self.context) as css:
--> 831 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
832 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
833
/databricks/spark/python/pyspark/rdd.pyc in _jrdd(self)
2573
2574 wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
-> 2575 self._jrdd_deserializer, profiler)
2576 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
2577 self.preservesPartitioning, self.is_barrier)
/databricks/spark/python/pyspark/rdd.pyc in _wrap_function(sc, func, deserializer, serializer, profiler)
2475 assert serializer, "serializer should not be empty"
2476 command = (func, profiler, deserializer, serializer)
-> 2477 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
2478 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
2479 sc.pythonVer, broadcast_vars, sc._javaAccumulator)
/databricks/spark/python/pyspark/rdd.pyc in _prepare_for_python_RDD(sc, command)
2461 # the serialized command will be compressed by broadcast
2462 ser = CloudPickleSerializer()
-> 2463 pickled_command = ser.dumps(command)
2464 if len(pickled_command) > sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc): # Default 1M
2465 # The broadcast will have same life cycle as created PythonRDD
/databricks/spark/python/pyspark/serializers.pyc in dumps(self, obj)
709 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
710 cloudpickle.print_exec(sys.stderr)
--> 711 raise pickle.PicklingError(msg)
712
713
PicklingError: Could not serialize object: TypeError: can't pickle _ssl._SSLSocket objects
I'm not sure what this serialization error means - is it complaining about failing to pickle the Predictor?
My notebook has a cell which was called prior to the above cells with the following imports:
import sagemaker
import boto3
from sagemaker.tensorflow.model import TensorFlowPredictor
import tensorflow as tf
import numpy as np
import time
The Predictor was created with the following code:
sagemaker_client = boto3.client('sagemaker', aws_access_key_id=ACCESS_KEY,
                                aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
sagemaker_runtime_client = boto3.client('sagemaker-runtime', aws_access_key_id=ACCESS_KEY,
                                        aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session, sagemaker_client=sagemaker_client, sagemaker_runtime_client=sagemaker_runtime_client)
predictor = TensorFlowPredictor('endpoint-poc', sagemaker_session)

The UDF will be executed by multiple Spark tasks in parallel. Those tasks run in completely isolated Python processes, and they are scheduled onto physically different machines. Hence any data those functions reference must be available on the same node. This is the case for everything created within the UDF.
Whenever the function references an object defined outside of the UDF, that data structure needs to be serialised (pickled) and shipped to each executor. Some object state, like open socket connections, cannot be pickled.
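You can see this failure in isolation by trying to pickle such a client directly (a sketch only; the exact exception message depends on your boto3 and pickle versions):
import pickle
import boto3

client = boto3.client('sagemaker-runtime', region_name='us-east-1')
try:
    pickle.dumps(client)  # boto3 clients hold live HTTP/SSL machinery
except Exception as e:    # typically a PicklingError/TypeError about sockets or locks
    print(type(e).__name__, e)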
You need to make sure that connections are opened lazily on each executor, and only on the first function call on that executor. The connection pooling topic is covered in the docs, but only in the Spark Streaming guide (although it applies to normal batch jobs as well).
Normally one would use the Singleton pattern for this, but in Python the Borg pattern is commonly used instead.
class Env:
    _shared_state = {
        "sagemaker_client": None,
        "sagemaker_runtime_client": None,
        "boto_session": None,
        "sagemaker_session": None,
        "predictor": None
    }

    def __init__(self):
        self.__dict__ = self._shared_state
        if not self.predictor:
            self.sagemaker_client = boto3.client('sagemaker', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
            self.sagemaker_runtime_client = boto3.client('sagemaker-runtime', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
            self.boto_session = boto3.Session(region_name='us-east-1')
            self.sagemaker_session = sagemaker.Session(self.boto_session, sagemaker_client=self.sagemaker_client, sagemaker_runtime_client=self.sagemaker_runtime_client)
            self.predictor = TensorFlowPredictor('endpoint-poc', self.sagemaker_session)
    # ....

def call_predict():
    env = Env()
    batch_size = 1
    data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[batch_size, len(data[0])], dtype=tf.float32)
    prediction = env.predictor.predict(tensor_proto)
    print("Process time: {}".format((time.clock() - start)))
    return prediction

new_data = rangeRDD.map(lambda x: call_predict())
The Env class is defined on the driver node. Its _shared_state starts out with empty entries. When an Env object is instantiated for the first time, it shares its state with all further instances of Env on any subsequent call of the UDF. On each separate, parallel-running Python process this happens exactly once. This way the sessions are shared and do not need to be pickled.
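An equivalent way to express the same idea, shown here only as a sketch, is to build the predictor once per partition with mapPartitions instead of relying on shared instance state (this creates one predictor per task rather than per worker process, which is usually good enough):
def predict_partition(rows):
    # runs once per partition inside the executor's Python process,
    # so the predictor is created there and never pickled
    # (credentials omitted for brevity)
    boto_session = boto3.Session(region_name='us-east-1')
    session = sagemaker.Session(
        boto_session,
        sagemaker_client=boto3.client('sagemaker', region_name='us-east-1'),
        sagemaker_runtime_client=boto3.client('sagemaker-runtime', region_name='us-east-1'))
    predictor = TensorFlowPredictor('endpoint-poc', session)
    for _ in rows:
        data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
        tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[1, len(data[0])], dtype=tf.float32)
        yield predictor.predict(tensor_proto)

new_data = rangeRDD.mapPartitions(predict_partition)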

Related

Incremental PCA on big dataset, with large component demand

I am trying to find the main 200 components of a dataset of 846 images (2048x2048x3 RGB) with sklearn.decomposition.IncrementalPCA.
The data are read with cv2 and reshaped into a 2D np array (shape [846, 2048x2048x3], float16).
To keep the memory cost down, I use partial_fit() and split the original data into smaller chunks (batches) in both the partial_fit() and transform() steps,
just like the solution to this problem:
Python PCA on Matrix too large to fit into memory
My code works well for relatively small computations, like computing 20 components for 200 images of the dataset; it produces correct results.
However, the task requires me to compute 200 components, which means my batch size must be at least 200 (according to sklearn's documentation and the messages printed in the terminal when running the code):
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html
With such a big chunk size, I can finish setting up the IPCA model, but I always hit a MemoryError when calling partial_fit().
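For a rough sense of scale, a back-of-the-envelope sketch (it assumes scikit-learn upcasts the float16 chunk to float64 during input validation, and it ignores the extra SVD workspace that partial_fit allocates on top of this):
n_features = 2048 * 2048 * 3   # 12,582,912 values per image
n_samples, chunk_size, n_components = 846, 220, 200

dataset_f16_gb = n_samples * n_features * 2 / 1e9          # ~21 GB kept in RAM the whole time
chunk_f64_gb = chunk_size * n_features * 8 / 1e9           # ~22 GB per upcast chunk
components_f64_gb = n_components * n_features * 8 / 1e9    # ~20 GB for pca.components_

print(dataset_f16_gb, chunk_f64_gb, components_f64_gb)
# partial_fit also stacks the chunk together with the scaled components and
# runs a dense SVD on the result, so peak usage is a multiple of these numbers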
What's more, there is another problem:
I need to use inverse_transform later, and I am not sure whether I can use chunk-style computation in that step as well (in the code below I did not).
What can I do to avoid this MemoryError? Or should I replace IncrementalPCA with some other method? (Any alternative should provide something like inverse_transform().)
The total memory I have access to is 131661572 kB (~127 GB).
My code:
from sklearn.decomposition import PCA, IncrementalPCA
import numpy as np
import cv2
import os

folder_path = "./output_img"
input=[]
for i in range(1, 847):
    if i%10 == 0: print("loading",i,"th image")
    # if i == 60: continue #special case, should be skipped
    image_path = folder_path+f"/{i}neutral.jpg"
    img = cv2.imread(image_path)
    input.append(img.reshape(-1))
print("Loaded all",i,"images")

# change into numpy matrix
all_image = np.stack(input,axis=0)
# trans to 0-1 format float64
all_image = (all_image.astype(np.float16))
### shape: #_of_imag x image_pixel_num (50331648 for img_normals case)
# print(all_image)
# print(all_image.shape)

# PCA, keeps 200 features
COM_NUM=200
pca=IncrementalPCA(n_components = COM_NUM)
print("finished IPCA model set")
saving_path = "./principle847"

element_num = all_image.shape[0] # how many elements(rows) we have in the dataset
chunk_size = 220 # how many elements we feed to IPCA at a time
for i in range(0, element_num//chunk_size):
    pca.partial_fit(all_image[i*chunk_size : (i+1)*chunk_size])
    print("finished PCA fit:",i*chunk_size,"to",(i+1)*chunk_size)
pca.partial_fit(all_image[(i+1)*chunk_size : element_num]) #tail
print("finished PCA fit:",(i+1)*chunk_size,"to",element_num)

for i in range(0, element_num//chunk_size):
    if i==0:
        result = pca.transform(all_image[i*chunk_size : (i+1)*chunk_size])
    else:
        tmp = pca.transform(all_image[i*chunk_size : (i+1)*chunk_size])
        result = np.concatenate((result, tmp), axis=0)
    print("finished PCA transform:",i*chunk_size,"to",(i+1)*chunk_size)
tmp = pca.transform(all_image[(i+1)*chunk_size : element_num]) #tail
result = np.concatenate((result, tmp), axis=0)
print("finished PCA transform:",(i+1)*chunk_size,"to",element_num)

result = pca.inverse_transform(result)
print("PCA mean:",pca.mean_)
mean_img = pca.mean_
mean_img = mean_img.reshape(2048,2048,3)
mean_img = mean_img.astype(np.uint8)
cv2.imwrite(os.path.join(saving_path,("mean.png")),mean_img)

result=result.reshape(-1,2048,2048,3)
# result shape: #_of_componets * 2048 * 2048 * 3
dst = result
# dst=result/np.linalg.norm(result,axis=(3),keepdims=True)
for j in range(0,COM_NUM):
    reconImage = (dst)[j]
    # reconImage = reconImage.reshape(4096,4096,3)
    reconImage = np.clip(reconImage,0,255)
    reconImage = reconImage.astype(np.uint8)
    cv2.imwrite(os.path.join(saving_path,("p"+str(j)+".png")),reconImage)
    print("Saved",j+1,"principle imgs")
The error goes like:
File "model_generate.py", line 36, in <module>
pca.partial_fit(all_image[i*chunk_size : (i+1)*chunk_size])
File "/root/anaconda3/envs/PCA/lib/python3.8/site-packages/sklearn/decomposition/_incremental_pca.py", line 299, in partial_fit
U, V = svd_flip(U, V, u_based_decision=False)
File "/root/anaconda3/envs/PCA/lib/python3.8/site-packages/sklearn/utils/extmath.py", line 538, in svd_flip
max_abs_rows = np.argmax(np.abs(v), axis=1)
File "/root/anaconda3/envs/PCA/lib/python3.8/site-packages/numpy/core/fromnumeric.py", line 1103, in argmax
return _wrapfunc(a, 'argmax', axis=axis, out=out)
File "/root/anaconda3/envs/PCA/lib/python3.8/site-packages/numpy/core/fromnumeric.py", line 56, in _wrapfunc
return getattr(obj, method)(*args, **kwds)
MemoryError

Using BatchedPyEnvironment in tf_agents

I am trying to create a batched environment version of an SAC agent example from the TensorFlow Agents library; the original code can be found here. I am also using a custom environment.
I am pursuing a batched environment setup in order to better leverage GPU resources and speed up training. My understanding is that by passing batches of trajectories to the GPU, there will be less overhead incurred when passing data from the host (CPU) to the device (GPU).
My custom environment is called SacEnv, and I attempt to create a batched environment like so:
py_envs = [SacEnv() for _ in range(0, batch_size)]
batched_env = batched_py_environment.BatchedPyEnvironment(envs=py_envs)
tf_env = tf_py_environment.TFPyEnvironment(batched_env)
My hope is that this will create a batched environment consisting of a 'batch' of non-batched environments. However, I am receiving the following error when running the code:
ValueError: Cannot assign value to variable ' Accumulator:0': Shape mismatch.The variable shape (1,), and the assigned value shape (32,) are incompatible.
with the stack trace:
Traceback (most recent call last):
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 370, in <module>
app.run(main)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 366, in main
train_eval(FLAGS.root_dir)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1605, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1582, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 274, in train_eval
results = metric_utils.eager_compute(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1605, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1582, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/eval/metric_utils.py", line 163, in eager_compute
common.function(driver.run)(time_step, policy_state)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 211, in run
return self._run_fn(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/utils/common.py", line 188, in with_check_resource_vars
return fn(*fn_args, **fn_kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 238, in _run
tf.while_loop(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 154, in loop_body
observer_ops = [observer(traj) for observer in self._observers]
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 154, in <listcomp>
observer_ops = [observer(traj) for observer in self._observers]
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metric.py", line 93, in __call__
return self._update_state(*args, **kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metric.py", line 81, in _update_state
return self.call(*arg, **kwargs)
ValueError: in user code:
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metrics.py", line 176, in call *
self._return_accumulator.assign(
ValueError: Cannot assign value to variable ' Accumulator:0': Shape mismatch.The variable shape (1,), and the assigned value shape (32,) are incompatible.
In call to configurable 'eager_compute' (<function eager_compute at 0x7fa4d6e5e040>)
In call to configurable 'train_eval' (<function train_eval at 0x7fa4c8622dc0>)
I have dug through the tf_metric.py code to try to understand the error, but I have been unsuccessful. A similar issue was previously solved by adding the batch size (32) to the initializer of an AverageReturnMetric instance, and this error seems related.
The full code is:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python2, python3
r"""Train and Eval SAC.
All hyperparameters come from the SAC paper
https://arxiv.org/pdf/1812.05905.pdf
To run:
```bash
tensorboard --logdir $HOME/tmp/sac/gym/HalfCheetah-v2/ --port 2223 &
python tf_agents/agents/sac/examples/v2/train_eval.py \
--root_dir=$HOME/tmp/sac/gym/HalfCheetah-v2/ \
--alsologtostderr
\```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sac_env import SacEnv
import os
import time
from absl import app
from absl import flags
from absl import logging
import gin
from six.moves import range
import tensorflow as tf # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.agents.sac import tanh_normal_projection_network
from tf_agents.drivers import dynamic_step_driver
#from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment
from tf_agents.environments import batched_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
from tf_agents.train.utils import strategy_utils
flags.DEFINE_string('root_dir', os.getenv('TEST_UNDECLARED_OUTPUTS_DIR'),
'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_multi_string('gin_file', None, 'Path to the trainer config files.')
flags.DEFINE_multi_string('gin_param', None, 'Gin binding to pass through.')
FLAGS = flags.FLAGS
gpus = tf.config.list_physical_devices('GPU')
if gpus:
try:
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
logical_gpus = tf.config.experimental.list_logical_devices('GPU')
print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
print(e)
#gin.configurable
def train_eval(
root_dir,
env_name='SacEnv',
# The SAC paper reported:
# Hopper and Cartpole results up to 1000000 iters,
# Humanoid results up to 10000000 iters,
# Other mujoco tasks up to 3000000 iters.
num_iterations=3000000,
actor_fc_layers=(256, 256),
critic_obs_fc_layers=None,
critic_action_fc_layers=None,
critic_joint_fc_layers=(256, 256),
# Params for collect
# Follow https://github.com/haarnoja/sac/blob/master/examples/variants.py
# HalfCheetah and Ant take 10000 initial collection steps.
# Other mujoco tasks take 1000.
# Different choices roughly keep the initial episodes about the same.
#initial_collect_steps=10000,
initial_collect_steps=2000,
collect_steps_per_iteration=1,
replay_buffer_capacity=31250, # 1000000 / 32
# Params for target update
target_update_tau=0.005,
target_update_period=1,
# Params for train
train_steps_per_iteration=1,
#batch_size=256,
batch_size=32,
actor_learning_rate=3e-4,
critic_learning_rate=3e-4,
alpha_learning_rate=3e-4,
td_errors_loss_fn=tf.math.squared_difference,
gamma=0.99,
reward_scale_factor=0.1,
gradient_clipping=None,
use_tf_functions=True,
# Params for eval
num_eval_episodes=30,
eval_interval=10000,
# Params for summaries and logging
train_checkpoint_interval=50000,
policy_checkpoint_interval=50000,
rb_checkpoint_interval=50000,
log_interval=1000,
summary_interval=1000,
summaries_flush_secs=10,
debug_summaries=False,
summarize_grads_and_vars=False,
eval_metrics_callback=None):
"""A simple train and eval for SAC."""
root_dir = os.path.expanduser(root_dir)
train_dir = os.path.join(root_dir, 'train')
eval_dir = os.path.join(root_dir, 'eval')
train_summary_writer = tf.compat.v2.summary.create_file_writer(
train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()
eval_summary_writer = tf.compat.v2.summary.create_file_writer(
eval_dir, flush_millis=summaries_flush_secs * 1000)
eval_metrics = [
tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
]
global_step = tf.compat.v1.train.get_or_create_global_step()
with tf.compat.v2.summary.record_if(
lambda: tf.math.equal(global_step % summary_interval, 0)):
py_envs = [SacEnv() for _ in range(0, batch_size)]
batched_env = batched_py_environment.BatchedPyEnvironment(envs=py_envs)
tf_env = tf_py_environment.TFPyEnvironment(batched_env)
eval_py_envs = [SacEnv() for _ in range(0, batch_size)]
eval_batched_env = batched_py_environment.BatchedPyEnvironment(envs=eval_py_envs)
eval_tf_env = tf_py_environment.TFPyEnvironment(eval_batched_env)
time_step_spec = tf_env.time_step_spec()
observation_spec = time_step_spec.observation
action_spec = tf_env.action_spec()
strategy = strategy_utils.get_strategy(tpu=False, use_gpu=True)
with strategy.scope():
actor_net = actor_distribution_network.ActorDistributionNetwork(
observation_spec,
action_spec,
fc_layer_params=actor_fc_layers,
continuous_projection_net=tanh_normal_projection_network
.TanhNormalProjectionNetwork)
critic_net = critic_network.CriticNetwork(
(observation_spec, action_spec),
observation_fc_layer_params=critic_obs_fc_layers,
action_fc_layer_params=critic_action_fc_layers,
joint_fc_layer_params=critic_joint_fc_layers,
kernel_initializer='glorot_uniform',
last_kernel_initializer='glorot_uniform')
tf_agent = sac_agent.SacAgent(
time_step_spec,
action_spec,
actor_network=actor_net,
critic_network=critic_net,
actor_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=actor_learning_rate),
critic_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=critic_learning_rate),
alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=alpha_learning_rate),
target_update_tau=target_update_tau,
target_update_period=target_update_period,
td_errors_loss_fn=td_errors_loss_fn,
gamma=gamma,
reward_scale_factor=reward_scale_factor,
gradient_clipping=gradient_clipping,
debug_summaries=debug_summaries,
summarize_grads_and_vars=summarize_grads_and_vars,
train_step_counter=global_step)
tf_agent.initialize()
# Make the replay buffer.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=batch_size,
max_length=replay_buffer_capacity,
device="/device:GPU:0")
replay_observer = [replay_buffer.add_batch]
train_metrics = [
tf_metrics.NumberOfEpisodes(),
tf_metrics.EnvironmentSteps(),
tf_metrics.AverageReturnMetric(
buffer_size=num_eval_episodes, batch_size=tf_env.batch_size),
tf_metrics.AverageEpisodeLengthMetric(
buffer_size=num_eval_episodes, batch_size=tf_env.batch_size),
]
eval_policy = greedy_policy.GreedyPolicy(tf_agent.policy)
initial_collect_policy = random_tf_policy.RandomTFPolicy(
tf_env.time_step_spec(), tf_env.action_spec())
collect_policy = tf_agent.collect_policy
train_checkpointer = common.Checkpointer(
ckpt_dir=train_dir,
agent=tf_agent,
global_step=global_step,
metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
policy_checkpointer = common.Checkpointer(
ckpt_dir=os.path.join(train_dir, 'policy'),
policy=eval_policy,
global_step=global_step)
rb_checkpointer = common.Checkpointer(
ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
max_to_keep=1,
replay_buffer=replay_buffer)
train_checkpointer.initialize_or_restore()
rb_checkpointer.initialize_or_restore()
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
initial_collect_policy,
observers=replay_observer + train_metrics,
num_steps=initial_collect_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
collect_policy,
observers=replay_observer + train_metrics,
num_steps=collect_steps_per_iteration)
if use_tf_functions:
initial_collect_driver.run = common.function(initial_collect_driver.run)
collect_driver.run = common.function(collect_driver.run)
tf_agent.train = common.function(tf_agent.train)
if replay_buffer.num_frames() == 0:
# Collect initial replay data.
logging.info(
'Initializing replay buffer by collecting experience for %d steps '
'with a random policy.', initial_collect_steps)
initial_collect_driver.run()
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step.numpy())
metric_utils.log_metrics(eval_metrics)
time_step = None
policy_state = collect_policy.get_initial_state(tf_env.batch_size)
timed_at_step = global_step.numpy()
time_acc = 0
# Prepare replay buffer as dataset with invalid transitions filtered.
def _filter_invalid_transition(trajectories, unused_arg1):
return ~trajectories.is_boundary()[0]
dataset = replay_buffer.as_dataset(
sample_batch_size=batch_size,
num_steps=2).unbatch().filter(
_filter_invalid_transition).batch(batch_size).prefetch(5)
# Dataset generates trajectories with shape [Bx2x...]
iterator = iter(dataset)
def train_step():
experience, _ = next(iterator)
return tf_agent.train(experience)
if use_tf_functions:
train_step = common.function(train_step)
global_step_val = global_step.numpy()
while global_step_val < num_iterations:
start_time = time.time()
time_step, policy_state = collect_driver.run(
time_step=time_step,
policy_state=policy_state,
)
for _ in range(train_steps_per_iteration):
train_loss = train_step()
time_acc += time.time() - start_time
global_step_val = global_step.numpy()
if global_step_val % log_interval == 0:
logging.info('step = %d, loss = %f', global_step_val,
train_loss.loss)
steps_per_sec = (global_step_val - timed_at_step) / time_acc
logging.info('%.3f steps/sec', steps_per_sec)
tf.compat.v2.summary.scalar(
name='global_steps_per_sec', data=steps_per_sec, step=global_step)
timed_at_step = global_step_val
time_acc = 0
for train_metric in train_metrics:
train_metric.tf_summaries(
train_step=global_step, step_metrics=train_metrics[:2])
if global_step_val % eval_interval == 0:
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step_val)
metric_utils.log_metrics(eval_metrics)
if global_step_val % train_checkpoint_interval == 0:
train_checkpointer.save(global_step=global_step_val)
if global_step_val % policy_checkpoint_interval == 0:
policy_checkpointer.save(global_step=global_step_val)
if global_step_val % rb_checkpoint_interval == 0:
rb_checkpointer.save(global_step=global_step_val)
return train_loss
def main(_):
tf.compat.v1.enable_v2_behavior()
logging.set_verbosity(logging.INFO)
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
train_eval(FLAGS.root_dir)
if __name__ == '__main__':
flags.mark_flag_as_required('root_dir')
app.run(main)
What is the appropriate way to create a batched environment for a custom, non-batched environment? I can share my custom environment, but I don't believe the issue lies there as the code works fine when using batch sizes of 1.
Also, any tips on increasing GPU utilization in reinforcement learning scenarios would be greatly appreciated. I have examined examples of using the TensorBoard profiler to measure GPU utilization, but these seem to require callbacks and a fit function, which don't appear to be applicable in RL use cases.
It turns out I neglected to pass batch_size when initializing the AverageReturnMetric and AverageEpisodeLengthMetric instances.
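A minimal sketch of that fix, mirroring how the train_metrics above are already built with an explicit batch size (batch_size here is the same value used to build the list of SacEnv instances, 32 in this setup):
eval_metrics = [
    tf_metrics.AverageReturnMetric(
        buffer_size=num_eval_episodes, batch_size=batch_size),
    tf_metrics.AverageEpisodeLengthMetric(
        buffer_size=num_eval_episodes, batch_size=batch_size),
]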

Data wrangling using CPU workers and training xgboost using GPU workers with dask

I am trying to read 200 parquet files from HDFS and then train a model using 4 GPUs. I also have 48 vcores available on the machine. If I start the cluster with just the GPU workers, the reading part is very slow (it only uses the 4 CPU workers assigned to the GPU workers, and you can't really run more workers than the number of GPUs you have unless you run them in separate shells, which gets nasty because you are then on your own for memory management). I would like to read the files using CPU workers, wrangle the data with the CPU workers, and then train an xgboost model using GPU workers. I read the documentation here about how to start workers with different resources and assign them to different tasks. I have also seen this question, but I am a bit confused.
Here is the code I am trying to run to read the .parquet files:
import dask.dataframe as dd

df = dd \
    .read_parquet(
        "hdfs://address/to/the/*.parquet",
        storage_options = {
            "user":user,
            "kerb_ticket":kerb_ticket},
        engine='pyarrow') \
    .persist()
This will automatically use all the CPU and GPU workers, which is fine. After this I need to create my training data and labels. Let's say I have X_train, y_train, and params. Here I convert them to dask_cudf:
X_train = dask_cudf.from_dask_dataframe(X_train)
y_train = dask_cudf.from_dask_dataframe(y_train)
Here is the part where I need to use just the GPU workers:
Xy = dxgb.DaskDMatrix(client, X_train, y_train)
In order to follow the documentation, I should convert it to this:
Xy = client.submit(dxgb.DaskDMatrix, client, X_train, y_train, resources={'GPU': 1})
But then I'll get this error:
distributed.protocol.pickle - INFO - Failed to serialize (<Client: 'tcp://169.68.236.35:8786' processes=52 threads=52, memory=1.97 TiB>, <dask_cudf.DataFrame | 19200 tasks | 200 npartitions>, <dask_cudf.Series | 600 tasks | 200 npartitions>). Exception: cannot pickle 'socket' object
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/envs/dask/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
48 buffers.clear()
---> 49 result = pickle.dumps(x, **dump_kwargs)
50 if len(result) < 1000:
/envs/dask/lib/python3.8/socket.py in __getstate__(self)
271 def __getstate__(self):
--> 272 raise TypeError(f"cannot pickle {self.__class__.__name__!r} object")
273
TypeError: cannot pickle 'socket' object
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-12-0d6a943365a9> in <module>
1 # Xy = dxgb.DaskDMatrix(client, X_train, y_train)
2 # Xy = dxgb.DaskDeviceQuantileDMatrix(client, X_train, y_train)
----> 3 Xy = client.submit(dxgb.DaskDMatrix, client, X_train, y_train, resources={'GPU': 1})
4 # Xy_valid = dxgb.DaskDMatrix(client, X_valid, y_valid)
/envs/dask/lib/python3.8/site-packages/distributed/client.py in submit(self, func, key, workers, resources, retries, priority, fifo_timeout, allow_other_workers, actor, actors, pure, *args, **kwargs)
1629 dsk = {skey: (func,) + tuple(args)}
1630
-> 1631 futures = self._graph_to_futures(
1632 dsk,
1633 [skey],
/envs/dask/lib/python3.8/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, workers, allow_other_workers, priority, user_priority, resources, retries, fifo_timeout, actors)
2646 # Pack the high level graph before sending it to the scheduler
2647 keyset = set(keys)
-> 2648 dsk = dsk.__dask_distributed_pack__(self, keyset, annotations)
2649
2650 # Create futures before sending graph (helps avoid contention)
/envs/dask/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, client, client_keys, annotations)
1045 "__module__": layer.__module__,
1046 "__name__": type(layer).__name__,
-> 1047 "state": layer.__dask_distributed_pack__(
1048 self.get_all_external_keys(),
1049 self.key_dependencies,
/envs/dask/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, all_hlg_keys, known_key_dependencies, client, client_keys)
424 for k, v in dsk.items()
425 }
--> 426 dsk = toolz.valmap(dumps_task, dsk)
427 return {"dsk": dsk, "dependencies": dependencies}
428
/envs/dask/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/envs/dask/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/envs/dask/lib/python3.8/site-packages/distributed/worker.py in dumps_task(task)
3784 return d
3785 elif not any(map(_maybe_complex, task[1:])):
-> 3786 return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])}
3787 return to_serialize(task)
3788
/envs/dask/lib/python3.8/site-packages/distributed/worker.py in warn_dumps(obj, dumps, limit)
3793 def warn_dumps(obj, dumps=pickle.dumps, limit=1e6):
3794 """Dump an object to bytes, warn if those bytes are large"""
-> 3795 b = dumps(obj, protocol=4)
3796 if not _warn_dumps_warned[0] and len(b) > limit:
3797 _warn_dumps_warned[0] = True
/envs/dask/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
58 try:
59 buffers.clear()
---> 60 result = cloudpickle.dumps(x, **dump_kwargs)
61 except Exception as e:
62 logger.info("Failed to serialize %s. Exception: %s", x, e)
/envs/dask/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
71 file, protocol=protocol, buffer_callback=buffer_callback
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
75
/envs/dask/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
561 def dump(self, obj):
562 try:
--> 563 return Pickler.dump(self, obj)
564 except RuntimeError as e:
565 if "recursion" in e.args[0]:
/envs/dask/lib/python3.8/socket.py in __getstate__(self)
270
271 def __getstate__(self):
--> 272 raise TypeError(f"cannot pickle {self.__class__.__name__!r} object")
273
274 def dup(self):
TypeError: cannot pickle 'socket' object
Does anyone know how to fix this issue?
The problem is that the dask Client is not serializable, so you cannot pass it to submit.
You can work around this by accessing the client from within the task using dask.distributed.get_client:
from dask.distributed import get_client

def create_dmatrix(X_train, y_train):
    client = get_client()
    return dxgb.DaskDMatrix(client, X_train, y_train)

Xy = client.submit(create_dmatrix, X_train, y_train, resources={'GPU': 1})
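The same idea extends to the training step. A sketch under the assumptions that dxgb is xgboost.dask and that the GPU workers were started with a declared GPU resource; note that launching distributed work from inside a task ties up a worker thread, so treat this as an illustration rather than a production recipe:
def train_on_gpu(X_train, y_train, params, num_boost_round=100):
    # runs on a worker; fetch the worker-side client instead of pickling one
    client = get_client()
    dtrain = dxgb.DaskDMatrix(client, X_train, y_train)
    # xgboost.dask.train returns a dict holding the trained 'booster' and eval history
    return dxgb.train(client, params, dtrain, num_boost_round=num_boost_round)

output = client.submit(train_on_gpu, X_train, y_train, params, resources={'GPU': 1}).result()
booster = output['booster']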

Eager-Mode very slow (~22x slower than Graph-Mode)

I read that TensorFlow 2.0 will bring some major changes, and a big part of that is eager execution [1], so I tried to play around a bit with TensorFlow's eager mode.
I took some code from a GitHub repo and tried to run it in eager mode (however, without using Keras Model/Layers as proposed).
It turned out to be quite slow. So I tried different modifications and compared it with the original (graph-mode) source of the model. The result is that graph mode is around 22x faster than eager mode. It is totally clear to me that graph mode is faster, but by this much?
Is this always the case, or do I need some special modifications / configurations of the variables to get performance comparable to graph mode?
The source code for both variants can be found at [2].
Thanks in advance!
Eager-Mode:
# With
# with tf.device("/gpu:0"):
# ...
#
# Runtime is 0.35395
# Runtime is 0.12711
# Runtime is 0.12438
# Runtime is 0.12428
# Runtime is 0.12572
# Runtime is 0.12593
# Runtime is 0.12505
# Runtime is 0.12527
# Runtime is 0.12418
# Runtime is 0.12340
Graph Mode:
# Runtime is 0.81241
# Runtime is 0.00573
# Runtime is 0.00573
# Runtime is 0.00570
# Runtime is 0.00555
# Runtime is 0.00564
# Runtime is 0.00545
# Runtime is 0.00540
# Runtime is 0.00591
# Runtime is 0.00574
[1] https://groups.google.com/a/tensorflow.org/forum/#!topic/developers/JHDpgRyFVUs
[2] https://gist.github.com/lhlmgr/f6709e5aba4a5314b5221d58232b09bd
Using eager execution may mean undoing some habits developed with TensorFlow graphs, since code snippets that used to run once (e.g., the Python function that constructs the graph to compute the loss) will now run repeatedly (the same Python function will compute the loss on each iteration).
I took a cursory look at the code links provided and noticed some easy wins that would probably also show up with standard Python profiling tools. You may want to use those (cProfile, py-spy, etc.).
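A minimal profiling sketch (train_step is just a placeholder for whatever function runs one eager iteration):
import cProfile
import pstats

# profile a handful of eager iterations and print the hottest calls;
# per-iteration variable/graph re-construction shows up near the top
profiler = cProfile.Profile()
profiler.enable()
for _ in range(10):
    train_step()  # placeholder: one eager training iteration
profiler.disable()
pstats.Stats(profiler).sort_stats("cumulative").print_stats(20)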
For example, the Keras network is currently implemented as:
class NFModel(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def call(self, *args, **kwargs):
        num_layers = 6
        d, r = 2, 2
        bijectors = []
        for i in range(num_layers):
            with tf.variable_scope('bijector_%d' % i):
                V = tf.get_variable('V', [d, r], dtype=DTYPE)  # factor loading
                shift = tf.get_variable('shift', [d], dtype=DTYPE)  # affine shift
                L = tf.get_variable('L', [d * (d + 1) / 2], dtype=DTYPE)  # lower triangular
                bijectors.append(tfb.Affine(
                    scale_tril=tfd.fill_triangular(L),
                    scale_perturb_factor=V,
                    shift=shift,
                ))
                alpha = tf.get_variable('alpha', [], dtype=DTYPE)
                abs_alpha = tf.abs(alpha) + .01
                bijectors.append(LeakyReLU(alpha=abs_alpha))
        base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
        mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
        dist = tfd.TransformedDistribution(distribution=base_dist, bijector=mlp_bijector)
Instead, if you create the variables in __init__ once and avoid tf.get_variable calls on every call to the network, you should see a big improvement.
class NFModel(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super(NFModel, self).__init__(*args, **kwargs)
        num_layers = 6
        d, r = 2, 2
        self.num_layers = num_layers
        self.V = [tf.get_variable('V', [d, r], dtype=DTYPE) for _ in range(num_layers)]
        self.shift = [tf.get_variable('shift', [d], dtype=DTYPE) for _ in range(num_layers)]
        self.L = [tf.get_variable('L', [d * (d + 1) / 2], dtype=DTYPE) for _ in range(num_layers)]
        self.alpha = [tf.get_variable('alpha', [], dtype=DTYPE) for _ in range(num_layers)]

    def call(self, *args, **kwargs):
        bijectors = []
        for i in range(self.num_layers):
            V = self.V[i]
            shift = self.shift[i]
            L = self.L[i]
            bijectors.append(tfb.Affine(
                scale_tril=tfd.fill_triangular(L),
                scale_perturb_factor=V,
                shift=shift,
            ))
            alpha = self.alpha[i]
            abs_alpha = tf.abs(alpha) + .01
            bijectors.append(LeakyReLU(alpha=abs_alpha))
        base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros([2], DTYPE))
        mlp_bijector = tfb.Chain(list(reversed(bijectors[:-1])), name='2d_mlp_bijector')
        dist = tfd.TransformedDistribution(distribution=base_dist, bijector=mlp_bijector)
        return {"dist": dist}
There are probably other such easy wins; a profiling tool will nudge you in the right direction.
Also, note that TF 2.0 is less about "eager execution" and more about how one interacts with graphs, as per the RFC.
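In TF 2.x terms that interaction happens through tf.function, which recovers graph-mode speed while keeping eager-style code. A small illustrative sketch (not taken from the linked gist):
@tf.function  # traces the Python function once and reuses the resulting graph
def fast_step(x):
    return tf.reduce_sum(tf.square(x))

x = tf.random.normal([1000, 1000])
fast_step(x)  # first call: tracing + graph construction
fast_step(x)  # later calls: run the cached graph, much faster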
Hope that helps.

Is there a way to get tensorflow tf.Print output to appear in Jupyter Notebook output

I'm using the tf.Print op in a Jupyter notebook. It works as required, but will only print the output to the console, without printing in the notebook. Is there any way to get around this?
An example would be the following (in a notebook):
import tensorflow as tf
a = tf.constant(1.0)
a = tf.Print(a, [a], 'hi')
sess = tf.Session()
a.eval(session=sess)
That code will print 'hi[1]' in the console, but nothing in the notebook.
Update Feb 3, 2017
I've wrapped this into the memory_util package. Example usage:
# install memory util
import urllib.request
response = urllib.request.urlopen("https://raw.githubusercontent.com/yaroslavvb/memory_util/master/memory_util.py")
open("memory_util.py", "wb").write(response.read())

import memory_util

sess = tf.Session()
a = tf.random_uniform((1000,))
b = tf.random_uniform((1000,))
c = a + b

with memory_util.capture_stderr() as stderr:
    sess.run(c.op)

print(stderr.getvalue())
**Old stuff**
You could reuse the FD redirector from IPython core (idea from Mark Sandler):
import os
import sys

STDOUT = 1
STDERR = 2

class FDRedirector(object):
    """ Class to redirect output (stdout or stderr) at the OS level using
        file descriptors.
    """

    def __init__(self, fd=STDOUT):
        """ fd is the file descriptor of the output you want to capture.
            It can be STDOUT or STDERR.
        """
        self.fd = fd
        self.started = False
        self.piper = None
        self.pipew = None

    def start(self):
        """ Set up the redirection.
        """
        if not self.started:
            self.oldhandle = os.dup(self.fd)
            self.piper, self.pipew = os.pipe()
            os.dup2(self.pipew, self.fd)
            os.close(self.pipew)
            self.started = True

    def flush(self):
        """ Flush the captured output, similar to the flush method of any
            stream.
        """
        if self.fd == STDOUT:
            sys.stdout.flush()
        elif self.fd == STDERR:
            sys.stderr.flush()

    def stop(self):
        """ Unset the redirection and return the captured output.
        """
        if self.started:
            self.flush()
            os.dup2(self.oldhandle, self.fd)
            os.close(self.oldhandle)
            f = os.fdopen(self.piper, 'r')
            output = f.read()
            f.close()
            self.started = False
            return output
        else:
            return ''

    def getvalue(self):
        """ Return the output captured since the last getvalue, or the
            start of the redirection.
        """
        output = self.stop()
        self.start()
        return output

import tensorflow as tf
x = tf.constant([1, 2, 3])
a = tf.Print(x, [x])
redirect = FDRedirector(STDERR)
sess = tf.InteractiveSession()
redirect.start()
a.eval()
print("Result")
print(redirect.stop())
I ran into the same problem and got around it by using a function like this in my notebooks:
def tf_print(tensor, transform=None):
    # Insert a custom python operation into the graph that does nothing but print a tensor's value
    def print_tensor(x):
        # x is typically a numpy array here, so you could do anything you want with it,
        # but adding a transformation of some kind usually makes the output more digestible
        print(x if transform is None else transform(x))
        return x

    log_op = tf.py_func(print_tensor, [tensor], [tensor.dtype])[0]
    with tf.control_dependencies([log_op]):
        res = tf.identity(tensor)

    # Return the given tensor
    return res

# Now define a tensor and use the tf_print function much like the tf.identity function
tensor = tf_print(tf.random_normal([100, 100]), transform=lambda x: [np.min(x), np.max(x)])

# This will print the transformed version of the tensor's actual value
# (which was summarized to just the min and max for brevity)
sess = tf.InteractiveSession()
sess.run([tensor])
sess.close()
FYI, using a logger instead of calling "print" in my custom function worked wonders for me, as stdout is often buffered by Jupyter and not shown before "Loss is Nan"-type errors -- which was the whole point of using that function in the first place in my case.
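A minimal sketch of that variant, reusing the print_tensor hook from the tf_print helper above (the logger name and configuration are just examples):
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("tf_debug")

def print_tensor(x):
    # logging handlers write to stderr and flush promptly, unlike the buffered notebook stdout
    logger.info("tensor value: %s", x)
    return x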
You can check the terminal where you launched the jupyter notebook to see the message.
import tensorflow as tf
tf.InteractiveSession()
a = tf.constant(1)
b = tf.constant(2)
opt = a + b
opt = tf.Print(opt, [opt], message="1 + 2 = ")
opt.eval()
In the terminal, I can see:
2018-01-02 23:38:07.691808: I tensorflow/core/kernels/logging_ops.cc:79] 1 + 2 = [3]
A simple way; I tried it in regular Python, but not Jupyter yet:
os.dup2(sys.stdout.fileno(), 1)
os.dup2(sys.stdout.fileno(), 2)
Explanation is here: In python, how to capture the stdout from a c++ shared library to a variable
The issue I faced was that one can't run a session inside a TensorFlow graph, e.g. during training or evaluation.
That's why using sess.run(opt) or opt.eval() was not a solution for me.
The best option was to use tf.Print() and redirect the logging to an external file.
I did this using a temporary file, which I transferred to a regular file like this:
STDERR = 2
import os
import sys
import tempfile

class captured:
    def __init__(self, fd=STDERR):
        self.fd = fd
        self.prevfd = None

    def __enter__(self):
        t = tempfile.NamedTemporaryFile()
        self.prevfd = os.dup(self.fd)
        os.dup2(t.fileno(), self.fd)
        return t

    def __exit__(self, exc_type, exc_value, traceback):
        os.dup2(self.prevfd, self.fd)

with captured(fd=STDERR) as tmp:
    ...
    classifier.evaluate(input_fn=input_fn, steps=100)

with open('log.txt', 'w') as f:
    print(open(tmp.name).read(), file=f)
And then in my evaluation I do:
a = tf.constant(1)
a = tf.Print(a, [a], message="a: ")