Distributed Learning with TensorFlow2 is not working - tensorflow

I'm trying to get distributed TF working in VS-Code with the Tensorflow version 2.0.0a (the CPU Version).
I'm using a Windows and a Linux System (two different computers) and both are working well alone.
For the distibuted TF I followed the tutorial at
https://www.tensorflow.org/alpha/guide/distribute_strategy .
I already tried different ports and turning off the firewalls. I also tried to switch the master system from Windows to Linux but now i think it might be a Problem with the code or maybe the TF-Version which is labeled as experimental.
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow_datasets as tfds
import tensorflow as tf
import json
import os
BUFFER_SIZE = 10000
BATCH_SIZE = 64
def scale(image, label):
image = tf.cast(image, tf.float32)
image /= 255
return image, label
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
train_datasets_unbatched = datasets['train'].map(scale).shuffle(BUFFER_SIZE)
train_datasets = train_datasets_unbatched.batch(BATCH_SIZE)
def build_and_compile_cnn_model():
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(
loss=tf.keras.losses.sparse_categorical_crossentropy,
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=['accuracy'])
return model
#multiworker conf:
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': ["192.168.0.12:2468", "192.168.0.13:1357"]
},
'task': {'type': 'worker', 'index': 0}
})
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
NUM_WORKERS = 2
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
#--------------------------------------------------------------------
#In the following line the error occurs
train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE)
#--------------------------------------------------------------------
with strategy.scope():
multi_worker_model = build_and_compile_cnn_model()
multi_worker_model.fit(x=train_datasets, epochs=3)
I expect the worker to start the learning process but instead I get the error:
"F tensorflow/core/framework/device_base.cc:33] Device does not implement name()"

As far as i know, each worker should have a unique task index, for example:
on the first machine you should have:
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': ["192.168.0.12:2468", "192.168.0.13:1357"]
},
'task': {'type': 'worker', 'index': 0}
})
and on the second:
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': ["192.168.0.12:2468", "192.168.0.13:1357"]
},
'task': {'type': 'worker', 'index': 1}
})

Related

tensorflow2 on slurm cluster not work correctly

I have seen similar questions but there were for tensorflow1.x or didn't work for me.
I want to train models on multiple threads on multiple CPUs across the cluster.
For now I have example code for simple MNIST classification :
import os
import tensorflow as tf
import numpy as np
import json
def mnist_dataset(batch_size):
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train / np.float32(255)
y_train = y_train.astype(np.int64)
train_dataset = tf.data.Dataset.from_tensor_slices(
(x_train, y_train)).shuffle(60000).repeat().batch(batch_size)
return train_dataset
def build_and_compile_cnn_model():
model = tf.keras.Sequential([
tf.keras.layers.InputLayer(input_shape=(28, 28)),
tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
tf.keras.layers.Conv2D(32, 3, activation='relu'),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(10)
])
model.compile(
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=['accuracy'])
return model
from hostlist import expand_hostlist
task_index = int( os.environ['SLURM_PROCID'] )
n_tasks = int( os.environ['SLURM_NPROCS'] )
tf_hostlist = [ ("%s:33333" % host) for host in
expand_hostlist( os.environ['SLURM_NODELIST']) ]
node = os.environ['SLURMD_NODENAME']
print('id',task_index)
index = tf_hostlist.index(f'{node}:33333')
print('index', index)
tf_config = {
'cluster': {
'worker': [tf_hostlist[1], tf_hostlist[2]],
'chief': [tf_hostlist[0]]
},
'task': {'type': 'worker', 'index':index}
}
if index == 0:
tf_config['task']['type'] = 'chief'
tf_config['task']['index'] = index
else:
tf_config['task']['index'] = index-1
per_worker_batch_size = 64
os.environ['TF_CONFIG'] = json.dumps(tf_config)
num_workers = len(tf_config['cluster']['worker'])
print('num_workers', num_workers)
strategy = tf.distribute.MultiWorkerMirroredStrategy()
global_batch_size = per_worker_batch_size * num_workers
multi_worker_dataset = mnist_dataset(global_batch_size)
with strategy.scope():
multi_worker_model = build_and_compile_cnn_model()
multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)
It's written to run on three dfferent CPUs
I have also script for run on slurm cluster:
#!/bin/bash
#SBATCH --job-name=mnist_tf_distributed
#SBATCH --nodes=3
#SBATCH --cpus-per-task=10
#SBATCH --time=00:05:00
#SBATCH --exclusive
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
srun python keras_multi.py
The problem is that training start three times, not once disturbed on three CPUs.
Can anyone know how to change code to run above training once across different machines.

Serving a Tensorflow 2 Keras model with feature columns and preprocessing (migrating from tf 1.x estimators)

I'm migrating a current Tensorflow 1.x model built with estimators across to Tensorflow 2.0 Keras. The migration has been relatively smooth until it comes to serialising the model for serving.
The model is specified as follows
model = tf.keras.Sequential()
model.add(tf.keras.layers.DenseFeatures(feature_columns))
for units in hidden_layers:
model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation=None))
I am using the Tensorflow feature columns api, which expects as input a dictionary of feature columns, and applying a transformation to those features before they pass into the model.
For example when training
def dataset_transformation_function(feature_dict: Dict[str, tf.Tensor]):
output_dict = feature_dict.copy()
output_dict['logx1'] = tf.math.log(feature_dict['x1'])
return output_dict
train_dataset = (
tf.data.Dataset.from_tensor_slices(
(train_feature_dict, train_label_vector)
)
.shuffle(n_train)
.batch(batch_size)
.map(dataset_transformation_function)
.repeat()
.prefetch(tf.data.experimental.AUTOTUNE)
)
It appears that to perform the same transformation at serve time I require:
input_tensors = [tf.Tensorspec(name=...), ...]
#tf.function(input_signature=input_tensors)
def dataset_transformation_function(args) -> Dict[str, tf.Tensor]:
...
And
tf.saved_model.save(
model,
MODEL_DIR,
signatures=feature_transform,
)
However I cannot determine the correct signature for the input tensor or the function.
The method I am migrating from is:
def serving_input_fn():
receiver_tensors = {
'x1': tf.placeholder(dtype=tf.float32, shape=[None, ], name='x1')
'x2': tf.placeholder(dtype=tf.string, shape=[None, ], name='x2')
}
features = dataset_transformation_function(
receiver_tensors
)
return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
estimator.export_savedmodel(
MODEL_DIR,
serving_input_fn,
as_text=False,
checkpoint_path=estimator.best_checkpoint,
)
To answer my own question, it seems that the solution is to provide a function which, when called does both the preprocessing and calls the model. Example here:
# tensorflow 2.0.0
import tensorflow as tf
import numpy as np
hidden_layers = [4,4]
feature_columns = [fc.numeric_column(name) for name in ['x1', 'x2', 'logx1']]
# construct a simple sequential model
model = tf.keras.Sequential()
model.add(tf.keras.layers.DenseFeatures(feature_columns))
for units in hidden_layers:
model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation=None))
model.compile(
optimizer=tf.keras.optimizers.Adam(1e-3),
loss='mae',
metrics=['mae']
)
x_train = {'x1': np.arange(10), 'x2': np.arange(10), 'logx1': np.log1p(np.arange(10))}
x_predict = {'x1': np.arange(10), 'x2': np.arange(10)}
y = np.random.random(size=10)
model.fit(x=x_train, y=y)
trained_model_predictions = model.predict(x_train)
# preprocessing function for serving
#tf.function()
def serve_predict(x1, x2):
preprocessed_feature = tf.math.log1p(x1)
output = {
'x1': x1,
'x2': x2,
'logx1': preprocessed_feature
}
prediction = model(output)
return prediction
serve_predict = serve_predict.get_concrete_function(x1=tf.TensorSpec([None,]), x2=tf.TensorSpec([None,]))
tf.saved_model.save(
model,
'/tmp/tf',
signatures=serve_predict
)
# check the models give the same output
loaded = tf.saved_model.load('/tmp/tf')
loaded_model_predictions = loaded.serve_predict(x1=tf.range(10, dtype=tf.float32), x2=tf.range(10, dtype=tf.float32))
np.testing.assert_allclose(trained_model_predictions, loaded_model_predictions, atol=1e-6)

How can I connect with tensorboard

import cv2 # working with, mainly resizing, images
import numpy as np # dealing with arrays
import os # dealing with directories
from random import shuffle # mixing up or currently ordered data that might.
from tqdm import tqdm # a nice pretty percentage bar for tasks.
TRAIN_DIR = 'train'
TEST_DIR = 'test'
IMG_SIZE = 50
LR = 1e-3
MODEL_NAME = 'snakes-{}-{}.model'.format(LR, '2conv-basic')
def label_img(img):
print("\nImage = ",img)
print("\n",img.split('.')[-2])
temp_name= img.split('.')[-2]
temp_name=temp_name[:1]
word_label = temp_name
if word_label == 'A': return [0,0,0,0,1] #A_common_krait
elif word_label == 'B': return [0,0,0,1,0] #B_hump_nosed_viper
elif word_label == 'C': return [0,0,1,0,0] #C_indian_cobra
elif word_label == 'D': return [0,1,0,0,0] #D_russels_viper
elif word_label == 'E' : return [1,0,0,0,0] #E_saw_scaled_viper
def create_train_data():
training_data = []
for img in tqdm(os.listdir(TRAIN_DIR)):
label = label_img(img)
path = os.path.join(TRAIN_DIR,img)
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (IMG_SIZE,IMG_SIZE))
training_data.append([np.array(img),np.array(label)])
shuffle(training_data)
np.save('train_data.npy', training_data)
return training_data
def process_test_data():
testing_data = []
for img in tqdm(os.listdir(TEST_DIR)):
path = os.path.join(TEST_DIR,img)
img_num = img.split('.')[0]
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (IMG_SIZE,IMG_SIZE))
testing_data.append([np.array(img), img_num])
shuffle(testing_data)
np.save('test_data.npy', testing_data)
return testing_data
train_data = create_train_data()
import tflearn
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
import tensorflow as tf
tf.reset_default_graph()
convnet = input_data(shape=[None, IMG_SIZE, IMG_SIZE, 1], name='input')
convnet = conv_2d(convnet, 32, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 64, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 128, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 64, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 32, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = fully_connected(convnet, 1024, activation='relu')
convnet = dropout(convnet, 0.8)
convnet = fully_connected(convnet, 5, activation='softmax')
convnet = regression(convnet, optimizer='adam', learning_rate=LR,
loss='categorical_crossentropy', name='targets')
model = tflearn.DNN(convnet, tensorboard_dir='log')
if os.path.exists('{}.meta'.format(MODEL_NAME)):
model.load(MODEL_NAME)
print('model loaded!')
train = train_data[:-11200]
test = train_data[-11200:]
X = np.array([i[0] for i in train]).reshape(-1,IMG_SIZE,IMG_SIZE,1)
Y = [i[1] for i in train]
test_x = np.array([i[0] for i in test]).reshape(-1,IMG_SIZE,IMG_SIZE,1)
test_y = [i[1] for i in test]
model.fit({'input': X}, {'targets': Y}, n_epoch=3, validation_set=({'input':
test_x}, {'targets': test_y}),
snapshot_step=500, show_metric=True, run_id=MODEL_NAME)
model.save(MODEL_NAME)
with open('submission_file.csv','w') as f:
f.write('id,label\n')
with open('submission_file.csv','a') as f:
for data in tqdm(test):
img_num = data[1]
img_data = data[0]
orig = img_data
data = img_data.reshape(IMG_SIZE,IMG_SIZE,1)
model_out = model.predict([data])[0]
f.write('{},{}\n'.format(img_num,model_out[1]))
This model is based on classification of 5 types of snakes and train using 17300 images and 11200 images for validation.So i cannot see performance of my model because cannot view any graph(for accuracy,loss,over fitting etc.)
I want to visualize working flow of my model using tensorboard and how can i add that model to tensorboard,Any help will be highly appreciated.
The simple answer of why you cannot view the graph of accuracy, loss, overfitting, etc is because you're not doing any logging. Tensorflow will not automatically log things for you.
but I highly recommend you go through Tensorflow Mnist Example, try to implement how you think it should go and then reference Tensorflow MNIST with Tensorboard Example
If you want a quick start to the concepts look at quick introduction
I could find solution for some cases using,
sess = tf.Session()
tf.summary.histogram
file_writer = tf.summary.FileWriter('./log', sess.graph)

Tensorflow Estimator API doesn't work in distributed mode

here is my test code
`
from tensorflow.python.keras.layers import Conv1D, MaxPooling1D
from tensorflow.python.keras.models import Model
import logging
level = logging.getLevelName('INFO')
logging.getLogger().setLevel(level)
model = tf.keras.Sequential()
output = Dense(2, activation="softmax")
model.add(Dense(64, activation="relu", input_shape=(10,)))
model.add(output)
model.compile('rmsprop', 'categorical_crossentropy')
est_model = tf.keras.estimator.model_to_estimator(keras_model=model)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
x={"dense_2_input": np.random.randint(10, size=(320, 10))},
y=np.random.rand(320, 2),
num_epochs=10000,
shuffle=False)
est_model.train(train_input_fn)
My TF_CONFIG is
`
TF_CONFIG={
"cluster": {"chief": ["localhost:2223"],
"worker": ["localhost:2221"],
"ps": ["lcoalhost:2222"]},
"task": {"index": "0", "type": "chief"}
}
The chief is stuck on logging Restoring paramater from ......
and no ports is listening.
Any suggestion?

TensorFlow's Estimator froze with low CPU usage

I updated my TF to v1.0rc1, and Estimator.evaluate does not work anymore because it froze at Restoring model.... I tried to reproduce this problem and the following sample code will make TF froze with a 220% (2CPU) CPU usage and no output at all. Any idea why this happen? Thanks!
import tensorflow as tf
from tensorflow.contrib.layers.python.layers.optimizers import optimize_loss
from tensorflow.contrib.learn.python.learn.estimators import model_fn
from tensorflow.contrib.learn.python.learn.estimators.estimator import Estimator
from tensorflow.python.framework import ops
def main(_):
def func(features, targets, mode, params):
idx = tf.concat([features['a'], features['b']], axis=1)
embedding = tf.get_variable("embed", [10, 20], dtype=tf.float32)
pred = tf.reduce_sum(tf.nn.embedding_lookup(embedding, idx))
train_op = optimize_loss(loss=pred,
global_step=tf.train.get_global_step(),
learning_rate=0.001,
optimizer='Adam',
variables=tf.trainable_variables(),
name="training_loss_optimizer")
eval_metric_dict = dict()
eval_metric_dict['metric'] = pred
return model_fn.ModelFnOps(mode=mode,
predictions=pred,
loss=pred,
train_op=train_op,
eval_metric_ops=eval_metric_dict)
model = Estimator(func, params={})
model.fit(
input_fn=lambda: (
{'a': ops.convert_to_tensor([[1, 2, 3, 4, 5]]), 'b': ops.convert_to_tensor([[2, 3, 4, 3, 5]])},
None), steps=1)
model.evaluate(
input_fn=lambda: (
{'a': ops.convert_to_tensor([[1, 2, 3, 4, 5]]), 'b': ops.convert_to_tensor([[2, 3, 4, 3, 5]])},
None))
if __name__ == "__main__":
tf.app.run()
By default Estimator.evaluate assumes queue-based input, and will continue evaluating until the input pipeline is exhausted. When there is no queue-based input, this means it will loop forever. The fix is easy: simply provide a steps argument to evaluate.