TensorFlow Estimator API doesn't work in distributed mode

Here is my test code:

import tensorflow as tf
import numpy as np
from tensorflow.python.keras.layers import Dense
import logging

level = logging.getLevelName('INFO')
logging.getLogger().setLevel(level)

model = tf.keras.Sequential()
output = Dense(2, activation="softmax")
model.add(Dense(64, activation="relu", input_shape=(10,)))
model.add(output)
model.compile('rmsprop', 'categorical_crossentropy')

est_model = tf.keras.estimator.model_to_estimator(keras_model=model)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"dense_2_input": np.random.randint(10, size=(320, 10))},
    y=np.random.rand(320, 2),
    num_epochs=10000,
    shuffle=False)
est_model.train(train_input_fn)
My TF_CONFIG is:

TF_CONFIG={
    "cluster": {"chief": ["localhost:2223"],
                "worker": ["localhost:2221"],
                "ps": ["localhost:2222"]},
    "task": {"index": "0", "type": "chief"}
}
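For reference, TF_CONFIG is usually exported as a JSON environment variable in each process before the Estimator is created. A minimal sketch of that setup, mirroring the cluster above (note that passing the task index as an integer rather than a string is an assumption worth checking):

import json
import os

# Each process gets the same "cluster" dict but its own "task" entry;
# this must be set before the Estimator reads its run configuration.
os.environ['TF_CONFIG'] = json.dumps({
    "cluster": {"chief": ["localhost:2223"],
                "worker": ["localhost:2221"],
                "ps": ["localhost:2222"]},
    "task": {"index": 0, "type": "chief"}  # integer index (assumption)
})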
The chief is stuck logging "Restoring parameters from ......" and no ports are listening.
Any suggestions?

Related

Keras model prediction did not return probability when using load_model

I have a Covid-19 X-ray dataset from Kaggle. I split and resized the images to the following dimensions:
X_train (675, 256, 256, 3), X_test (225, 256, 256, 3) and X_val (225, 256, 256, 3). My code to train a DenseNet121 is the following:
import numpy as np
import os
import random
from sklearn.utils import class_weight
from keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input, Activation, BatchNormalization
from keras.applications import DenseNet121
from keras.models import Model
from keras import applications as A
from tensorflow.keras.models import load_model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.optimizers import SGD

seed_value = 1234
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

X_train = A.densenet.preprocess_input(X_train)
X_test = A.densenet.preprocess_input(X_test)
X_val = A.densenet.preprocess_input(X_val)

def get_model(hparams):
    input_tensor = Input(shape=(256, 256, 3))
    pretrain = DenseNet121(weights='imagenet', input_tensor=input_tensor, include_top=False)
    idx = 52
    x = pretrain.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(64, use_bias=False)(x)
    x = Dropout(0.25)(x)
    x = BatchNormalization(axis=-1)(x)
    x = Activation("relu")(x)
    predictions = Dense(hparams["nclass"], activation="softmax")(x)
    model = Model(inputs=pretrain.input, outputs=predictions)
    # Freeze everything except the BatchNormalization layers ...
    for layer in model.layers:
        if "BatchNormalization" in layer.__class__.__name__:
            layer.trainable = True
        else:
            layer.trainable = False
    # ... then unfreeze all layers above index 52.
    for i in range(len(model.layers)):
        if i > idx:
            model.layers[i].trainable = True
    model.compile(optimizer=SGD(lr=hparams["lr"]), loss="categorical_crossentropy", metrics=["accuracy"])
    return model

weights = class_weight.compute_class_weight("balanced", classes=np.unique(y_train_labels), y=y_train_labels)
class_weights = dict(zip(np.unique(y_train_labels), weights))

es = EarlyStopping(monitor="val_loss",
                   mode="min",
                   patience=20,
                   verbose=1,
                   restore_best_weights=True)
mc = ModelCheckpoint(filepath="../models/mymodel.h5",
                     monitor="val_loss",
                     mode="min",
                     verbose=1,
                     save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss",
                              factor=0.9,
                              patience=5,
                              min_lr=0.000001,
                              verbose=1)

model = get_model(hparams)  # hparams (with "nclass" and "lr") is defined elsewhere in the setup
history = model.fit(x=X_train,
                    y=y_train,
                    class_weight=class_weights,
                    validation_data=(X_val, y_val),
                    epochs=500,
                    batch_size=8,
                    callbacks=[es, mc, reduce_lr])
The prediction shows probabilities over the 3 classes (e.g. [0.1, 0.6, 0.3]), but when I load the model later using this command:

classifier = load_model("mymodel.h5", compile=False)
probs = classifier.predict(X_test)

the prediction results no longer look like probabilities but like a class label (and an incorrect one, if we refer to the previous prediction [0.1, 0.6, 0.3]): I got [0, 0, 1] as the output of the loaded model. I'm using Keras version 2.3.1 and TensorFlow 2.1.0. May I know what went wrong and how to fix it?
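One thing worth checking (an observation from the code above, not a confirmed diagnosis): the model is built and saved with the standalone keras package but reloaded through tensorflow.keras. A quick sketch that reloads with the same package that saved the checkpoint and verifies whether the outputs are still a softmax distribution:

from keras.models import load_model  # standalone keras, matching the training imports

classifier = load_model("../models/mymodel.h5", compile=False)
probs = classifier.predict(X_test)
# Genuine softmax outputs are floats whose rows sum to ~1.0
print(probs[:3])
print(probs[:3].sum(axis=1))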

Reproduce same results on each run - Keras, Google Colab

I run the following code in Google Colab (with GPU):
import random
random.seed(1)
import numpy as np
from numpy.random import seed
seed(1)
import tensorflow as tf
from tensorflow import set_random_seed
set_random_seed(2)
import pandas as pd
from keras.models import Sequential
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers import Flatten, Dense, Lambda, SimpleRNN
from keras.optimizers import *
from keras.utils import np_utils
from keras.initializers import *
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, auc, precision_recall_curve
from sklearn.metrics import confusion_matrix
from keras.callbacks import EarlyStopping
from keras import backend as K

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

## Loading dataset train and validation files; the files are the same for every run

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
print("***********************************************************************************************")

def make_model():
    model = Sequential()
    model.add(Conv2D(10, (5, 5), kernel_initializer=glorot_uniform(seed=1), input_shape=(22, 10, 1), use_bias=True, activation="relu", strides=1, padding="valid"))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(20, kernel_initializer=glorot_uniform(seed=1), activation="relu"))
    model.add(Lambda(lambda x: tf.expand_dims(x, axis=1)))
    model.add(SimpleRNN(20, kernel_initializer=glorot_uniform(seed=1), activation="relu", return_sequences=False))
    model.add(Dense(1, kernel_initializer=glorot_uniform(seed=1), activation="sigmoid"))
    opti = SGD(lr=0.01)
    model.compile(loss="binary_crossentropy", optimizer=opti, metrics=["accuracy"])
    return model

model = make_model()
model.fit(x_train, y_train, validation_data=(x_validation, y_validation), epochs=50, batch_size=20, verbose=2, callbacks=[es])
Despite setting all seed values, the model's prediction results differ on subsequent runs. The training and testing of the model happen in the same Colab cell.
You are dealing with floating-point numbers that are multiplied and added on different threads, so the same operations can happen in a different order on each run. Floating-point addition and multiplication are not associative, so a different order can give a slightly different result. See What Every Computer Scientist Should Know About Floating-Point Arithmetic.
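A one-line illustration of that non-associativity in plain Python (no TensorFlow needed):

# Summing the same three numbers in a different order changes the result:
a = (0.1 + 0.2) + 0.3
b = 0.1 + (0.2 + 0.3)
print(a, b, a == b)  # 0.6000000000000001 0.6 False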

Keras: MLP on CPU Reproducibility

I'm building and testing a simple MLP model, but am running into an issue with Keras reproducibility for my results. I am trying to set up my neural network so that the prediction outputs won't change between runs.
I have already followed the Keras guide online as well as this post (Reproducible results using Keras with TensorFlow backend). I am running Keras on my local machine with the TensorFlow backend and the following versions:
tensorflow 2.0.0-alpha0,
keras 2.2.4-tf,
numpy 1.16.0
import os
os.environ['PYTHONHASHSEED'] = str(0)
import random
random.seed(0)
from numpy.random import seed
seed(1)
import tensorflow as tf
tf.compat.v1.set_random_seed(2)
from keras import backend as K

session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
K.set_session(sess)

import numpy as np
from tensorflow.python.keras.layers import Dropout, BatchNormalization
from tensorflow.python.keras.optimizers import Adam

class Machine_Learning_Classifier_Keras(object):
    @classmethod
    def _get_classifier(cls, n_input_features=None, **params):
        KerasClassifier = tf.keras.wrappers.scikit_learn.KerasClassifier
        Dense = tf.keras.layers.Dense
        Sequential = tf.keras.models.Sequential
        sk_params = {"epochs": 200, "batch_size": 128, "shuffle": False}

        def create_model(optimizer='adam', init='he_normal'):
            # create model
            model = Sequential()
            model.add(BatchNormalization())
            model.add(Dropout(0.2))
            model.add(Dense(500, input_dim=4, kernel_initializer=init, activation='relu'))
            model.add(BatchNormalization())
            model.add(Dropout(0.2))
            model.add(Dense(250, kernel_initializer=init, activation='relu'))
            model.add(BatchNormalization())
            model.add(Dropout(0.2))
            model.add(Dense(500, kernel_initializer=init, activation='relu'))
            model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
            # Compile model
            model.compile(loss='binary_crossentropy', optimizer=Adam(lr=3e-3, decay=0.85), metrics=['accuracy'])
            return model

        return KerasClassifier(build_fn=create_model, **sk_params)

if __name__ == "__main__":
    X = np.asarray([[0.0, 0.0], [1.0, 1.0], [2.0, 2.5], [1.5, 1.6]])
    y = np.asarray([0, 0, 1, 1])
    nn = Machine_Learning_Classifier_Keras._get_classifier()
    nn.fit(X, y, sample_weight=np.asarray([0, 0, 1, 1]))
    values = np.asarray([[0.5, 0.5], [0.6, 0.5], [0.8, 1.0], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]])
    probas = nn.predict_proba(values)
    print(probas)
I would expect my outputs for the predict_proba values to stay the same between runs; however, I am getting the following for two successive runs (results will vary):
Run 1:
[[0.9439231 0.05607685]
[0.91351616 0.08648387]
[0.06378722 0.9362128 ]
[0.9439231 0.05607685]
[0.9439231 0.05607685]
[0.9439231 0.05607685]
[0.94392323 0.05607677]
[0.94392323 0.05607677]]
Run 2:
[[0.94391584 0.05608419]
[0.91350436 0.08649567]
[0.06378281 0.9362172 ]
[0.94391584 0.05608419]
[0.94391584 0.05608419]
[0.94391584 0.05608419]
[0.94391584 0.05608416]
[0.94391584 0.05608416]]
I ended up figuring out what the issue is, but I'm not sure how to resolve it: it has something to do with the first BatchNormalization() layer, which is supposed to standardize the inputs. If you remove that layer the results are entirely reproducible, so something in the BatchNormalization() implementation is leading to the non-reproducible behavior.
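A possible workaround sketch (an assumption, not a verified fix): standardize the inputs with NumPy before training and drop that first BatchNormalization layer, so the input scaling no longer depends on per-batch statistics:

# Hypothetical workaround: normalize inputs outside the model so the
# first layer's behavior cannot vary with batch composition.
X_mean, X_std = X.mean(axis=0), X.std(axis=0)
X_scaled = (X - X_mean) / (X_std + 1e-8)  # epsilon guards against zero variance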
If you run the mentioned code twice, it will show the behavior you have just described, because the model is retrained on every run and training is not guaranteed to reach the same local minimum each time.
However, if you train your model only once, save the weights, and use those weights to predict the output, then you will get the same results every time for the same data.
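A minimal sketch of that train-once-then-reload workflow (the file name is illustrative; it relies on the scikit-learn wrapper exposing its fitted Keras model as .model, and the second fit only serves to instantiate the network before the saved weights replace it):

# First run: train once and persist the weights.
nn = Machine_Learning_Classifier_Keras._get_classifier()
nn.fit(X, y, sample_weight=np.asarray([0, 0, 1, 1]))
nn.model.save_weights("mlp_weights.h5")

# Any later run: rebuild, fit to instantiate the network, then overwrite
# it with the saved weights; predict_proba is now identical across runs.
nn2 = Machine_Learning_Classifier_Keras._get_classifier()
nn2.fit(X, y, sample_weight=np.asarray([0, 0, 1, 1]))
nn2.model.load_weights("mlp_weights.h5")
print(nn2.predict_proba(values))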

Distributed Learning with TensorFlow2 is not working

I'm trying to get distributed TF working in VS Code with TensorFlow version 2.0.0a (the CPU version).
I'm using a Windows and a Linux system (two different computers) and both work fine on their own.
For the distributed TF I followed the tutorial at
https://www.tensorflow.org/alpha/guide/distribute_strategy .
I already tried different ports and turning off the firewalls. I also tried to switch the master system from Windows to Linux, but now I think it might be a problem with the code, or maybe with the TF version, which is labeled as experimental.
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow_datasets as tfds
import tensorflow as tf
import json
import os

BUFFER_SIZE = 10000
BATCH_SIZE = 64

def scale(image, label):
    image = tf.cast(image, tf.float32)
    image /= 255
    return image, label

datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
train_datasets_unbatched = datasets['train'].map(scale).shuffle(BUFFER_SIZE)
train_datasets = train_datasets_unbatched.batch(BATCH_SIZE)

def build_and_compile_cnn_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
        metrics=['accuracy'])
    return model

# multi-worker config:
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["192.168.0.12:2468", "192.168.0.13:1357"]
    },
    'task': {'type': 'worker', 'index': 0}
})

strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
NUM_WORKERS = 2
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS

# --------------------------------------------------------------------
# In the following line the error occurs
train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE)
# --------------------------------------------------------------------

with strategy.scope():
    multi_worker_model = build_and_compile_cnn_model()
    multi_worker_model.fit(x=train_datasets, epochs=3)
I expect the worker to start the learning process but instead I get the error:
"F tensorflow/core/framework/device_base.cc:33] Device does not implement name()"
As far as I know, each worker should have a unique task index. For example, on the first machine you should have:
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["192.168.0.12:2468", "192.168.0.13:1357"]
    },
    'task': {'type': 'worker', 'index': 0}
})
and on the second:
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["192.168.0.12:2468", "192.168.0.13:1357"]
    },
    'task': {'type': 'worker', 'index': 1}
})
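To avoid hand-editing the index on each machine, here is a small sketch (the --task_index flag is illustrative, not part of the original answer) that takes the index from the command line so the same script runs unmodified on both hosts:

import argparse
import json
import os

parser = argparse.ArgumentParser()
parser.add_argument("--task_index", type=int, required=True)
args = parser.parse_args()

# Same cluster spec everywhere; only the task index differs per machine.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["192.168.0.12:2468", "192.168.0.13:1357"]
    },
    'task': {'type': 'worker', 'index': args.task_index}
})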

Keras Model Always Predicts the Same Result

I've written the following Keras model:
import configuration_reader
# Import libraries
import pandas
import keras
from pprint import pprint
# Import TensorFlow
import tensorflow as tf
# Import keras dependencies
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import GRU

# Read configuration
configuration = configuration_reader.read_configuration()
training_configuration = configuration['train']
MAX_DOCUMENT_LENGTH = training_configuration['max_document_length']
MAX_SIZE = training_configuration['max_size']
LOSS_FUNCTION = training_configuration['loss']
TENSORBOARD_DIR = configuration['tensorboard']['dir']
ACTIVATION_FUNCTION = training_configuration['activation_function']
EMBEDDING_SIZE = training_configuration['embedding_size']
LSTM_UNITS = training_configuration['lstm_units']
RECURRENT_DROPOUT = training_configuration['recurrent_dropout']
DROPOUT = training_configuration['dropout']
MAX_LABEL = training_configuration['max_label']
BATCH_SIZE = training_configuration['batchSize']
EPOCHS = training_configuration['epochs']
WORDS_FEATURE = training_configuration['wordsFeature']
MODEL_FILE_NAME = configuration['modelFileName']
LEARNING_RATE = training_configuration['learning_rate']

def get_data():
    tf.logging.set_verbosity(tf.logging.INFO)
    dbpedia = tf.contrib.learn.datasets.load_dataset('dbpedia')
    x_train = pandas.Series(dbpedia.train.data[:, 1])
    y_train = pandas.Series(dbpedia.train.target)
    x_test = pandas.Series(dbpedia.test.data[:, 1])
    y_test = pandas.Series(dbpedia.test.target)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_train_sequences = tokenizer.texts_to_sequences(x_train)
    x_test_sequences = tokenizer.texts_to_sequences(x_test)
    tokenizer = Tokenizer(num_words=MAX_DOCUMENT_LENGTH)
    X_train = tokenizer.sequences_to_matrix(x_train_sequences, mode='binary')
    X_test = tokenizer.sequences_to_matrix(x_test_sequences, mode='binary')
    num_classes = len(y_test)
    num_classes_unique = len(y_test.unique())
    print(u'Number of classes: {}'.format(num_classes_unique))
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    return (X_train, y_train, X_test, y_test, num_classes, tokenizer)

if __name__ == '__main__':
    (X_train, y_train, X_test, y_test, num_classes, tokenizer) = get_data()
    model = Sequential()
    model.add(Embedding(MAX_SIZE, EMBEDDING_SIZE))
    model.add(GRU(LSTM_UNITS, recurrent_dropout=RECURRENT_DROPOUT))
    model.add(Dropout(DROPOUT))
    model.add(Dense(num_classes))
    model.add(Activation(ACTIVATION_FUNCTION))
    model.compile(loss=LOSS_FUNCTION,
                  optimizer=keras.optimizers.Adam(LEARNING_RATE),
                  metrics=['accuracy'])
    print(model.summary())
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=TENSORBOARD_DIR,
                                                       histogram_freq=1,
                                                       write_graph=True)
    history = model.fit(X_train, y_train,
                        batch_size=BATCH_SIZE,
                        epochs=EPOCHS,
                        shuffle=True,
                        validation_data=(X_test, y_test),
                        callbacks=[tensorboard_callback])
    score = model.evaluate(X_test, y_test,
                           batch_size=BATCH_SIZE, verbose=1)
    model.save(MODEL_FILE_NAME)
    print(u'Loss: {}'.format(score[0]))
    print(u'Accuracy: {}'.format(score[1]))

    def _predict(text):
        text_to_predict = pandas.Series([text])
        tokenizer.fit_on_texts(text_to_predict)
        text_to_predict_sequences = tokenizer.texts_to_sequences(text_to_predict)
        to_predict = tokenizer.sequences_to_matrix(text_to_predict_sequences, mode='binary')
        prediction = model.predict(to_predict, steps=1000)
        predicted_class_index = prediction.argmax(axis=1)
        pprint(predicted_class_index)

    while True:
        text_input = input('Enter: ')
        _predict(text_input)
This is the configuration I'm using:
{
    "logDir": "graph",
    "port": 9001,
    "modelFileName": "model.h5",
    "tensorboard": {
        "dir": "./graph"
    },
    "train": {
        "wordsFeature": "words",
        "epochs": 3,
        "batchSize": 32,
        "dropout": 0.5,
        "recurrent_dropout": 0.5,
        "max_label": 15,
        "lstm_units": 128,
        "embedding_size": 200,
        "activation_function": "sigmoid",
        "loss": "binary_crossentropy",
        "max_size": 1000,
        "max_document_length": 50,
        "learning_rate": 0.001
    }
}
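The configuration_reader module is not shown in the question; a minimal stand-in (purely an assumption, inferred from the keys accessed above) could be:

import json

def read_configuration(path="configuration.json"):
    # Hypothetical replacement for the question's configuration_reader:
    # it only needs to return a dict exposing 'train', 'tensorboard',
    # and 'modelFileName' as used in the script.
    with open(path) as f:
        return json.load(f)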
The model trains as expected and gets an accuracy of about 98% when evaluating.
But there are two things that are quite unusual:
1. The predictions of the model are inaccurate.
2. After I predict the first label, all of the subsequent predictions are exactly the same.
As for the first point, there might be something wrong with my model. But for the second, I'm really confused as to why it is happening.
Does anybody have an idea why this happens and what the solution might be?
I'm using Keras with the TensorFlow backend (CUDA version).
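One detail that stands out in the code above (an observation, not a guaranteed explanation): _predict calls tokenizer.fit_on_texts on every input, which mutates the tokenizer between predictions, and predict is called with steps=1000 on a single sample. A sketch of the inference path that reuses the training-time tokenizer unchanged:

def _predict(text):
    # Do not refit the tokenizer at inference time; reuse the vocabulary
    # learned during training so new inputs are encoded consistently.
    sequences = tokenizer.texts_to_sequences(pandas.Series([text]))
    to_predict = tokenizer.sequences_to_matrix(sequences, mode='binary')
    prediction = model.predict(to_predict)  # no steps= for a NumPy batch
    pprint(prediction.argmax(axis=1))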