How to efficiently use a tf.data.Dataset made of OrderedDict? - tensorflow

Using TensorFlow 2.3.1, the code snippet below fails.
import tensorflow as tf

url = "https://storage.googleapis.com/download.tensorflow.org/data/creditcard.zip"
tf.keras.utils.get_file(
    origin=url,
    fname='creditcard.zip',
    cache_dir="/tmp/datasets/",
    extract=True)
ds = tf.data.experimental.make_csv_dataset(
    "/tmp/datasets/*.csv",
    batch_size=2048,
    label_name="Class",
    select_columns=["V1","V2","Class"],
    num_rows_for_inference=None,
    shuffle_buffer_size=600,
    ignore_errors=True)
model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid", name="labeling"),
    ],
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-2),
    loss="binary_crossentropy",
)
model.fit(
    ds,
    steps_per_epoch=5,
    epochs=3,
)
The error stack is
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-c79f80f9d0fd> in <module>
----> 1 model.fit(
2 ds,
3 steps_per_epoch=5,
4 epochs=3,
5 )
[...]
ValueError: Layer sequential expects 1 inputs, but it received 2 input tensors. Inputs received: [<tf.Tensor 'ExpandDims:0' shape=(2048, 1) dtype=float32>, <tf.Tensor 'ExpandDims_1:0' shape=(2048, 1) dtype=float32>]
Keras flattens the OrderedDict of feature columns into a list of tensors, which is why the Sequential model sees two inputs instead of one. The solution I use so far is
def workaround(features, labels):
    return (tf.stack(list(features.values()), axis=1), labels)

model.fit(
    ds.map(workaround),
    steps_per_epoch=5,
    epochs=3,
)
My questions to you TF gurus:
Am I doing the right thing or is there a better solution?
Performance-wise, is that solution viable for a dataset that would not fit in memory?

I'm not sure whether your data can fit in memory or not.
If it cannot, you can change your code like this:
import tensorflow as tf

url = "https://storage.googleapis.com/download.tensorflow.org/data/creditcard.zip"
ds = tf.data.experimental.make_csv_dataset(
    "/tmp/datasets/*.csv",
    batch_size=2048,
    label_name="Class",
    select_columns=["V1","V2","Class"],
    num_rows_for_inference=None,
    ignore_errors=True,
    num_epochs=1,
    shuffle_buffer_size=2048*1000,
    prefetch_buffer_size=tf.data.experimental.AUTOTUNE
)
input_list = []
for column in ["V1", "V2"]:
    # name each input after its CSV column so the feature dict can be matched by key
    _input = tf.keras.Input(shape=(1,), name=column)
    input_list.append(_input)
concat = tf.keras.layers.Concatenate(name="concat")(input_list)
dense = tf.keras.layers.Dense(256, activation="relu", name="dense", dtype='float64')(concat)
output_dense = tf.keras.layers.Dense(1, activation="sigmoid", name="labeling", dtype='float64')(dense)
model = tf.keras.Model(inputs=input_list, outputs=output_dense)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-2),
    loss="binary_crossentropy",
)
model.fit(
    ds,
    steps_per_epoch=5,
    epochs=10,
)
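If the dataset truly cannot fit in memory, the map-based workaround from the question also stays viable, since tf.data streams batches from disk rather than materializing the whole CSV. A sketch of that pipeline with parallel mapping and prefetching added (the AUTOTUNE settings are my addition, not from the snippet above):
AUTOTUNE = tf.data.experimental.AUTOTUNE

def stack_features(features, labels):
    # stack the OrderedDict of per-column tensors into one (batch, n_features) tensor
    return tf.stack(list(features.values()), axis=1), labels

ds_fast = ds.map(stack_features, num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)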

Related

AttributeError: 'Sequential' object has no attribute 'predict_proba'

Calling predict_proba on my neural network returns an error. I saw the examples at these links:
https://machinelearningmastery.com/how-to-make-classification-and-regression-predictions-for-deep-learning-models-in-keras/
https://faroit.com/keras-docs/1.0.0/models/sequential/#the-sequential-model-api
I am using TensorFlow version 2.6.0.
Code:
#creating the object (Initializing the ANN)
import tensorflow as tf
from tensorflow import keras

LAYERS = [
    tf.keras.layers.Dense(50, activation="relu", input_shape=X_train.shape[1:]),
    tf.keras.layers.LeakyReLU(),
    tf.keras.layers.Dense(25, activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(5, activation="relu"),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
]
LOSS = "binary_crossentropy"
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=1e-3)
model_cEXT = tf.keras.models.Sequential(LAYERS)
model_cEXT.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=['accuracy'])
EPOCHS = 100
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("model_cEXT.h5", save_best_only=True)
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir="logs")
CALLBACKS = [checkpoint_cb, early_stopping_cb, tensorboard_cb]
model_cEXT.fit(X_train, y_train['cEXT'], epochs=EPOCHS, validation_data=(X_test, y_test['cEXT']), callbacks=CALLBACKS)
model_cEXT.predict_proba(X_test)
Error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-72-8f06353cf345> in <module>()
----> 1 model_cEXT.predict_proba(X_test)
AttributeError: 'Sequential' object has no attribute 'predict_proba'
Edit:
I need an sklearn-like predict_proba output; it is needed for visualization:
skplt.metrics.plot_precision_recall_curve(y_test['cEXT'].values, y_prob)
plt.title('Precision-Recall Curve - cEXT')
plt.show()
The predict_proba and predict_classes methods were removed from Keras models in recent TensorFlow versions. Use this code instead:
predict_prob = model.predict([testa, testb])
predict_classes = np.argmax(predict_prob, axis=1)
(Note: argmax only makes sense for a multi-unit softmax output; for a single sigmoid output, threshold the predicted probability at 0.5 instead.)
Newer versions might not have the predict_proba method, so I created my own using the .predict method:
def predict_prob(number):
    return [number[0], 1-number[0]]

y_prob = np.array(list(map(predict_prob, model_cEXT.predict(X_test))))
y_prob
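One caveat: sklearn orders predict_proba columns as [P(class 0), P(class 1)], so depending on what skplt expects you may need to flip the two entries. A vectorized sketch, assuming the single sigmoid output models P(class 1):
import numpy as np

p1 = model_cEXT.predict(X_test).ravel()  # sigmoid output = P(class 1)
y_prob = np.column_stack([1 - p1, p1])   # sklearn-style column order [P(0), P(1)]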

Splitting data into sequences for LSTM

I have input with 3 features and I want to predict only one feature. I want to split the data into sequences of 10 samples each and then train the LSTM model. My code is below.
def split_sequences(sequence_x, sequence_y, n_steps):
    X, y = [], []
    for i in range(0, len(sequence_x), n_steps):
        X.append(sequence_x[i:i+n_steps])
        y.append(sequence_y[i:i+n_steps])
    X = np.array(X)
    y = np.array(y)
    return X, y
First I separated the feature that I want to predict from the dataframe.
sequence_y = df['feature4'].to_list()
df = df.drop(columns = ['feature4'])
n_steps_s = 10
X, y = split_sequences(df.values.tolist(), sequence_y, n_steps_s)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
LSTM model:
n_features = len(X_train[0][0])
# define model
model = Sequential()
model.add(LSTM(100, activation='relu', return_sequences=True, input_shape=(n_steps_in, n_features)))
model.add(LSTM(100, activation='relu'))
model.add(Dense(n_steps_out))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit(X_train, y_train, epochs=2, verbose=1)
But then I get this error:
ValueError Traceback (most recent call last)
<ipython-input-682-fa5811eb8173> in <module>
8 model.compile(optimizer='adam', loss='mse')
9 # fit model
---> 10 model.fit(X_train, y_train, epochs=2, verbose=1)
[...]
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
The error occurs because stepping through the data in jumps of n_steps leaves a final window shorter than n_steps whenever the length is not an exact multiple of it, so X becomes a ragged list of lists that NumPy stores as dtype=object, which Keras cannot convert to a tensor. Build overlapping windows that stop before running off the end instead, so every sample has the same length:
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # the window [i, i+n_steps) predicts the element at index i+n_steps
        end_ix = i + n_steps
        if end_ix > len(sequence)-1:
            break
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)
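A minimal usage sketch (the toy series below is illustrative, not from the question):
import numpy as np

series = np.arange(20, dtype=np.float32)  # toy 1-D sequence
X, y = split_sequence(series, n_steps=10)
# X.shape == (10, 10): overlapping windows
# y.shape == (10,): the value immediately after each window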

ValueError: The `batch_size` argument must not be specified for the given input type

I have created dummy data for an autoencoder model using tf.data.Dataset. I am using tensorflow-1.15.x, keras 2.3.1 and numpy 1.19.5, running on my GPU.
N = 5000
H = 128
W = 128
C = 2
train_data = np.random.randn(N,H,W,C)
train_data = tf.convert_to_tensor(train_data)
test_data = np.random.randn(500,H,W,C)
test_data = tf.convert_to_tensor(test_data)
batch_size = 1
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_data))
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
test_dataset = tf.data.Dataset.from_tensor_slices((test_data, test_data))
test_dataset = test_dataset.batch(batch_size, drop_remainder=True)
But while fitting this data to the model, I get the following error:
epochs = 5
rms = RMSprop(learning_rate=0.00002,
              rho=0.9,
              epsilon=1e-07)
model.compile(loss='mean_squared_error', optimizer=rms, metrics=['mean_squared_error'])
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
logs = "logs/"
tboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logs,
                                                 histogram_freq=1)
history = model.fit(train_dataset, epochs=epochs, batch_size=batch_size,
                    callbacks=[callback, tboard_callback],
                    validation_data=test_dataset)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-47134a802115> in <module>()
15
16 history = model.fit(train_dataset, epochs=epochs, batch_size=batch_size, callbacks=[callback, tboard_callback],
---> 17 validation_data=test_dataset)
2 frames
/tensorflow-1.15.2/python3.7/tensorflow_core/python/keras/engine/training.py in _validate_or_infer_batch_size(self, batch_size, steps, x)
1813 'The `batch_size` argument must not be specified for the given '
1814 'input type. Received input: {}, batch_size: {}'.format(
-> 1815 x, batch_size))
1816 return
1817
ValueError: The `batch_size` argument must not be specified for the given input type. Received input: <DatasetV1Adapter shapes: ((1, 128, 128, 2), (1, 128, 128, 2)), types: (tf.float64, tf.float64)>, batch_size: 1
Answers to other questions on this error suggest batching both train and validation datasets, but I have batched both already. What could be causing this error? Thanks.
Remove the batch_size argument in model.fit(); when the input is an already-batched tf.data.Dataset, Keras does not allow it:
history = model.fit(train_dataset, epochs=epochs, callbacks=[callback, tboard_callback],validation_data=test_dataset)
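If you want a different effective batch size, change it in the dataset pipeline rather than in fit(); for example (the value 8 here is arbitrary):
batch_size = 8  # applied in the pipeline, not passed to fit()
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_data)).batch(batch_size, drop_remainder=True)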

Unable to interpret an argument of type tensorflow.python.data.ops.dataset_ops.PrefetchDataset as a TFF value in iterative process

I'm trying to run a classification simulation in tff, but I'm getting this error:
TypeError: Unable to interpret an argument of type tensorflow.python.data.ops.dataset_ops.PrefetchDataset as a TFF value.
Here is the code I'm using
client_lr = 1e-3
server_lr = 1e-1
NUM_ROUNDS = 200
NUM_EPOCHS = 5
BATCH_SIZE = 2048
EPOCHS = 400
TH = 0.5

def base_model():
    return Sequential([
        Dense(256, activation='relu', input_shape=(x_train.shape[-1],)),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])

client_train_dataset = collections.OrderedDict()
for i in range(1, total_clients+1):
    client_name = "client_" + str(i)
    start = samples_per_set * (i-1)
    end = samples_per_set * i
    data = collections.OrderedDict((('y', y_train[start:end]), ('x', x_train[start:end])))
    client_train_dataset[client_name] = data

train_dataset = tff.simulation.FromTensorSlicesClientData(client_train_dataset)
sample_dataset = train_dataset.create_tf_dataset_for_client(train_dataset.client_ids[0])
sample_element = next(iter(sample_dataset))

PREFETCH_BUFFER = 10
SHUFFLE_BUFFER = samples_per_set

def preprocess(dataset):
    def batch_format_fn(element):
        return collections.OrderedDict(
            x=element['x'],
            y=tf.reshape(element['y'], [-1, 1]))
    return dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE).map(batch_format_fn).prefetch(PREFETCH_BUFFER)

preprocessed_sample_dataset = preprocess(sample_dataset)
sample_batch = tf.nest.map_structure(lambda x: x.numpy(), next(iter(preprocessed_sample_dataset)))

def make_federated_data(client_data, client_ids):
    return [preprocess(client_data.create_tf_dataset_for_client(x)) for x in client_ids]

federated_train_data = make_federated_data(train_dataset, train_dataset.client_ids)

def model_tff():
    model = base_model()
    return tff.learning.from_keras_model(
        model,
        input_spec=preprocessed_sample_dataset.element_spec,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[
            tfa.metrics.F1Score(num_classes=1, threshold=TH),
            keras.metrics.Precision(name="precision", thresholds=TH),
            keras.metrics.Recall(name="recall", thresholds=TH)
        ])

iterative_process = tff.learning.build_federated_averaging_process(
    model_tff,
    client_optimizer_fn=lambda: optimizers.Adam(learning_rate=client_lr),
    server_optimizer_fn=lambda: optimizers.SGD(learning_rate=server_lr))
state = iterative_process.initialize()

federated_model = None
for round_num in range(1, NUM_ROUNDS+1):
    state, tff_metrics = iterative_process.next(state, federated_train_data)  # THE ERROR IS HERE
    federated_model = base_model()
    federated_model.compile(optimizer=optimizers.Adam(learning_rate=client_lr),
                            loss=tf.keras.losses.BinaryCrossentropy(),
                            metrics=[
                                tfa.metrics.F1Score(num_classes=1, threshold=TH),
                                keras.metrics.Precision(name="precision", thresholds=TH),
                                keras.metrics.Recall(name="recall", thresholds=TH)
                            ])
    state.model.assign_weights_to(model=federated_model)
    federated_result = federated_model.evaluate(x_val, y_val, verbose=1, return_dict=True)
    federated_test = federated_model.evaluate(x_test, y_test, verbose=1, return_dict=True)
I'm using this creditcard dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud
The federated_train_data is a list of <PrefetchDataset shapes: OrderedDict([(x, (None, 29)), (y, (None, 1))]), types: OrderedDict([(x, tf.float64), (y, tf.int64)])>, just like in the tutorial Federated Learning for Image Classification on the TensorFlow Federated website.
This might be issue#918. Does this only occur when running in Google Colab? What version of TFF is being used?
Commit#4e57386 is believed to have fixed this, which is now part of the tensorflow-federated-nightly pip package.
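If so, upgrading to the nightly package mentioned above should pick up the fix:
pip install --upgrade tensorflow-federated-nightly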

model.fit on keras.sequential when using tf.data.Dataset raises a ValueError

I am trying to build my first classifier on TensorFlow 1.10, using a tf.data.Dataset as input to a keras.Sequential model, but the fit method returns the following error:
ValueError: Error when checking target: expected dense_1 to have 2 dimensions, but got array with shape (None,)
First I initialized two tf.data.Datasets with the filenames of my dataset:
#Initialize dataset directories location and parameters
image_size=50
batch_size=10
mortys_file_pattern = r'C:\Users\Jonas\Downloads\mortys\*'
ricks_file_pattern = r'C:\Users\Jonas\Downloads\ricks\*'
#Each tensor in those dataset will be a filename for a specific image
mortys_dataset = tf.data.Dataset.list_files(mortys_file_pattern)
ricks_dataset = tf.data.Dataset.list_files(ricks_file_pattern)
Then I used the map method to prepare my datasets
#Now, each dataset entry will contain 2 tensors: image,label
mortys_dataset.map(lambda filename: load_resize_label(filename, "morty"))
ricks_dataset.map(lambda filename: load_resize_label(filename, "rick"))
def load_resize_label(filename, label):
    image_string = tf.read_file(filename)
    image_decoded = tf.image.decode_jpeg(image_string)
    image_resized = tf.image.resize_images(image_decoded, [image_size, image_size])
    image_resized = image_resized/255.0
    return image_resized, tf.convert_to_tensor(label)
Then, I concatenate the datasets into one final dataset and initialize the batch size
#Merge the datasets
dataset = mortys_dataset.concatenate(ricks_dataset)
dataset = dataset.batch(batch_size)
dataset = dataset.repeat()
In the end, I use the compile and fit methods of the model object:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(dataset, epochs=10, steps_per_epoch=30)
(Full code below.)
I'm using:
Windows 10 64bits
cudnn-9.0-windows10-x64-v7.2.1.38
cuda_9.0.176_win10
tensorflow-gpu 1.10.0
import tensorflow as tf
from tensorflow import keras

image_size = 50
batch_size = 10

# Reads an image from a file, decodes it into a dense tensor, resizes it
# to a fixed shape.
def load_resize_label(filename, label):
    image_string = tf.read_file(filename)
    image_decoded = tf.image.decode_jpeg(image_string)
    image_resized = tf.image.resize_images(image_decoded, [image_size, image_size])
    image_resized = image_resized/255.0
    return image_resized, tf.convert_to_tensor(label)

#Initialize dataset directories location
mortys_file_pattern = r'C:\Users\Jonas\Downloads\mortys\*'
ricks_file_pattern = r'C:\Users\Jonas\Downloads\ricks\*'

#Each tensor in those datasets will be a filename for a specific image
mortys_dataset = tf.data.Dataset.list_files(mortys_file_pattern)
ricks_dataset = tf.data.Dataset.list_files(ricks_file_pattern)

#Now, each dataset entry will contain 2 tensors: image, label
mortys_dataset = mortys_dataset.map(lambda filename: load_resize_label(filename, "morty"))
ricks_dataset = ricks_dataset.map(lambda filename: load_resize_label(filename, "rick"))

#Merge the datasets
dataset = mortys_dataset.concatenate(ricks_dataset)
dataset = dataset.batch(batch_size)
dataset = dataset.repeat()

#the CNN architecture
model = keras.Sequential([
    keras.layers.Convolution2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(image_size, image_size, 3)),
    keras.layers.MaxPool2D(pool_size=2),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(2, activation=tf.nn.softmax)
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(dataset, epochs=10, steps_per_epoch=30)
Traceback:
Traceback (most recent call last):
File "C:/Users/Jonas/PycharmProjects/learning/lesson2.py", line 47, in <module>
model.fit(dataset, epochs=10, steps_per_epoch=30)
File "C:\Users\Jonas\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1278, in fit
validation_split=validation_split)
File "C:\Users\Jonas\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training.py", line 917, in _standardize_user_data
exception_prefix='target')
File "C:\Users\Jonas\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training_utils.py", line 182, in standardize_input_data
'with shape ' + str(data_shape))
ValueError: Error when checking target: expected dense_1 to have 2 dimensions, but got array with shape (None,)
You're missing some '='s in your code. Dataset transformations return a new dataset rather than modifying the original in place, so each operation should be written as:
dataset = dataset.some_ops(...)
Here is how your code should look:
import tensorflow as tf
from tensorflow import keras

image_size = 50
batch_size = 10

# Reads an image from a file, decodes it into a dense tensor, resizes it
# to a fixed shape, and builds a one-hot label from the class name.
def load_resize_label(filename, label):
    image_string = tf.read_file(filename)
    image_decoded = tf.image.decode_jpeg(image_string)
    image_resized = tf.image.resize_images(image_decoded, [image_size, image_size])
    image_resized = image_resized/255.0
    if label == 'morty':
        label = [0, 1]
    elif label == 'rick':
        label = [1, 0]
    else:
        raise ValueError(label)
    return image_resized, tf.convert_to_tensor(label)

#Initialize dataset directories location
mortys_file_pattern = r'C:\Users\Jonas\Downloads\mortys\*'
ricks_file_pattern = r'C:\Users\Jonas\Downloads\ricks\*'

#Each tensor in those datasets will be a filename for a specific image
mortys_dataset = tf.data.Dataset.list_files(mortys_file_pattern)
ricks_dataset = tf.data.Dataset.list_files(ricks_file_pattern)

#Now, each dataset entry will contain 2 tensors: image, label
mortys_dataset = mortys_dataset.map(lambda filename: load_resize_label(filename, "morty"))
ricks_dataset = ricks_dataset.map(lambda filename: load_resize_label(filename, "rick"))

#Merge the datasets
dataset = mortys_dataset.concatenate(ricks_dataset)
dataset = dataset.batch(batch_size)
dataset = dataset.repeat()

#the CNN architecture
model = keras.Sequential([
    keras.layers.Convolution2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(image_size, image_size, 3)),
    keras.layers.MaxPool2D(pool_size=2),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(2, activation=tf.nn.softmax)
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(dataset, epochs=10, steps_per_epoch=30)
Also, I advise you to use dataset.prefetch(None) and the num_parallel_calls argument in the map function. TL;DR: it's faster.
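A sketch of those two tweaks applied to the pipeline above (the num_parallel_calls value of 4 is an assumption; tune it to your CPU core count):
#parallelize the decode/resize work across CPU cores
mortys_dataset = mortys_dataset.map(
    lambda filename: load_resize_label(filename, "morty"),
    num_parallel_calls=4)
ricks_dataset = ricks_dataset.map(
    lambda filename: load_resize_label(filename, "rick"),
    num_parallel_calls=4)
#overlap input preparation with training; if your TF version rejects None,
#use a small integer buffer such as 1 instead
dataset = mortys_dataset.concatenate(ricks_dataset).batch(batch_size).repeat().prefetch(None)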