Feature Extraction Using Attention and then applying LSTM - tensorflow

I have a data having 100 features: Partial Preview of Data
I want to implement Attention to extract 32 features from these 100 and feed those to LSTM as done similarly in this paper: https://arxiv.org/abs/1902.11074
I want to implement the same architecture as given in the paper but am not able to formulate the code for doing it.
X = data.iloc[:, 1:101].values
y = data.iloc[:, 0].values
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2,
random_state=2)
X_train = X_train.reshape(-1, 1, 100)
X_test = X_test.reshape(-1, 1, 100)
print("Training & Testing Data Shape: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Model
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(128, return_sequences=True, input_shape=(1, 100)))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.LSTM(32, return_sequences=False))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(1, activation = 'linear'))
# Compile Model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7)
model.compile(loss='mean_absolute_error',
optimizer=optimizer)
# Fitting the model
history = model.fit(X_train, y_train,
epochs=30,
batch_size=64,
verbose=1,
validation_split=0.2,
shuffle=True)
Some more details:
Training & Testing Data Shape: (1890, 1, 100) (473, 1, 100) (1890,) (473,)
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 1, 128) 117248
_________________________________________________________________
dropout (Dropout) (None, 1, 128) 0
_________________________________________________________________
lstm_1 (LSTM) (None, 32) 20608
_________________________________________________________________
dropout_1 (Dropout) (None, 32) 0
_________________________________________________________________
dense (Dense) (None, 1) 33
=================================================================
Total params: 137,889
Trainable params: 137,889
Non-trainable params: 0
_________________________________________________________________

Related

Error on custom dataset dimensions feeding transfer model in TensorFLow

Can someone explain this TensorFlow error for me, I'm having trouble understanding what I am doing wrong.
I have a dataset in Tensorflow constructed with a generator. When I test the output of the generator, output dimensions look correct (224 x 224 x 1). But when I try to train the model, I get an error:
WARNING:tensorflow:Model was constructed with shape (None, 224, 224, 1) for input
KerasTensor(type_spec=TensorSpec(shape=(None, 224, 224, 1), dtype=tf.float32,
name='input_2'), name='input_2', description="created by layer 'input_2'"),
but it was called on an input with incompatible shape (224, 224, 1, 1).
I'm unsure why the dimension of this output has an extra 1 at the end.
Here is the code to create the generator and model. df is a dataframe with file-paths to data and labels. The data are 2D matrices of variable dimensions. I'm using cv2.resize to make them 224x224 and then np.reshape to transform dimensions to (224x224x1). Then I yield the result.
def datagen_row():
# ======================== #
# Import data
# ======================== #
df = get_data()
rowsize = 224
colsize = 224
# ======================== #
#
# ======================== #
for row in range(len(df)):
data = get_data_from_filepath(df.iloc[row].file_path)
data = cv2.resize(data, dsize=(rowsize, colsize), interpolation=cv2.INTER_CUBIC)
labels = df.iloc[row].label
data = data.reshape( 224, 224, 1)
yield data, labels
dataset = tf.data.Dataset.from_generator(
datagen_row,
output_signature=(
tf.TensorSpec(shape = (int(os.getenv('rowsize')), int(os.getenv('colsize')), 1), dtype=tf.float32, name=None),
tf.TensorSpec(shape=(), dtype=tf.int64, name=None)
)
)
Testing the following I get what I expected:
iterator = iter(dataset.batch(8))
x = iterator.get_next()
x[0].shape # TensorShape([8, 224, 224, 1])
x[1].shape # TensorShape([8])
x[0] # <tf.Tensor: shape=(8, 224, 224, 1), dtype=float32, numpy=array(...
x[1] # <tf.Tensor: shape=(8,), dtype=int64, numpy=array([1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)>
I'm trying to plug this into InceptionV3 model to do a classification
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.layers import Input
from tensorflow.keras import layers
origModel = InceptionV3(weights = 'imagenet', include_top = False)
inputs = layers.Input(shape = (224, 224, 1))
modified_inputs = layers.Conv2D(3, 3, padding = 'same', activation='relu')(inputs)
x = origModel(modified_inputs)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(1024, activation = 'relu')(x)
x = layers.Dense(512, activation = 'relu')(x)
x = layers.Dense(256, activation = 'relu')(x)
x = layers.Dense(128, activation = 'relu')(x)
x = layers.Dense(64, activation = 'relu')(x)
x = layers.Dense(32, activation = 'relu')(x)
outputs = layers.Dense(2)(x)
model = tf.keras.Model(inputs, outputs)
model.summary() # 24.6 M trainable params
for layer in origModel.layers:
layer.trainable = False
model.summary() # now shows 2.8 M trainable params
model.compile(
optimizer = 'adam',
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
metrics = ['accuracy']
)
model.fit(dataset, epochs = 1, verbose = True, batch_size = 32)
Here is the output of model.summary
model.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, 224, 224, 1)] 0
conv2d_94 (Conv2D) (None, 224, 224, 3) 30
inception_v3 (Functional) (None, None, None, 2048) 21802784
global_average_pooling2d (G (None, 2048) 0
lobalAveragePooling2D)
dense (Dense) (None, 1024) 2098176
dense_1 (Dense) (None, 512) 524800
dense_2 (Dense) (None, 256) 131328
dense_3 (Dense) (None, 128) 32896
dense_4 (Dense) (None, 64) 8256
dense_5 (Dense) (None, 32) 2080
dense_6 (Dense) (None, 2) 66
=================================================================
Total params: 24,600,416
Trainable params: 2,797,632
Non-trainable params: 21,802,784
_________________________________________________________________
This code worked after changing
model.fit(dataset, epochs = 1, verbose = True, batch_size = 32)
to
model.fit(dataset.batch(2), epochs = 1, verbose = True, batch_size = 32)
So... I will have to look into using dataset.batch versus batch_size in model.fit

why can't I split my image dataset to 8:1:1?

I try to split my dataset to 8:1:1 and my dataset is in a directory, at first I try this code
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
dir,
validation_split=0.2,
subset="training",
seed=123,
image_size=(img_height, img_width),
batch_size=batch_size)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
dir,
validation_split=0.1,
subset="validation",
seed=123,
image_size=(img_height, img_width),
batch_size=batch_size)
But it doesn't do that job and just split my directory to val_ds and test_ds
after this, I use this code
# create a data generator
datagen = ImageDataGenerator()
# load and iterate training dataset
train_it = datagen.flow_from_directory(dir, target_size=(32, 32), color_mode='grayscale', class_mode='binary', batch_size=32, shuffle=True, follow_links=False, subset=None, interpolation='nearest')
# load and iterate validation dataset
val_it = datagen.flow_from_directory(dir, target_size=(32, 32), color_mode='grayscale', class_mode='binary', batch_size=32, shuffle=True, follow_links=False, subset=None, interpolation='nearest')
# load and iterate test dataset
test_it = datagen.flow_from_directory(dir, target_size=(32, 32), color_mode='grayscale', class_mode='binary', batch_size=32, shuffle=True, follow_links=False, subset=None, interpolation='nearest')
This code also has a problem with my model so when I use this code My model summary will be like this
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
rescaling_1 (Rescaling) (None, None, None, None) 0
_________________________________________________________________
conv2d_3 (Conv2D) (None, None, None, 32) 320
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, None, None, 32) 0
_________________________________________________________________
conv2d_4 (Conv2D) (None, None, None, 32) 9248
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, None, None, 32) 0
_________________________________________________________________
conv2d_5 (Conv2D) (None, None, None, 32) 9248
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, None, None, 32) 0
_________________________________________________________________
dropout_1 (Dropout) (None, None, None, 32) 0
_________________________________________________________________
flatten_1 (Flatten) (None, None) 0
_________________________________________________________________
dense_2 (Dense) (None, 128) 16512
_________________________________________________________________
dense_3 (Dense) (None, 26) 3354
=================================================================
Total params: 38,682
Trainable params: 38,682
Non-trainable params: 0
_________________________________________________________________
and this is my model
num_classes = 26
model = tf.keras.Sequential([
tf.keras.layers.experimental.preprocessing.Rescaling(1./255),
tf.keras.layers.Conv2D(32, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Conv2D(32, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Conv2D(32, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
layers.Dropout(0.2),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(num_classes)
])
model.compile(
optimizer='adam',
loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
So I need to know How can I split my data without any problem?
You can do the following
import glob # To get the whole path for all the images
'''
Let's consider that your images lie inside 2 folders - 'a' and 'b' which are inside your 'dir' folder. To get paths to each of those images you can use
the below code
'''
image_paths_a = glob.glob('./dir/a/*.jpg') # .jpg if the files ends with jpg
image_paths_b = glob.glob('./dir/b/*.jpg') # to get images from b
images_total = image_paths_a + image_paths_b
# In case you have other folders you can also do this
# to get all images inside all folder in 'dir' folder.
images_total = glob.glob('./dir/*/*.jpg')
# Now get the labels corresponding to these images
# If you have labeled as folder names then you can do it like this
image_labels = [i.split('/')[-2] for i in images_total]
'''
After doing the above you have 2 lists -> 1) Image paths 2) corresponding labels and now you can just use the 'sklearn.model_selection.train_test_split' to get your splits
'''
from sklearn.model_selection import train_test_split
# To set train data and get rest 20% for further split
xtrain, xtest, ytrain, ytest = trian_test_split(images_total,
image_labels,
stratify=image_labels,
random_state=1234,
test_size=0.2)
# get 10%-10% of original data
xvalid, xtest, yvalid, ytest= trian_test_split(xtest,
ytest,
stratify=ytest,
random_state=1234,
test_size=0.5)
'''
Now you can just create a dataset but before that you will create a function to read images from image paths.
'''
def read_img(path, label):
file = tf.io.read_file(path)
img = tf.image.decode_png(file)
# dim1 and dim2 are your desired dimensions
img = tf.image.resize(img, (dim1, dim2))
return img, label
train_dataset = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
train_dataset = train_dataset.map(read_img).batch(batch_size)
valid_dataset = tf.data.Dataset.from_tensor_slices((xvalid, yvalid))
valid_dataset = valid_dataset.map(read_img).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((xtest, ytest))
test_dataset = test_dataset.map(read_img).batch(batch_size)
# Now you just need to train your model
model.fit(train_dataset, epochs=5, validation_data=valid_dataset)

Sequential VGG16 model, graph disconnected error

I have a sequential model with a VGG16 at the top.:
def rescale(x):
return x/65535.
base_model = tf.keras.applications.VGG16(
include_top=True, weights=None, input_tensor=None, input_shape=(224,224,1),
pooling=None, classes=102, classifier_activation='softmax')
model = tf.keras.Sequential([
tf.keras.Input(shape=(None, None, 1)),
tf.keras.layers.Lambda(rescale),
tf.keras.layers.experimental.preprocessing.Resizing(224, 224),
tf.keras.layers.experimental.preprocessing.RandomFlip(mode='horizontal_and_vertical', seed=42),
base_model
])
Output model.summary():
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lambda (Lambda) (None, None, None, 1) 0
_________________________________________________________________
resizing (Resizing) (None, 224, 224, 1) 0
_________________________________________________________________
random_flip (RandomFlip) (None, 224, 224, 1) 0
_________________________________________________________________
vgg16 (Functional) (None, 102) 134677286
=================================================================
Total params: 134,677,286
Trainable params: 134,677,286
Non-trainable params: 0
Now I want to create a new model with two outputs:
vgg_model = model.layers[3]
last_conv_layer = vgg_model.get_layer('block5_conv3')
new_model = tf.keras.models.Model(inputs=[model.inputs], outputs=[last_conv_layer.output, model.output])
But I get this error:
ValueError: Graph disconnected: cannot obtain value for tensor Tensor("input_1_6:0", shape=(None, 224, 224, 1), dtype=float32) at layer "block1_conv1". The following previous layers were accessed without issue: []
What am I missing here?
Given a fitted model in this form:
def rescale(x):
return x/65535.
base_model = tf.keras.applications.VGG16(
include_top=True, weights=None, input_tensor=None, input_shape=(224,224,1),
pooling=None, classes=102, classifier_activation='softmax')
model = tf.keras.Sequential([
tf.keras.Input(shape=(None, None, 1)),
tf.keras.layers.Lambda(rescale),
tf.keras.layers.experimental.preprocessing.Resizing(224, 224),
tf.keras.layers.experimental.preprocessing.RandomFlip(mode='horizontal_and_vertical', seed=42),
base_model
])
### model.fit(...)
You can wrap your vgg in a Model that returns all the outputs you need
new_model = Model(inputs=model.layers[3].input,
outputs=[model.layers[3].output,
model.layers[3].get_layer('block5_conv3').output])
inp = tf.keras.Input(shape=(None, None, 1))
x = tf.keras.layers.Lambda(rescale)(inp)
x = tf.keras.layers.experimental.preprocessing.Resizing(224, 224)(x)
outputs = new_model(x)
new_model = Model(inp, outputs)
The summary of new_model:
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_49 (InputLayer) [(None, None, None, 1)] 0
_________________________________________________________________
lambda_25 (Lambda) (None, None, None, 1) 0
_________________________________________________________________
resizing_25 (Resizing) (None, 224, 224, 1) 0
_________________________________________________________________
functional_47 (Functional) [(None, 102), (None, 14, 134677286
=================================================================
Total params: 134,677,286
Trainable params: 134,677,286
Non-trainable params: 0

How to use model subclassing in Keras?

Having the following model written in the sequential API:
config = {
'learning_rate': 0.001,
'lstm_neurons':32,
'lstm_activation':'tanh',
'dropout_rate': 0.08,
'batch_size': 128,
'dense_layers':[
{'neurons': 32, 'activation': 'relu'},
{'neurons': 32, 'activation': 'relu'},
]
}
def get_model(num_features, output_size):
opt = Adam(learning_rate=0.001)
model = Sequential()
model.add(Input(shape=[None,num_features], dtype=tf.float32, ragged=True))
model.add(LSTM(config['lstm_neurons'], activation=config['lstm_activation']))
model.add(BatchNormalization())
if 'dropout_rate' in config:
model.add(Dropout(config['dropout_rate']))
for layer in config['dense_layers']:
model.add(Dense(layer['neurons'], activation=layer['activation']))
model.add(BatchNormalization())
if 'dropout_rate' in layer:
model.add(Dropout(layer['dropout_rate']))
model.add(Dense(output_size, activation='sigmoid'))
model.compile(loss='mse', optimizer=opt, metrics=['mse'])
print(model.summary())
return model
When using a distributed training framework, I need to convert the syntax to use model subclassing instead.
I've looked at the docs but couldn't figure out how to do it.
Here is one equivalent subclassed implementation. Though I didn't test.
import tensorflow as tf
# your config
config = {
'learning_rate': 0.001,
'lstm_neurons':32,
'lstm_activation':'tanh',
'dropout_rate': 0.08,
'batch_size': 128,
'dense_layers':[
{'neurons': 32, 'activation': 'relu'},
{'neurons': 32, 'activation': 'relu'},
]
}
# Subclassed API Model
class MySubClassed(tf.keras.Model):
def __init__(self, output_size):
super(MySubClassed, self).__init__()
self.lstm = tf.keras.layers.LSTM(config['lstm_neurons'],
activation=config['lstm_activation'])
self.bn = tf.keras.layers.BatchNormalization()
if 'dropout_rate' in config:
self.dp1 = tf.keras.layers.Dropout(config['dropout_rate'])
self.dp2 = tf.keras.layers.Dropout(config['dropout_rate'])
self.dp3 = tf.keras.layers.Dropout(config['dropout_rate'])
for layer in config['dense_layers']:
self.dense1 = tf.keras.layers.Dense(layer['neurons'],
activation=layer['activation'])
self.bn1 = tf.keras.layers.BatchNormalization()
self.dense2 = tf.keras.layers.Dense(layer['neurons'],
activation=layer['activation'])
self.bn2 = tf.keras.layers.BatchNormalization()
self.out = tf.keras.layers.Dense(output_size,
activation='sigmoid')
def call(self, inputs, training=True, **kwargs):
x = self.lstm(inputs)
x = self.bn(x)
if 'dropout_rate' in config:
x = self.dp1(x)
x = self.dense1(x)
x = self.bn1(x)
if 'dropout_rate' in config:
x = self.dp2(x)
x = self.dense2(x)
x = self.bn2(x)
if 'dropout_rate' in config:
x = self.dp3(x)
return self.out(x)
# A convenient way to get model summary
# and plot in subclassed api
def build_graph(self, raw_shape):
x = tf.keras.layers.Input(shape=(None, raw_shape),
ragged=True)
return tf.keras.Model(inputs=[x],
outputs=self.call(x))
Build and compile the mdoel
s = MySubClassed(output_size=1)
s.compile(
loss = 'mse',
metrics = ['mse'],
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001))
Pass some tensor to create weights (check).
raw_input = (16, 16, 16)
y = s(tf.ones(shape=(raw_input)))
print("weights:", len(s.weights))
print("trainable weights:", len(s.trainable_weights))
weights: 21
trainable weights: 15
Summary and Plot
Summarize and visualize the model graph.
s.build_graph(16).summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, None, 16)] 0
_________________________________________________________________
lstm (LSTM) (None, 32) 6272
_________________________________________________________________
batch_normalization (BatchNo (None, 32) 128
_________________________________________________________________
dropout (Dropout) (None, 32) 0
_________________________________________________________________
dense_2 (Dense) (None, 32) 1056
_________________________________________________________________
batch_normalization_3 (Batch (None, 32) 128
_________________________________________________________________
dropout_1 (Dropout) (None, 32) 0
_________________________________________________________________
dense_3 (Dense) (None, 32) 1056
_________________________________________________________________
batch_normalization_4 (Batch (None, 32) 128
_________________________________________________________________
dropout_2 (Dropout) (None, 32) 0
_________________________________________________________________
dense_4 (Dense) (None, 1) 33
=================================================================
Total params: 8,801
Trainable params: 8,609
Non-trainable params: 192
tf.keras.utils.plot_model(
s.build_graph(16),
show_shapes=True,
show_dtype=True,
show_layer_names=True,
rankdir="TB",
)

CNN implementation using Keras and Tensorflow

I have created a CNN model using Keras and I am training it on a MNIST dataset. I got a reasonable accuracy around 98%, which is what I expected:
model = Sequential()
model.add(Conv2D(64, 5, activation="relu", input_shape=(28, 28, 1)))
model.add(MaxPool2D())
model.add(Conv2D(64, 5, activation="relu"))
model.add(MaxPool2D())
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adam',
loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(data.x_train, data.y_train,
batch_size=256, validation_data=(data.x_test, data.y_test))
Now I want to build the same model, but using vanilla Tensorflow, here is how I did that:
X = tf.placeholder(shape=[None, 784], dtype=tf.float32, name="X")
Y = tf.placeholder(shape=[None, 10], dtype=tf.float32, name="Y")
net = tf.reshape(X, [-1, 28, 28, 1])
net = tf.layers.conv2d(
net, filters=64, kernel_size=5, padding="valid", activation=tf.nn.relu)
net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)
net = tf.layers.conv2d(
net, filters=64, kernel_size=5, padding="valid", activation=tf.nn.relu)
net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)
net = tf.contrib.layers.flatten(net)
net = tf.layers.dense(net, name="dense1", units=256, activation=tf.nn.relu)
model = tf.layers.dense(net, name="output", units=10)
And here is how I train/test it:
loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=model)
opt = tf.train.AdamOptimizer().minimize(loss)
accuracy = tf.cast(tf.equal(tf.argmax(model, 1), tf.argmax(Y, 1)), tf.float32)
with tf.Session() as sess:
tf.global_variables_initializer().run()
for batch in range(data.get_number_of_train_batches(batch_size)):
x, y = data.get_next_train_batch(batch_size)
sess.run([loss, opt], feed_dict={X: x, Y: y})
for batch in range(data.get_number_of_test_batches(batch_size)):
x, y = data.get_next_test_batch(batch_size)
sess.run(accuracy, feed_dict={X: x, Y: y})
But the resulting accuracy of the model dropped to ~80%. What are the principal differences between my implementation of that model using Keras and Tensorflow ? Why the accuracy varies so much ?
I don't see any mistakes in your code. Note that your current model is heavily parameterized for such a simple problem because of the Dense layers, which introduce over 260k trainable parameters:
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_3 (Conv2D) (None, 24, 24, 64) 1664
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 12, 12, 64) 0
_________________________________________________________________
conv2d_4 (Conv2D) (None, 8, 8, 64) 102464
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 4, 4, 64) 0
_________________________________________________________________
flatten_2 (Flatten) (None, 1024) 0
_________________________________________________________________
dense_2 (Dense) (None, 256) 262400
_________________________________________________________________
dense_3 (Dense) (None, 10) 2570
=================================================================
Total params: 369,098
Trainable params: 369,098
Non-trainable params: 0
_________________________________________________________________
Below, I will run your code with:
minor adaptations to make the code work with the MNIST dataset in keras.datasets
a simplified model: basically I remove the 256-node Dense layer, drastically reducing the number of trainable parameters, and introduce some dropout for regularization.
With these changes, both models achieve 90%+ validation set accuracy after the first epoch. So it seems the problem you encountered has to do with an ill-posed optimization problem which leads to highly variable outcomes, and not with a bug in your code.
# Import the datasets
import numpy as np
from keras.datasets import mnist
from keras.utils import to_categorical
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Add batch dimension
x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)
# One-hot encode the labels
y_train = to_categorical(y_train, num_classes=None)
y_test = to_categorical(y_test, num_classes=None)
batch_size = 64
# Fit model using Keras
import keras
import numpy as np
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from keras.models import Sequential
model = Sequential()
model.add(Conv2D(32, 5, activation="relu", input_shape=(28, 28, 1)))
model.add(MaxPool2D())
model.add(Conv2D(32, 5, activation="relu"))
model.add(MaxPool2D())
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adam',
loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train,
batch_size=32, validation_data=(x_test, y_test), epochs=1)
Result:
Train on 60000 samples, validate on 10000 samples
Epoch 1/1
60000/60000 [==============================] - 35s 583us/step - loss: 1.5217 - acc: 0.8736 - val_loss: 0.0850 - val_acc: 0.9742
Note that the number of trainable parameters is now just a fraction of the amount in your model:
model.summary()
Layer (type) Output Shape Param #
=================================================================
conv2d_3 (Conv2D) (None, 24, 24, 32) 832
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 12, 12, 32) 0
_________________________________________________________________
conv2d_4 (Conv2D) (None, 8, 8, 32) 25632
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 4, 4, 32) 0
_________________________________________________________________
flatten_2 (Flatten) (None, 512) 0
_________________________________________________________________
dropout_1 (Dropout) (None, 512) 0
_________________________________________________________________
dense_2 (Dense) (None, 10) 5130
=================================================================
Total params: 31,594
Trainable params: 31,594
Non-trainable params: 0
Now, doing the same with TensorFlow:
# Fit model using TensorFlow
import tensorflow as tf
X = tf.placeholder(shape=[None, 28, 28, 1], dtype=tf.float32, name="X")
Y = tf.placeholder(shape=[None, 10], dtype=tf.float32, name="Y")
net = tf.layers.conv2d(
X, filters=32, kernel_size=5, padding="valid", activation=tf.nn.relu)
net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)
net = tf.layers.conv2d(
net, filters=32, kernel_size=5, padding="valid", activation=tf.nn.relu)
net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)
net = tf.contrib.layers.flatten(net)
net = tf.layers.dropout(net, rate=0.25)
model = tf.layers.dense(net, name="output", units=10)
loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=model)
opt = tf.train.AdamOptimizer().minimize(loss)
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(model, 1), tf.argmax(Y, 1)), tf.float32))
with tf.Session() as sess:
tf.global_variables_initializer().run()
L = []
l_ = 0
for i in range(x_train.shape[0] // batch_size):
x, y = x_train[i*batch_size:(i+1)*batch_size],\
y_train[i*batch_size:(i+1)*batch_size]
l, _ = sess.run([loss, opt], feed_dict={X: x, Y: y})
l_ += np.mean(l)
L.append(l_ / (x_train.shape[0] // batch_size))
print('Training loss: {:.3f}'.format(L[-1]))
acc = []
for j in range(x_test.shape[0] // batch_size):
x, y = x_test[j*batch_size:(j+1)*batch_size],\
y_test[j*batch_size:(j+1)*batch_size]
acc.append(sess.run(accuracy, feed_dict={X: x, Y: y}))
print('Test set accuracy: {:.3f}'.format(np.mean(acc)))
Result:
Training loss: 0.519
Test set accuracy: 0.968
Possible improvement of your models.
I used CNN networks on different problems and always got good effectiveness improvements with regularization techniques, the best ones with dropout.
I suggest to use Dropout on the Dense layers and in case with lower probability on the convolutional ones.
Also data augmentation on the input data is very important, but applicability depends on the problem domain.
P.s: in one case I had to change the optimization from Adam to SGD with Momentum. So, playing with the optimization makes sense. Also Gradient clipping can be considered when your networks starves and doesn't improve effectiveness, may be a numeric issue.