tensorflow GPU training doesn't take place - tensorflow

I am using the following code (once on CPU (without the "disable_eager_execution" part)) and yet once more with GPU.
On the CPU training, one epoch takes 12 Hours but the loss changes from batch to batch and I see that training takes place.
On the GPU version. Nothing happens. Training one epoch takes around 1 Hour but loss and accuracy stay the same.
PLEASE HELP ME UNDERSTAND WHAT DO I DO WRONG...
I am running this code using aws sage maker (ml.g4dn.4xlarge)
Code:
import numpy as np
import pandas as pd
import os
import datetime
import tensorflow as tf
import re
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.python.framework.ops import disable_eager_execution
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')), '\n')
disable_eager_execution()
# Read data
# read dictionaries:
# company_dict:
company_df = pd.read_csv("/home/ec2-user/SageMaker/company_similarity/data/company_dict.csv", sep='\t', header=None)
company_df.columns = ['company_id', 'idx']
# payee dict
payee_df = pd.read_csv("/home/ec2-user/SageMaker/company_similarity/data/cleaned_up_payee_dict.csv", sep='\t', header=None)
payee_df.columns = ['payee', 'idx']
# Read raw data
BATCH_SIZE = 32
raw_data = tf.data.experimental.make_csv_dataset(
"/home/ec2-user/SageMaker/company_similarity/data/training_data.csv",
column_names=['company_id', 'payee', 'label'],
select_columns=['company_id', 'payee', 'label'],
field_delim='\t',
column_defaults=[tf.string, tf.string, tf.int32],
batch_size=BATCH_SIZE,
label_name='label',
na_value="?",
num_epochs=1,
ignore_errors=True,
)
class PreprocessingFeatures(object):
def __init__(self, company_idx, payee_idx):
self.payee_idx = payee_idx
self.company_idx = company_idx
self.symbols = '!"$%&\'\?()*+,-./:;<=>?[\\]^_`{|}~a-zA-Z0-9 '
self.payee_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=payee_idx,
mask_token=None,
num_oov_indices=1
)
self.company_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=company_idx,
mask_token=None,
num_oov_indices=1
)
def __call__(self, features, labels):
payee = self.payee_lookup(features['payee'])
company = self.company_lookup(features['company_id'])
return (company, payee), labels
payee_list = list(payee_df['payee'])
company_list = [str(si) for si in list(company_df['company_id'])]
# ************ START TRAINING ************ #
log_dir = '/home/ec2-user/SageMaker/company_similarity/models/logs/fit/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
checkpoint_dir = '/home/ec2-user/SageMaker/company_similarity/models/embedding_checkpoints'
file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
file_writer.set_as_default()
def present_topK(model, listA, item_title='eBay', topK=10):
'''
show top 10 similar items using model embedding
:param model: the actual model
:param item_index: dictionary with item name:index format
:param index_item: dictionary with index:item name format
:param item_title: title text
:return: table to print (string)
'''
assert item_title in listA, "Item not in Vocabulary"
emb = model.layers[2].get_weights()[0]
# we started from 1 not zero on dictionary
score = cosine_similarity(emb[listA.index(item_title)+1].reshape(1, -1), emb)[0]
similar_items = np.argsort(score)[::-1][:topK]
res = {'payee': [], 'score': []}
for i in similar_items:
res['payee'] += [listA[i-1]]
res['score'] += [score[i]]
return "\n".join("{}\t{}".format(k, v) for k, v in res.items())
class GenerateExamplesCallback(tf.keras.callbacks.Callback):
def __init__(self):
self.step = 0
def on_epoch_end(self, epoch, logs=None):
self.step += 1
self.model.save('/home/ec2-user/SageMaker/company_similarity/models/embedding_checkpoints/model_{}'.format(epoch))
sim_table = present_topK(self.model, payee_list)
print("\nSimilar Items to 'eBay': ", sim_table)
with file_writer.as_default():
tf.summary.text('Similarity sanity check', data=tf.convert_to_tensor(sim_table), step=epoch)
def on_batch_end(self, batch, logs=None):
if batch % 1000 == 0:
sim_table = present_topK(self.model, payee_list)
print("\nSimilar Items to 'eBay': ", sim_table)
with file_writer.as_default():
tf.summary.text('Similarity sanity check', data=tf.convert_to_tensor(sim_table), step=batch)
print('TensorBoard logging folder: ', log_dir)
print("checkpoint_dir:", checkpoint_dir)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True, save_freq=200000)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, update_freq=10000)
callbacks = [checkpoint_callback, tensorboard_callback, GenerateExamplesCallback()]
# read the data
# train_data = raw_data.map(PreprocessingFeatures(company_list, payee_list)).shuffle(buffer_size=10000).repeat()
train_data = raw_data.map(PreprocessingFeatures(company_list, payee_list)).repeat()
# examples
# next(iter(raw_data.take(1)))
# next(iter(train_data))
# wc -l <filename> on terminal
fileLen = 5851184
STEPS_PER_EPOCH = (fileLen // BATCH_SIZE) + 1
# STEPS_PER_EPOCH = 1000
def build_model(company_embedding=128, payee_embedding=128, loss=tf.keras.losses.binary_crossentropy):
company_input = tf.keras.layers.Input(name='company_input', shape=(1,))
payee_input = tf.keras.layers.Input(name='payee_input', shape=(1,))
company_emb = tf.keras.layers.Embedding(name='company_embedding',
input_dim=len(company_list)+1,
output_dim=company_embedding)(company_input)
company_emb = tf.keras.layers.Flatten()(company_emb)
payee_emb = tf.keras.layers.Embedding(name='payee_embedding',
input_dim=len(payee_list)+1,
output_dim=payee_embedding)(payee_input)
payee_emb = tf.keras.layers.Flatten()(payee_emb)
merged = tf.keras.layers.Dot(name='dot', normalize=True, axes=1)([payee_emb, company_emb])
merged = tf.keras.layers.Reshape(target_shape = [1])(merged)
x = tf.keras.layers.Dense(1, activation='sigmoid')(merged)
# x = tf.keras.layers.Concatenate()([item_emb, device_emb])
# x = tf.keras.layers.Dense(128, activation='relu')(x)
# x = tf.keras.layers.Dropout(0.5)(x)
# x = tf.keras.layers.Dense(64, activation='relu')(x)
# x = tf.keras.layers.Dropout(0.5)(x)
# x = tf.keras.layers.Dense(32, activation='relu')(x)
# # x = tf.keras.layers.BatchNormalization()(x)
# x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=(company_input, payee_input), outputs=x)
model.compile(
loss=loss,
optimizer='adam',
metrics=['accuracy'],
)
return model
model = build_model()
EPOCHS = 5
r = model.fit(train_data,
epochs=EPOCHS,
steps_per_epoch=STEPS_PER_EPOCH,
callbacks=callbacks
)
model.save("/home/ec2user/SageMaker/company_similarity/models/models/embedding_model_final.h5")
print("Training is completed")

Related

Learning a simple pattern with RNN

I am trying to make RNN in tensorflow capture a basic pattern in a simple time series in hours. I am trying to solve a bigger problem involving count time series of customer demand.
The simple time series is as follows:
Every 24 hours (1 day) there will be a small integer number either 1 or 2 from a random uniform distirbution.
In between these 24 hours will be zero values.
Every 168 hours (7 days) there will be a high integer number (5 or 6 or 7 or 8 or 9) from a random uniform distirbution.
I tried following the code at https://r2rt.com/recurrent-neural-networks-in-tensorflow-i.html using dynamic_rnn.
Is my test data correct? How can I feed the batches of output from previous times step as input to the next time step? I have 5 hyperparamters to play with
batch_size = 8 num_steps = 192 state_size = 5 learning_rate = 0.00001
num_epochs=1
However, after training each time with the same hyperparameters I am getting different results. Each time the training error is very small. The different results seem quite random (local minima probably??). orange is actual, blue is predicted.
Can my test batch start at any point in the sequence? Does the RNN learn the number of zeros inbetween non-zero values? if the test batch starts with a small non-zero number then the RNN should know that it should output 23 zero value steps after this and then after 167 steps output a high non-zero value. if I start my test sequence at 0 then it should wait 23 more zero value steps before outputing a small non-zero value and after 167 steps output a high non-zero value?
or does it learn another pattern? I am not sure if my method of testing is correct?
Is it better to just pass one time step integer value and let the network generate the remaining time steps integer values by passing the current time step output as input to the next time step?
Currently, I just take a random sequence of X generated by the same method for training and check if my output Y is the shifted version of X by 1 time step. Could you please explain?
My code is given below. you can just copy and paste and it should run. Basically, I just generate the data, build the model, train the network and test it.
from data_generator import gen_data
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import numpy as np
import time
import matplotlib.pyplot as plt
num_classes = 11
batch_size = 8
num_steps = 192
state_size = 5
learning_rate = 0.00001
num_epochs=1
dem = gen_data(len=1576)
def gen_batch(dem, batch_size, num_steps):
raw_x = dem[:-1]
raw_y = dem[1:]
data_length = len(raw_x)
num_of_win = data_length - num_steps - 1 # 1382 windows
batch_partition_length = num_of_win // batch_size # 172 batches
data_x = []
data_y = []
j=0
for i in range(batch_partition_length):
windows_x = []
windows_y = []
k=0
while(k<batch_size):
windows_x.append( raw_x[ j:num_steps + j] )
windows_y.append( raw_y[ j:num_steps + j] )
j+=1
k+=1
data_x.append(np.array(windows_x)) # each batch is stacked horizontally.
data_y.append(np.array(windows_y))
for windows_x, windows_y in zip(data_x,data_x):
x = windows_x
y = windows_y
z = x.shape
z = y.shape
yield (x, y)
def gen_epoch(num_epochs,batch_size, num_steps):
for n in range(num_epochs):
yield gen_batch(dem, batch_size, num_steps)
def reset_graph():
# if 'sess' in globals() and sess:
# sess.close()
tf.compat.v1.reset_default_graph()
def build_RNN_model(batch_size, num_classes,state_size,num_steps,learning_rate):
reset_graph()
x = tf.compat.v1.placeholder(dtype=tf.int32, shape=(batch_size,num_steps))
y = tf.compat.v1.placeholder(dtype=tf.int32, shape=(batch_size,num_steps))
init_state = tf.zeros([batch_size, state_size])
# with tf.compat.v1.variable_scope('rnn_cell'):
# W = tf.compat.v1.get_variable('inp_state_w', shape=(num_classes+state_size,state_size),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
# b = tf.compat.v1.get_variable('inp_state_b', shape=(state_size),initializer=tf.compat.v1.initializers.constant(0.0) )
# def rnn_cell(rnn_input,state):
# with tf.compat.v1.variable_scope('rnn_cell', reuse=True):
# W = tf.compat.v1.get_variable('inp_state_w', shape=(num_classes+state_size,state_size),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
# b = tf.compat.v1.get_variable('inp_state_b', shape=(state_size),initializer=tf.compat.v1.initializers.constant(0.0) )
# return tf.tanh( tf.matmul( tf.concat([rnn_input,state], axis=1),W) + b )
#cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size, reuse=True, name='rnn_cell' )
rnn_inputs = tf.one_hot(x, num_classes)
cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size)
rnn_outputs, final_state = tf.compat.v1.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)
with tf.compat.v1.variable_scope('output'):
W = tf.compat.v1.get_variable('out_state_w', shape=(state_size,num_classes),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
b = tf.compat.v1.get_variable('out_state_b', shape=(num_classes),initializer=tf.compat.v1.initializers.constant(0.0) )
logits = tf.reshape( tf.compat.v1.matmul(tf.reshape(rnn_outputs, [-1, state_size]), W) + b, [batch_size, num_steps, num_classes])
predictions = tf.compat.v1.nn.softmax(logits)
tru_labels = y
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
total_loss = tf.reduce_mean(losses)
train_step = tf.compat.v1.train.AdagradOptimizer(learning_rate).minimize(total_loss)
return dict(
x=x,
y=y,
final_state = final_state,
total_loss = total_loss,
train_step = train_step,
init_state = init_state,
predictions = predictions,
tru_labels = tru_labels,
saver = tf.compat.v1.train.Saver()
)
def train_network(g,num_epochs, batch_size,num_steps, dem,save=' '):
tf.compat.v1.set_random_seed(2345)
with tf.compat.v1.Session() as sess:
sess.run(tf.compat.v1.initialize_all_variables())
training_losses = []
for idx, epoch in enumerate(gen_epoch(num_epochs,batch_size, num_steps)):
training_loss = 0
steps=0 # number of batches
training_state = None
for X,Y in epoch:
steps+=1
feed_dict = {g['x'] : X, g['y'] : Y}
if training_state is not None:
feed_dict[g['init_state']] = training_state
training_loss_, training_state, train_step = \
sess.run([g['total_loss'], g['final_state'], g['train_step']], feed_dict)
training_loss+=training_loss_
print("Average training loss for Epoch", idx, ":", training_loss/steps)
print('steps',steps)
training_losses.append(training_loss/steps)
if isinstance(save, str):
g['saver'].save(sess, save)
e = gen_batch(dem, batch_size, num_steps)
e = gen_batch(dem, batch_size, num_steps)
for X,Y in e:
tru_labels, predictions = \
sess.run([g['tru_labels'], g['predictions']], feed_dict={g['x'] : X, g['y'] : Y, g['init_state'] : training_state})
pred = np.argmax(predictions, axis=2)
print(pred.shape)
pred = pred[0]
print('predictions',pred)
tru_labels = tru_labels[0]
print('tru_labels',tru_labels )
plt.plot(pred)
plt.plot(tru_labels)
plt.show()
return training_loss
g = build_RNN_model(batch_size, num_classes,state_size,num_steps,learning_rate)
t = time.time()
train_network(g, num_epochs,batch_size,num_steps, dem,save='saver' )
print("It took", time.time() - t, "seconds to train for 3 epochs.")
I have written some keras code with a single RNN cell and a dense layer to capture the following two patterns which is similar to the two patterns above. However, the distribution of magnitudes of high vehicles and low vehicles that are drawn from a categorical distribution below are not being represented in the test output.
Categorical Random Variable, x = {0,1,2} and p(x) = {0.6,0.3,0.1}
low vehicles = 1 + x , every 4 hours
high vehicles = 6 + x , every 8 hours
I managed to get the results like the following
with this code
from copyreg import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras as keras
import sys
#### for reproduclvle resutls
from numpy.random import seed
seed(1)
import tensorflow
tensorflow.random.set_seed(2)
n_steps = 12
batch_size = 32
lay1_state_size = 64
lay2_state_size = 0
dense_state_size = 1
num_epochs = 25
horizon = 24
loss_function_type = 'sparse_categorical_crossentropy or mse or rmse'
num_layers = 1
optimizer_type = 'Adam'
metrics = 'rmse'
# spikes at regrular interval
dem = np.load('const_dem_2_freq_stoch.npy')
dem_len = len(dem)
def gen_batch(dem, batch_size, n_steps):
n = n_steps + 1
raw_x = dem[:-1]
data_length = len(raw_x)
num_of_win = data_length - n - 1 # 1382 windows
batch_partition_length = num_of_win // batch_size # 172 batches
#print('batch_partition_length',batch_partition_length)
data_x = []
j=0
for i in range(batch_partition_length):
windows_x = []
k=0
while(k<batch_size):
windows_x.append( raw_x[ j:n + j] )
j+=1
k+=1
data_x.append(np.array(windows_x)) # each batch is stacked horizontally.
data_x = np.array(data_x)
data_x = np.reshape(data_x,(-1,n)) # 224 x 13
#print(data_x.shape)
return data_x,batch_partition_length
data_x,batch_partition_length = gen_batch(dem, batch_size, n_steps)
data_x = np.expand_dims(data_x,axis=-1)
tr = int(0.7*dem_len)
val = int(0.2*dem_len)
x_train, y_train = data_x[:tr,:n_steps], data_x[:tr,-1]
x_valid, y_valid = data_x[tr:tr+val,:n_steps], data_x[tr:tr+val,-1]
print('\n\n')
print('tr+val',tr+val)
print('\n\n')
x_test, y_test = data_x[tr+val:,:n_steps], data_x[tr+val:,-1]
#model
model = keras.models.Sequential([keras.layers.SimpleRNN(lay1_state_size,input_shape=[None,1]), keras.layers.Dense(dense_state_size)])
# model = keras.models.Sequential([keras.layers.SimpleRNN(lay1_state_size,return_sequences=True,input_shape=[None,1]),keras.layers.SimpleRNN(lay2_state_size),
# keras.layers.Dense(dense_state_size)])
model.compile(optimizer='Adam',loss=keras.losses.mean_absolute_error ,metrics=[tf.keras.metrics.RootMeanSquaredError()] )
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,validation_data=(x_valid,y_valid))
print('\n')
print('Model Evaluation on test set:\n')
model.evaluate(x_test, y_test,batch_size=batch_size)
print('\n')
#model.summary()
y_tru = np.array([])
for step_ahead in range(horizon):
# tru label
y = np.append(data_x[step_ahead+1:,n_steps ], np.array([[0]*(step_ahead+1)]))
y_tru = np.append(y_tru,y)
# prediction
y_pred_one = model.predict(data_x[:,step_ahead:])[:,np.newaxis,:]
data_x = np.concatenate([data_x,y_pred_one ],axis=1)
y_tru = np.reshape(y_tru,(batch_partition_length*batch_size,horizon),order='F')
y_pred_horizon = data_x[:,n_steps+1:]
y_pred_horizon = np.squeeze(y_pred_horizon)
print('print(y_pred_horizon.shape)',y_pred_horizon.shape)
print(' RNN prediction on all data MSE',np.mean(keras.losses.mean_squared_error(y_tru,y_pred_horizon )) )
print(' RNN prediction on all data MAE',np.mean(keras.losses.mean_absolute_error(y_tru,y_pred_horizon )) )
print('\n')
for i in range(10):
plt.figure(i)
plt.plot(y_tru[i])
plt.plot(np.squeeze(y_pred_horizon[i]))
plt.show()
The data generation code is given below
from copyreg import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras as keras
dem_len = 1240
def categorical(p):
return (p.cumsum(-1) >= np.random.uniform(size=p.shape[:-1])[..., None]).argmax(-1)
p = np.array([0.6, 0.3, 0.1])
def dem_hr(hr, lo_veh, hi_veh,len):
dem_hrs = np.array([])
for i in range(10000):
#d = np.random.randint(lo_veh,hi_veh)
d = lo_veh + categorical(p)
z = np.array([0]*(hr-1))
dem_hrs = np.append(dem_hrs, d)
dem_hrs = np.append(dem_hrs, z)
dem_hrs = dem_hrs[:len]
return dem_hrs
def gen_data(len):
dzero = np.zeros(len)
# for hr,lo_veh, hi_veh in zip([4, 8],[1, 6],[3,9]):
# d = dem_hr(hr, lo_veh, hi_veh,len)
# dem = dem + d
# dem = np.array(dem,dtype=np.float32)
d4 = dem_hr(4, 1, 3,len)
d8 = dem_hr(8, 6, 9,len)
dall = dzero + d8
dsub = dall - d4
dem = np.where(dsub>=0,d8,d4)
# plt.plot(dem)
# plt.plot(d4)
# plt.plot(d8)
# plt.show()
return dem
dem = gen_data(len=dem_len)
np.save('const_dem_2_freq_stoch_cat',dem)
plt.plot(dem)
plt.show()
I think incresing the number of steps may help to capture the distribution of magnitudes at different periods. Does increasing the layers also help to capture the magnitude distribution?

The method to use gradient accumulate in BERT finetune

I was doing a Bert finetune and I had OOM issues. I heard a good method to handle this is to use "gradient accumulate". Below are my optimization.py(include the gradient accumulate)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
from tensorflow.python.training import optimizer
from tensorflow.python.framework import ops
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
# Implements linear warmup. I.e., if global_step < num_warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = init_lr * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
(1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
# It is recommended that you use this optimizer for fine tuning, since this
# is how the model was trained (note that the Adam m/v variables are NOT
# loaded from init_checkpoint.)
optimizer = MultistepAdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999,
epsilon=1e-6,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
if use_tpu:
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=global_step)
# Normally the global step update is done inside of `apply_gradients`.
# However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
# a different optimizer, you should probably take this line out.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
class MultistepAdamWeightDecayOptimizer(optimizer.Optimizer):
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
n = 1,
epsilon=1e-6,
exclude_from_weight_decay=None,
name="MultistepAdamWeightDecayOptimizer"):
"""Constructs a AdamWeightDecayOptimizer."""
super(MultistepAdamWeightDecayOptimizer, self).__init__(False, name)
self.learning_rate = learning_rate
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self._n = n
self.exclude_from_weight_decay = exclude_from_weight_decay
self._n_t = None
def _prepare(self):
super(MultistepAdamWeightDecayOptimizer, self)._prepare()
self._n_t=tf.convert_to_tensor(self._n, name="n")
def _create_slots(self,var_list):
super(MultistepAdamWeightDecayOptimizer, self)._create_slots(var_list)
first_var = min(var_list, key=lambda x: x.name)
self._create_non_slot_variable(initial_value=0 if self._n == 1 else 1,
name="iter",
colocate_with=first_var)
for v in var_list:
self._zeros_slot(v,"grad_acc",self._name)
def _get_iter_variable(self):
if tf.contrib.eager.in_eager_mode():
graph = None
else:
graph = tf.get_default_graph()
return self._get_non_slot_variable("iter", graph=graph)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
"""See base class."""
update_ops = []
var_list = [v for g, v in grads_and_vars if g is not None]
with ops.init_scope():
self._create_slots(var_list)
self._prepare()
for(grad, param) in grads_and_vars:
if grad is None or param is None:
continue
grad_acc = self.get_slot(param, "grad_acc")
param_name = self._get_variable_name(params.name)
m = tf.get_variable(name=param_name + "/adam_m", shape=param.shape.as_list(),
dtype=tf.float32,trainable=False, initializer=tf.zeros_initializer())
v = tf.get_variable(name =param_name + "/adam_v", shape=param.sahpe.as_list(),
dtype=tf.float32, trainable=False, initializer=tf.zeros_initializer())
def _apply_adam(grad_acc, grad, param, m, v):
total_grad = (grad_acc + grad) / tf.cast(self._n_t, grad.dtype)
next_m = (
tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, total_grad))
next_v = (
tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
tf.square(total_grad)))
update = next_m / (tf.sqrt(next_v) + self.epsilon)
if self._do_use_weight_decay(param_name):
update += self.weight_decay_rate * param
update_with_lr =self.learning_rate * update
next_param = param - update_with_lr
adam_op = tf.group(param.assign(next_param), m.assign(next_m),
v.assign(next_v))
with tf.control_dependencies([adam_op]):
grad_acc_to_zero_op = grad_acc.assign(tf.zero_like(grad_acc), use_locking=self._use_locking)
return tf.group(adam_op, grad_acc_to_zero_op)
def _accumulate_gradient(grad_acc, grad):
assign_up = tf.assign_add(grad_acc, grad, use_locking=self._use_locking)
return tf.group(assign_op)
update_op = tf.cond(tf.equal(self._get_iter_variable(),0),
lambda: _apply_adam(grad_acc, grad, param,m, v),
lambda: _accumulate_gradient(grad_acc, grad))
update_ops.append(update_op)
apply_updates = self._finish(update_ops, name_scope=name)
return apply_updates
def _finish(self, update_ops, name_scope):
iter_=self._get_iter_variable()
with tf.control_dependencies(update_ops):
with tf.colocate_with(iter_):
update_iter = iter_.assign(tf.mod(iter_+1, self._n_t),
use_locking=self._use_locking)
return tf.group(
*update_ops + [update_iter], name=name_scope)
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def _get_variable_name(self, param_name):
"""Get the variable name from the tensor name."""
m = re.match("^(.*):\\d+$", param_name)
if m is not None:
param_name = m.group(1)
return param_name
After I used this optimization.py, i could use large batch. But loss did not decrease and after 300 steps(i got 550000 training data, batch size 64, iteration 1000 and epoch 20), it said: train loop marked as finished and stopped.
I am not sure what problem is, could you please help me out? thanks.

Keras GPU memory overflow using with keras.utils.sequence and generator

Dataset.py
import os
import random
from skimage import io
import cv2
from skimage.transform import resize
import numpy as np
import tensorflow as tf
import keras
import Augmentor
def iter_sequence_infinite(seq):
"""Iterate indefinitely over a Sequence.
# Arguments
seq: Sequence object
# Returns
Generator yielding batches.
"""
while True:
for item in seq:
yield item
# data generator class
class DataGenerator(keras.utils.Sequence):
def __init__(self, ids, imgs_dir, masks_dir, batch_size=10, img_size=128, n_classes=1, n_channels=3, shuffle=True):
self.id_names = ids
self.indexes = np.arange(len(self.id_names))
self.imgs_dir = imgs_dir
self.masks_dir = masks_dir
self.batch_size = batch_size
self.img_size = img_size
self.n_classes = n_classes
self.n_channels = n_channels
self.shuffle = shuffle
self.on_epoch_end()
# for printing the statistics of the function
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.id_names))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation__(self, id_name):
'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
# Initialization
img_path = os.path.join(self.imgs_dir, id_name) # polyp segmentation/images/id_name.jpg
mask_path = os.path.join(self.masks_dir, id_name) # polyp segmenatation/masks/id_name.jpg
img = io.imread(img_path)
mask = cv2.imread(mask_path)
p = Augmentor.DataPipeline([[img, mask]])
p.resize(probability=1.0, width=self.img_size, height=self.img_size)
p.rotate_without_crop(probability=0.3, max_left_rotation=10, max_right_rotation=10)
#p.random_distortion(probability=0.3, grid_height=10, grid_width=10, magnitude=1)
p.shear(probability=0.3, max_shear_left=1, max_shear_right=1)
#p.skew_tilt(probability=0.3, magnitude=0.1)
p.flip_random(probability=0.3)
sample_p = p.sample(1)
sample_p = np.array(sample_p).squeeze()
p_img = sample_p[0]
p_mask = sample_p[1]
augmented_mask = (p_mask // 255) * 255 # denoising
q = Augmentor.DataPipeline([[p_img]])
q.random_contrast(probability=0.3, min_factor=0.2, max_factor=1.0) # low to High
q.random_brightness(probability=0.3, min_factor=0.2, max_factor=1.0) # dark to bright
sample_q = q.sample(1)
sample_q = np.array(sample_q).squeeze()
image = sample_q
mask = augmented_mask[::, ::, 0]
"""
# reading the image from dataset
## Reading Image
image = io.imread(img_path) # reading image to image vaiable
image = resize(image, (self.img_size, self.img_size), anti_aliasing=True) # resizing input image to 128 * 128
mask = io.imread(mask_path, as_gray=True) # mask image of same size with all zeros
mask = resize(mask, (self.img_size, self.img_size), anti_aliasing=True) # resizing mask to fit the 128 * 128 image
mask = np.expand_dims(mask, axis=-1)
"""
# image normalization
image = image / 255.0
mask = mask / 255.0
return image, mask
def __len__(self):
"Denotes the number of batches per epoch"
return int(np.floor(len(self.id_names) / self.batch_size))
def __getitem__(self, index): # index : batch no.
# Generate indexes of the batch
# Generate indexes of the batch
indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
batch_ids = [self.id_names[k] for k in indexes]
imgs = list()
masks = list()
for id_name in batch_ids:
img, mask = self.__data_generation__(id_name)
imgs.append(img)
masks.append(np.expand_dims(mask,-1))
imgs = np.array(imgs)
masks = np.array(masks)
return imgs, masks # return batch
train.py
import argparse
import logging
import os
import sys
from tqdm import tqdm # progress bar
import numpy as np
import matplotlib.pyplot as plt
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import segmentation_models as sm
from segmentation_models.utils import set_trainable
from dataset import DataGenerator, iter_sequence_infinite
def train_model(model, train_gen, valid_gen, epochs, save_cp=True):
total_batch_count = 0
train_img_num = len(train_gen.id_names)
train_batch_num = len(train_gen)
train_gen_out = iter_sequence_infinite(train_gen)
valid_batch_num = len(valid_gen)
valid_img_num = len(valid_gen.id_names)
valid_gen_out = iter_sequence_infinite(valid_gen)
for epoch in range(epochs): # interation as many epochs
set_trainable(model)
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=train_img_num, desc=f'Epoch {epoch + 1}/{epochs}', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(train_batch_num):
batch = next(train_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.train_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch loss': loss, 'Batch IoU': iou}) # floating the loss at the post in the pbar
pbar.update(imgs.shape[0]) # update progress
count += 1
total_batch_count += 1
train_gen.on_epoch_end()
print( "Epoch : loss: {}, IoU : {}".format(epoch_loss/count, epoch_iou/count))
# Do validation
validation_model(model, valid_gen_out, valid_batch_num, valid_img_num)
valid_gen.on_epoch_end()
if save_cp:
try:
if not os.path.isdir(checkpoint_dir):
os.mkdir(checkpoint_dir)
logging.info('Created checkpoint directory')
else:
pass
except OSError:
pass
model.save_weights(os.path.join(checkpoint_dir , f'CP_epoch{epoch + 1}.h5'))
logging.info(f'Checkpoint {epoch + 1} saved !')
def validation_model(model, valid_gen_out, valid_batch_num, valid_img_num):
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=valid_img_num, desc='Validation round', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(valid_batch_num):
batch = next(valid_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.test_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch, loss': loss, 'Batch IoU': iou}) # floating the loss at the post in the pbar
pbar.update(imgs.shape[0]) # update progress
count += 1
print("Validation loss: {}, IoU: {}".format(epoch_loss / count, epoch_iou / count))
pred_mask = model.predict(np.expand_dims(imgs[0],0))
plt.subplot(131)
plt.imshow(imgs[0])
plt.subplot(132)
plt.imshow(true_masks[0].squeeze(), cmap="gray")
plt.subplot(133)
plt.imshow(pred_mask.squeeze(), cmap="gray")
plt.show()
print()
def get_args():
parser = argparse.ArgumentParser(description='Train the UNet on images and target masks',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-e', '--epochs', metavar='E', type=int, default=50,
help='Number of epochs', dest='epochs')
parser.add_argument('-b', '--batch_size', metavar='B', type=int, nargs='?', default=2,
help='Batch size', dest='batch_size')
parser.add_argument('-l', '--learning-rate', metavar='LR', type=float, nargs='?', default=1e-5,
help='Learning rate', dest='lr')
parser.add_argument('-bb', '--backbone', default='resnet50', metavar='FILE',
help="backcone name")
parser.add_argument('-w', '--weight', dest='load', type=str, default=False,
help='Load model from a .h5 file')
parser.add_argument('-s', '--resizing', dest='resizing', type=int, default=384,
help='Downscaling factor of the images')
parser.add_argument('-v', '--validation', dest='val', type=float, default=20.0,
help='Percent of the data that is used as validation (0-100)')
return parser.parse_args()
if __name__ == '__main__':
img_dir = './data/train/imgs/' # ./data/train/imgs/CVC_Original/'
mask_dir = './data/train/masks/' # ./data/train/masks/CVC_Ground Truth/'
checkpoint_dir = './checkpoints'
args = get_args()
# train path
train_ids = os.listdir(img_dir)
# Validation Data Size
n_val = int(len(train_ids) * args.val/100) # size of validation set
valid_ids = train_ids[:n_val] # list of image ids used for validation of result 0 to 9
train_ids = train_ids[n_val:] # list of image ids used for training dataset
# print(valid_ids, "\n\n")
print("training_size: ", len(train_ids), "validation_size: ", len(valid_ids))
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
print("total training batches: ", len(train_gen))
print("total validaton batches: ", len(valid_gen))
train_steps = len(train_ids) // args.batch_size
valid_steps = len(valid_ids) // args.batch_size
# define model
model = sm.Unet(args.backbone, encoder_weights='imagenet')
optimizer = optimizers.Adam(lr=args.lr, decay=1e-4)
model.compile(
optimizer=optimizer,
# "Adam",
loss=sm.losses.bce_dice_loss, # sm.losses.bce_jaccard_loss, # sm.losses.binary_crossentropy,
metrics=[sm.metrics.iou_score],
)
#model.summary()
callbacks = [
EarlyStopping(patience=6, verbose=1),
ReduceLROnPlateau(factor=0.1, patience=3, min_lr=1e-7, verbose=1),
ModelCheckpoint('./weights.Epoch{epoch:02d}-Loss{loss:.3f}-VIou{val_iou_score:.3f}.h5', verbose=1,
monitor='val_accuracy', save_best_only=True, save_weights_only=True)
]
train_model(model=model, train_gen=train_gen, valid_gen=valid_gen, epochs=args.epochs)
When I try to run this code, some epochs are well progressed but, in 20epochs, it occurs gpu memory overflow error like below
(0) Resource exhausted: OOM when allocating tensor with shape[2,64,96,96] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node decoder_stage2b_bn/FusedBatchNorm}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
so, I think that it is because of data generation.
This code generate batch in this order.
in train.py, initialize Datageneratr class which is sequence model that is implemented in Dataset.py
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
At the first in the function 'train_model' convert Datagenerator(sequence model) to generator with using function 'iter_sequence_infinite'
train_gen_out = iter_sequence_infinite(train_gen)
valid_gen_out = iter_sequence_infinite(valid_gen)
using magic-function, 'next', get batch
batch = next(train_gen_out)
I think that there will be no memory problem but it's occurred.
What is the problem and how to solve it?
Thanks.

TensorFlow: loss jumps up after restoring RNN net

Environment info
Operating System: Windows 7 64-bit
Tensorflow installed from pre-built pip (no CUDA): 1.0.1
Python 3.5.2 64-bit
Problem
I have problems with restoring my net (RNN character base language model). Below is a simplified version with the same problem.
When I run it the first time, I get, for example, this.
...
step 160: loss = 1.956 (perplexity = 7.069016620211226)
step 180: loss = 1.837 (perplexity = 6.274748642468816)
step 200: loss = 1.825 (perplexity = 6.202084762557817)
But on the second run, after restoring parameters, I get this.
step 220: loss = 2.346 (perplexity = 10.446611983898903)
step 240: loss = 2.346 (perplexity = 10.446709120339545)
...
All the tf variables seem to be correctly restored, including the state, which will be fed to RNN.
Data position is also restored (from 'step').
I also made a similar program for MNIST recognition model, and this one works fine: the losses before and after the restoring are continuous.
Are there any other parameters or states that should be saved and restored?
import argparse
import os
import tensorflow as tf
import numpy as np
import math
B = 20 # batch size
H = 200 # size of hidden layer of neurons
T = 25 # number of time steps to unroll the RNN for
data_file = 'ptb.train.txt' # any plain text file will do
checkpoint_dir = "tmp"
#----------------
# prepare data
#----------------
data = open(data_file, 'r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has {0} characters, {1} unique.'.format(data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
input_index_raw = np.array([char_to_ix[ch] for ch in data])
input_index_raw = input_index_raw[0:len(input_index_raw) // T * T]
input_index_raw_shift = np.append(input_index_raw[1:], input_index_raw[0])
input_all = input_index_raw.reshape([-1, T])
target_all = input_index_raw_shift.reshape([-1, T])
num_packed_data = len(input_all)
#----------------
# build model
#----------------
class Model(object):
def __init__(self):
self.input_ph = tf.placeholder(tf.int32, [None, T], name="input_ph")
self.target_ph = tf.placeholder(tf.int32, [None, T], name="target_ph")
embedding = tf.get_variable("embedding", [vocab_size, H], initializer=tf.random_normal_initializer(), dtype=tf.float32)
# input_ph is B x T.
# input_embedded is B x T x H.
input_embedded = tf.nn.embedding_lookup(embedding, self.input_ph)
cell = tf.contrib.rnn.BasicRNNCell(H)
self.state_ph = tf.placeholder(tf.float32, (None, cell.state_size), name="state_ph")
# Make state variable so that it will be saved by the saver.
self.state = tf.get_variable("state", (B, cell.state_size), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.float32)
# Construct initial_state according to whether restoring or not.
self.isRestore = tf.placeholder(tf.bool, shape=(), name="isRestore")
zero_state = cell.zero_state(B, dtype=tf.float32)
self.initial_state = tf.cond(self.isRestore, lambda: self.state, lambda: zero_state)
# input_embedded : B x T x H
# output: B x T x H
# state : B x cell.state_size
output, state_ = tf.nn.dynamic_rnn(cell, input_embedded, initial_state=self.state_ph)
self.final_state = tf.assign(self.state, state_)
# reshape to (B * T) x H.
output_flat = tf.reshape(output, [-1, H])
# Convert hidden layer's output to vector of logits for each vocabulary.
softmax_w = tf.get_variable("softmax_w", [H, vocab_size], dtype=tf.float32)
softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)
logits = tf.matmul(output_flat, softmax_w) + softmax_b
# cross_entropy is a vector of length B * T
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(self.target_ph, [-1]), logits=logits)
self.loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
self.global_step = tf.get_variable("global_step", (), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.int32)
self.training_op = optimizer.minimize(cross_entropy, global_step=self.global_step)
def train_batch(self, sess, input_batch, target_batch, initial_state):
final_state_, _, final_loss = sess.run([self.final_state, self.training_op, self.loss], feed_dict={self.input_ph: input_batch, self.target_ph: target_batch, self.state_ph: initial_state})
return final_state_, final_loss
# main
with tf.Session() as sess:
if not tf.gfile.Exists(checkpoint_dir):
tf.gfile.MakeDirs(checkpoint_dir)
batch_stride = num_packed_data // B
# make model
model = Model()
saver = tf.train.Saver()
# always initialize
init = tf.global_variables_initializer()
init.run()
# restore if necessary
isRestore = False
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt:
isRestore = True
last_model = ckpt.model_checkpoint_path
print("Loading " + last_model)
saver.restore(sess, last_model)
# set initial step
step = tf.train.global_step(sess, model.global_step) + 1
print("start step = {0}".format(step))
# fetch initial state
state = sess.run(model.initial_state, feed_dict={model.isRestore: isRestore})
print("Initial state: {0}".format(state))
while True:
# prepare batch data
idx = [(step + x * batch_stride) % num_packed_data for x in range(0, B)]
input_batch = input_all[idx]
target_batch = target_all[idx]
state, last_loss = model.train_batch(sess, input_batch, target_batch, state)
if step % 20 == 0:
print('step {0}: loss = {1:.3f} (perplexity = {2})'.format(step, last_loss, math.exp(last_loss)))
if step % 200 == 0:
saved_file = saver.save(sess, os.path.join(checkpoint_dir, "model.ckpt"), global_step=step)
print("Saved to " + saved_file)
print("Last state: {0}".format(model.state.eval()))
break;
step = step + 1
The problem is solved. It had nothing to do with RNN nor TensorFlow.
I changed
chars = list(set(data))
to
chars = sorted(set(data))
and now it works.
This is because python uses a random hash function to build the set, and every time python restarted, 'chars' had a different ordering.

debugging 'TypeError: Can not convert a ndarray into a Tensor or Operation' for CNN

I am trying to build a CNN, I have 8 classes in the input_samples with 45 samples in each class. so total number of input samples are 360. I have divided my first 20 samples as train samples and remaining 25 samples as test samples in each class (My input is a text file and the data is in rows is my preprocessed data, so I am reading the rows in textfile and reshaping the images which are 16x12 size).
I am unable to fix the error in the code
My code:
import numpy as np
import random
import tensorflow as tf
folder = 'D:\\Lab_Project_Files\\TF\\Practice Files\\'
Datainfo = 'dataset_300.txt'
ClassInfo = 'classTrain.txt'
INPUT_WIDTH = 16
IMAGE_HEIGHT = 12
IMAGE_DEPTH = 1
IMAGE_PIXELS = INPUT_WIDTH * IMAGE_HEIGHT # 192 = 12*16
NUM_CLASSES = 8
STEPS = 500
STEP_VALIDATE = 100
BATCH_SIZE = 5
def load_data(file1,file2,folder):
filename1 = folder + file1
filename2 = folder + file2
# loading the data file
x_data = np.loadtxt(filename1, unpack=True)
x_data = np.transpose(x_data)
# loading the class information of the data loaded
y_data = np.loadtxt(filename2, unpack=True)
y_data = np.transpose(y_data)
# divide the data in to test and train data
x_data_train = x_data[np.r_[0:20, 45:65, 90:110, 135:155, 180:200, 225:245, 270:290, 315:335],:]
x_data_test = x_data[np.r_[20:45, 65:90, 110:135, 155:180, 200:225, 245:270, 290:315, 335:360], :]
y_data_train = y_data[np.r_[0:20, 45:65, 90:110, 135:155, 180:200, 225:245, 270:290, 315:335]]
y_data_test = y_data[np.r_[20:45, 65:90, 110:135, 155:180, 200:225, 245:270, 290:315, 335:360],:]
return x_data_train,x_data_test,y_data_train,y_data_test
def reshapedata(data_train,data_test):
data_train = np.reshape(data_train, (len(data_train),INPUT_WIDTH,IMAGE_HEIGHT))
data_test = np.reshape(data_test, (len(data_test), INPUT_WIDTH, IMAGE_HEIGHT))
return data_train,data_test
def batchdata(data,label, batchsize):
# generate random number required to batch data
order_num = random.sample(range(1, len(data)), batchsize)
data_batch = []
label_batch = []
for i in range(len(order_num)):
data_batch.append(data[order_num[i-1]])
label_batch.append(label[order_num[i-1]])
return data_batch, label_batch
# CNN trail
def conv_net(x):
weights = tf.Variable(tf.random_normal([INPUT_WIDTH * IMAGE_HEIGHT * IMAGE_DEPTH, NUM_CLASSES]))
biases = tf.Variable(tf.random_normal([NUM_CLASSES]))
out = tf.add(tf.matmul(x, weights), biases)
return out
sess = tf.Session()
# get filelist and labels for training and testing
data_train,data_test,label_train,label_test = load_data(Datainfo,ClassInfo,folder)
data_train, data_test, = reshapedata(data_train, data_test)
############################ get files for training ####################################################
image_batch, label_batch = batchdata(data_train,label_train,BATCH_SIZE)
# input output placeholders
x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS])
y_ = tf.placeholder(tf.float32,[None, NUM_CLASSES])
# create the network
y = conv_net( x )
# loss
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
# train step
train_step = tf.train.AdamOptimizer( 1e-3 ).minimize( cost )
############################## get files for validataion ###################################################
image_batch_test, label_batch_test = batchdata(data_test,label_test,BATCH_SIZE)
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.initialize_all_variables())
################ CNN Program ##############################
for i in range(STEPS):
# checking the accuracy in between.
if i % STEP_VALIDATE == 0:
imgs, lbls = sess.run([image_batch_test, label_batch_test])
print(sess.run(accuracy, feed_dict={x: imgs, y_: lbls}))
imgs, lbls = sess.run([image_batch, label_batch])
sess.run(train_step, feed_dict={x: imgs, y_: lbls})
imgs, lbls = sess.run([image_batch_test, label_batch_test])
print(sess.run(accuracy, feed_dict={ x: imgs, y_: lbls}))
file can be downloaded dataset_300.txt and ClassInfo.txt
Session.run accepts only a list of tensors or tensor names.
imgs, lbls = sess.run([image_batch_test, label_batch_test])
In the previous line, you are passing image_batch_test and label_batch_test which are numpy arrays. I am not sure what you are trying to do by imgs, lbls = sess.run([image_batch_test, label_batch_test])