Forever running while loop in Reinforcement Learning agent evaluation - while-loop

#Evaluating agents performance
total_epochs, total_penalties = 0, 0
episodes = 1
for _ in range(episodes):
state = env.reset()
epochs, penalties, reward = 0 ,0 ,0
done = False
while not done:
state, reward, done, info = env.step(np.argmax(q_table[state]))
if reward == -10:
penalties += 1
epochs += 1
total_penalties += penalties
total_epochs += epochs
print(f"Results after {episodes} episodes")
print(f"Averge Timestep: {total_epochs / episodes}")
print(f"Average penalties: {total_penalties / episodes}")
The execution is forever struck on line {state, reward, done, info = env.step(np.argmax(q_table[state])}
#specifically on np.argmax part
i tried breaking down the code and its working fine individually,
the np.argmax(q_table[state]) line if wroking as a standalone but as soon as i put it in while loop the exection is struck forever there.

Related

Multi-GPU TFF simulation errors "Detected dataset reduce op in multi-GPU TFF simulation"

I ran my code for an emotion detection model using Tensorflow Federated simulation. My code work perfectly fine using CPUs only. However, I received this error when trying to run TFF with GPU.
ValueError: Detected dataset reduce op in multi-GPU TFF simulation: `use_experimental_simulation_loop=True` for `tff.learning`; or use `for ... in iter(dataset)` for your own dataset iteration.Reduce op will be functional after b/159180073.
What is this error about and how can I fix it? I tried to search many places but found no answer.
Here is the call stack if it help. It is very long so I pasted into this link: https://pastebin.com/b1R93gf1
EDIT:
Here is the code containing iterative_process
def startTraining(output_file):
iterative_process = tff.learning.build_federated_averaging_process(
model_fn,
client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.01),
server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0),
use_experimental_simulation_loop=True
)
flstate = iterative_process.initialize()
evaluation = tff.learning.build_federated_evaluation(model_fn)
output_file.write(
'round,available_users,loss,sparse_categorical_accuracy,val_loss,val_sparse_categorical_accuracy,test_loss,test_sparse_categorical_accuracy\n')
curr_round_result = [0,0,100,0,100,0]
min_val_loss = 100
for round in range(1,ROUND_COUNT + 1):
available_users = fetch_available_users_and_increase_time(ROUND_DURATION_AVERAGE + random.randint(-ROUND_DURATION_VARIATION, ROUND_DURATION_VARIATION + 1))
if(len(available_users) == 0):
write_to_file(curr_round_result)
continue
train_data = make_federated_data(available_users, 'train')
flstate, metrics = iterative_process.next(flstate, train_data)
val_data = make_federated_data(available_users, 'val')
val_metrics = evaluation(flstate.model, val_data)
curr_round_result[0] = round
curr_round_result[1] = len(available_users)
curr_round_result[2] = metrics['train']['loss']
curr_round_result[3] = metrics['train']['sparse_categorical_accuracy']
curr_round_result[4] = val_metrics['loss']
curr_round_result[5] = val_metrics['sparse_categorical_accuracy']
write_to_file(curr_round_result)
Here is the code for make_federated_data
def make_federated_data(users, dataset_type):
offset = 0
if(dataset_type == 'val'):
offset = train_size
elif(dataset_type == 'test'):
offset = train_size + val_size
global LOADED_USER
for id in users:
if(id + offset not in LOADED_USER):
LOADED_USER[id + offset] = getDatasetFromFilePath(filepaths[id + offset])
return [
LOADED_USER[id + offset]
for id in users
]
TFF does support Multi-GPU, and as the error message says one of two things is happening:
The code is using tff.learning but using the default use_experimental_simulation_loop argument value of False. With multiple GPUs, this must be set to True when using APIs including tff.learning.build_federated_averaging_process. For example, calling with:
training_process = tff.learning.build_federated_averaging_process(
..., use_experimental_simulation_loop=True)
The code contains a custom tf.data.Dataset.reduce(...) call somewhere. This must be replaced with Python code that iterates over the dataset. For example:
result = dataset.reduce(initial_state=0, reduce_func=lambda s, x: s + x)
becomes
s = 0
for x in iter(dataset):
s += x
I realized that TFF has not yet supported multi-GPUs. Therefore, we need to limit number visible of GPUs to just 1, using:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Tensorflow: OOM when batch size too large

My script is failing due to too high memory usage. When I reduce the batch size it works.
#tf.function(autograph=not DEBUG)
def step(prev_state, input_b):
input_b = tf.reshape(input_b, shape=[1,input_b.shape[0]])
state = FastALIFStateTuple(v=prev_state[0], z=prev_state[1], b=prev_state[2], r=prev_state[3])
new_b = self.decay_b * state.b + (tf.ones(shape=[self.units],dtype=tf.float32) - self.decay_b) * state.z
thr = self.thr + new_b * self.beta
z = state.z
i_in = tf.matmul(input_b, W_in)
i_rec = tf.matmul(z, W_rec)
i_t = i_in + i_rec
I_reset = z * thr * self.dt
new_v = self._decay * state.v + (1 - self._decay) * i_t - I_reset
# Spike generation
is_refractory = tf.greater(state.r, .1)
zeros_like_spikes = tf.zeros_like(z)
new_z = tf.where(is_refractory, zeros_like_spikes, self.compute_z(new_v, thr))
new_r = tf.clip_by_value(state.r + self.n_refractory * new_z - 1,
0., float(self.n_refractory))
return [new_v, new_z, new_b, new_r]
#tf.function(autograph=not DEBUG)
def evolve_single(inputs):
accumulated_state = tf.scan(step, inputs, initializer=state0)
Z = tf.squeeze(accumulated_state[1]) # -> [T,units]
if self.model_settings['avg_spikes']:
Z = tf.reshape(tf.reduce_mean(Z, axis=0), shape=(1,-1))
out = tf.matmul(Z, W_out) + b_out
return out # - [BS,Num_labels]
# # - Using a simple loop
# out_store = []
# for i in range(fingerprint_3d.shape[0]):
# out_store.append(tf.squeeze(evolve_single(fingerprint_3d[i,:,:])))
# return tf.reshape(out_store, shape=[fingerprint_3d.shape[0],self.d_out])
final_out = tf.squeeze(tf.map_fn(evolve_single, fingerprint_3d)) # -> [BS,T,self.units]
return final_out
This code snippet is inside a tf.function, but I omitted it since I don't think it's relevant.
As can be seen, I run the code on fingerprint_3d, a tensor that has the dimension [BatchSize,Time,InputDimension], e.g. [50,100,20]. When I run this with BatchSize < 10 everything works fine, although tf.scan already uses a lot of memory for that.
When I now execute the code on a batch of size 50, suddenly I get an OOM, even though I am executing it in an iterative matter (here commented out).
How should I execute this code so that the Batch Size doesn't matter?
Is tensorflow maybe parallelizing my for loop so that it executed over multiple batches at once?
Another unrelated question is the following: What function instead of tf.scan should I use if I only want to accumulate one state variable, compared to the case for tf.scan where it just accumulates all the state variables? Or is that possible with tf.scan?
As mentioned in the discussions here, tf.foldl, tf.foldr, and tf.scan all require keeping track of all values for all iterations, which is necessary for computations like gradients. I am not aware of any ways to mitigate this issue; still, I would also be interested if anyone has a better answer than mine.
When I used
#tf.function
def get_loss_and_gradients():
with tf.GradientTape(persistent=False) as tape:
logits, spikes = rnn.call(fingerprint_input=graz_dict["train_input"], W_in=W_in, W_rec=W_rec, W_out=W_out, b_out=b_out)
loss = loss_normal(tf.cast(graz_dict["train_groundtruth"],dtype=tf.int32), logits)
gradients = tape.gradient(loss, [W_in,W_rec,W_out,b_out])
return loss, logits, spikes, gradients
it works.
When I remove the #tf.function decorator the memory blows up. So it really seems important that tensorflow can create a graph for you computations.

Is it possible to run python tensorflow code on TPU without using the Estimator API?

I have spent weeks now trying to write a Python level Tensorflow code that could communicate with TPUs directly. How would it be possible to implement the system that could run on a TPU without the Estimator API?
Resources I tried:
All the documentation about the Estimator API, TPU on https://www.tensorflow.org
Ways I tried:
Initialized a TPUClusterResolver and passed that as an argument for tf.Session() and it was just hanging without executing the session.run()
Also tried sess.run(tpu.initialize_system()) and it got stuck as well
Tried looking into the TPUEstimator API as there
def train_model(self, env, episodes=100,
load_model = False, # load model from checkpoint if available:?
model_dir = '/tmp/pgmodel/', log_freq=10 ) :
# initialize variables and load model
init_op = tf.global_variables_initializer()
self._sess.run(init_op)
if load_model:
ckpt = tf.train.get_checkpoint_state(model_dir)
print tf.train.latest_checkpoint(model_dir)
if ckpt and ckpt.model_checkpoint_path:
savr = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta')
out = savr.restore(self._sess, ckpt.model_checkpoint_path)
print("Model restored from ",ckpt.model_checkpoint_path)
else:
print('No checkpoint found at: ',model_dir)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
episode = 0
observation = env.reset()
xs,rs,ys = [],[],[] # environment info
running_reward = 0
reward_sum = 0
# training loop
day = 0
simrors = np.zeros(episodes)
mktrors = np.zeros(episodes)
alldf = None
victory = False
while episode < episodes and not victory:
# stochastically sample a policy from the network
x = observation
feed = {self._tf_x: np.reshape(x, (1,-1))}
aprob = self._sess.run(self._tf_aprob,feed)
aprob = aprob[0,:] # we live in a batched world :/
action = np.random.choice(self._num_actions, p=aprob)
label = np.zeros_like(aprob) ; label[action] = 1 # make a training 'label'
# step the environment and get new measurements
observation, reward, done, info = env.step(action)
#print observation, reward, done, info
reward_sum += reward
# record game history
xs.append(x)
ys.append(label)
rs.append(reward)
day += 1
if done:
running_reward = running_reward * 0.99 + reward_sum * 0.01
epx = np.vstack(xs)
epr = np.vstack(rs)
epy = np.vstack(ys)
xs,rs,ys = [],[],[] # reset game history
df = env.env.sim.to_df()
#pdb.set_trace()
simrors[episode]=df.bod_nav.values[-1]-1 # compound returns
mktrors[episode]=df.mkt_nav.values[-1]-1
alldf = df if alldf is None else pd.concat([alldf,df], axis=0)
feed = {self._tf_x: epx, self._tf_epr: epr, self._tf_y: epy}
_ = self._sess.run(self._train_op,feed) # parameter update
if episode % log_freq == 0:
log.info('year #%6d, mean reward: %8.4f, sim ret: %8.4f, mkt ret: %8.4f, net: %8.4f', episode,
running_reward, simrors[episode],mktrors[episode], simrors[episode]-mktrors[episode])
save_path = self._saver.save(self._sess, model_dir+'model.ckpt',
global_step=episode+1)
if episode > 100:
vict = pd.DataFrame( { 'sim': simrors[episode-100:episode],
'mkt': mktrors[episode-100:episode] } )
vict['net'] = vict.sim - vict.mkt
if vict.net.mean() > 0.0:
victory = True
log.info('Congratulations, Warren Buffet! You won the trading game.')
#print("Model saved in file: {}".format(save_path))
episode += 1
observation = env.reset()
reward_sum = 0
day = 0
return alldf, pd.DataFrame({'simror':simrors,'mktror':mktrors})
Problems I have with the Estimator API implementation:
I have a policy gradient based reinforcement learning code that contains a neural network
I have two session.run() during my execution. One is running on every step within the episode. The other is running at the end of the episode
tf.train.SessionRunHook is not a suitable implementation for my code

If I don't want to train in batches and my state is a vector, what should my tensors have for a shape?

I'm trying to use tensorflow to solve a reinforced learning problem. I created an gym environment of my own. The state is a one dimensional array (size 224) and there are 170 actions to choose from (0...169). I do not want to train in batches. What I want is to make the most simple version of the RL problem running with tensorflow.
My main problem is, i guess the dimensions. I would assume that TF would allow me to input the state as 1D tensor. But then I get an error when I want to calculate W*input=action. Dimensions error make it hard to know whats right. Also, examples on the web focus on training from images, in batches.
In general, I started in this tutorial, but the state is encoded differently, which again makes it hard to follow (especially since I'm not really familiar with python).
import gym
import numpy as np
import random
import tensorflow as tf
env = gym.make('MyOwnEnv-v0')
n_state = 224
n_action = 170
sess = tf.InteractiveSession()
# Implementing the network itself
inputs1 = tf.placeholder(shape=[1,n_state],dtype=tf.float32)
W = tf.Variable(tf.random_uniform([n_state,n_action],0,0.01))
Qout = tf.transpose(tf.matmul(inputs1,W))
predict = tf.reshape(tf.argmax(Qout,1), [n_action,1])
#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[n_action,1],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)
# Training the network
init = tf.global_variables_initializer()
print("input: ", inputs1.get_shape()
, "\nW: ", W.get_shape()
, "\nQout: ", Qout.get_shape()
, "\npredict:", predict.get_shape()
, "\nnextQ: ", nextQ.get_shape()
, "\nloss: ", loss.get_shape())
# Set learning parameters
y = .99
e = 0.1
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
sess.run(init)
for i in range(num_episodes):
#Reset environment and get first new observation
s = env.reset()
rAll = 0
d = False
j = 0
#The Q-Network
while j < 99:
j+=1
#Choose an action by greedily (with e chance of random action) from the Q-network
a,allQ = sess.run([predict,Qout],feed_dict={inputs1:s})
if np.random.rand(1) < e:
a = env.action_space.sample()
#Get new state and reward from environment
s1,r,d,_ = env.step(a)
#Obtain the Q' values by feeding the new state through our network
Q1 = sess.run(Qout,feed_dict={inputs1:s1})
#Obtain maxQ' and set our target value for chosen action.
maxQ1 = np.max(Q1)
targetQ = allQ
#targetQ[0,a[0]] = r + y*maxQ1
targetQ[a,0] = r + y*maxQ1
#Train our network using target and predicted Q values
_,W1 = sess.run([updateModel,W],feed_dict={inputs1:s,nextQ:targetQ})
rAll += r
s = s1
if d == True:
#Reduce chance of random action as we train the model.
e = 1./((i/50) + 10)
break
jList.append(j)
rList.append(rAll)
print('Percent of succesful episodes: ' + str(sum(rList)/num_episodes) + '%')

how to use conda accelerate / benchmarks?

I'm attempting to use Conda Accelerate to speedup some data preprocessing, but initial benchmarks indicate either I'm not using it correctly or it has no effect on FFT & linear algebra execution times in numpy and librosa. Re-reading the literature - does this mean I'm supposed to decorate and recode every ndarray operation as in the batch-matmul example for NumbaPro? I'd assumed I simply installed and it made numpy faster, but this doesn't appear to be the case.
Benchmarks and code are below. I've installed accelerate via conda install accelerate and also imported it for good measure.
Thanks!
Result - negligible difference before and after conda install accelerate
Total time was 25.356
Total load time was 1.6743
Total math time was 22.1599
Total save time was 1.5139
Total stft math time was 12.9219
Total other numpy math time was 9.1886
Relevant code:
loads, maths, saves = [], [], []
stfts, nps = [], []
# now we have a dict of all source files grouped by voice
for i in range(30):
v0_fn = v0_list[i]
v1_fn = v1_list[i]
tl0 = time.time()
# Process v0 & v1 file
v0_fn = signal_dir+v0_fn
v0, fs_s = librosa.load(v0_fn, sr=None)
v1_fn = signal_dir+v1_fn
v1, fs_s = librosa.load(v1_fn, sr=None)
tl1 = time.time()
loads.append((tl1-tl0))
mix = v0 + v1
# Capture the magnitude and phase of signal and signal + noise
tm0 = time.time()
v0_stft = librosa.stft(v0, int(frame_size*fs), int(step_size*fs)).transpose()
tm1 = time.time()
v0_mag = (v0_stft.real**2 + v0_stft.imag**2)**0.5
v0_pha = np.arctan2(v0_stft.imag, v0_stft.real)
v0_rtheta = np.stack((v0_mag, v0_pha), axis=0)
tm2 = time.time()
v1_stft = librosa.stft(v1, int(frame_size*fs), int(step_size*fs)).transpose()
tm3 = time.time()
v1_mag = (v1_stft.real**2 + v1_stft.imag**2)**0.5
v1_pha = np.arctan2(v1_stft.imag, v1_stft.real)
v1_rtheta = np.stack((v1_mag, v1_pha), axis=0)
tm4 = time.time()
mix_stft = librosa.stft(mix, int(frame_size*fs), int(step_size*fs)).transpose()
tm5 = time.time()
mix_mag = (mix_stft.real**2 + mix_stft.imag**2)**0.5
mix_pha = np.arctan2(mix_stft.imag, mix_stft.real)
mix_rtheta = np.stack((mix_mag, mix_pha), axis=0)
tm6 = time.time()
stfts += [tm1-tm0, tm3-tm2, tm5-tm4]
nps += [tm2-tm1, tm4-tm3, tm6-tm5]
data['sig_rtheta'] = v0_rtheta
data['noi_rtheta'] = v1_rtheta
data['mix_rtheta'] = mix_rtheta
tl2 = time.time()
maths.append(tl2-tl1)
with open(write_name, 'w') as f:
cPickle.dump(all_info, f, protocol=-1)
tl3 = time.time()
saves.append(tl3-tl2)
t1 = time.time()
print 'Total time was %.3f' % (t1-t0)
print 'Total load time was %.4f' % np.sum(loads)
print 'Total math time was %.4f' % np.sum(maths)
print 'Total save time was %.4f' % np.sum(saves)
print 'Total stft math was %.4f' % np.sum(stfts)
print 'Total other numpy math time was %.4f' % np.sum(nps)