Computer crashes when reconnecting to Colaboratory - google-colaboratory

I tried to run my deep learning code on Colaboratory, but every time the run reaches 26% progress it automatically disconnects, and my computer freezes when I reconnect to the server. My training loop is shown below:
from tqdm import tqdm_notebook
import matplotlib
from PIL import Image
import numpy as np
import tensorflow as tf
import time

dev_gen_costs = []
config = tf.ConfigProto(device_count={'gpu': 0})
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())
for iteration in tqdm_notebook(range(ITERS), desc='generate and discriminate'):
    time.sleep(0.01)
    start_time = time.time()
    num = iteration % (train_vis.shape[0] // BATCH_SIZE)
    data_IR = train_ir[num*BATCH_SIZE:(num+1)*BATCH_SIZE, :, :]
    data_VI = train_vis[num*BATCH_SIZE:(num+1)*BATCH_SIZE, :, :]
    data_IR = np.expand_dims(data_IR, axis=-1)
    data_VI = np.expand_dims(data_VI, axis=-1)
    feed_dict = {real_data1_int: data_IR, real_data2_int: data_VI}
    # Train generator
    _ = sess.run([gen_train_op], feed_dict)
    # Train critic
    for i in range(CRITIC_ITERS):
        _costs, _ = sess.run([[disc_cost, gen_cost], disc_train_op], feed_dict)
        dev_gen_costs.append(_costs)  # loss

The crash is most likely caused by an error in the code you are running, rather than by Colaboratory itself.

Related

Multiprocessing in dronekit not working

I am trying to write code for a drone that flies to multiple waypoints, where the drone cannot continue to the next waypoint until it detects the color red on its camera.
Because the OpenCV camera loop and the drone control run at the same time, my code is very laggy, so I tried to use multiprocessing and modified my code. When I run the new code, multiprocessing does not work: it skips almost all of my code and goes straight to RTL mode.
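For context, this is the basic multiprocessing pattern I am trying to follow (a minimal, self-contained sketch, not my real code; the function and argument names are only illustrative):
import multiprocessing
from time import sleep

def worker(name, delay):
    # each process runs its target function independently
    for i in range(3):
        print(name, i)
        sleep(delay)

if __name__ == "__main__":
    # arguments are passed to the target as a tuple
    p1 = multiprocessing.Process(target=worker, args=("takeoff", 0.5))
    p2 = multiprocessing.Process(target=worker, args=("camera", 0.5))
    p1.start()
    p2.start()
    p1.join()   # wait for both processes to finish
    p2.join()
My full code is below: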
from inspect import ArgInfo
from dronekit import connect, VehicleMode, LocationGlobalRelative
from pymavlink import mavutil
from numpy import loadtxt, array
from time import sleep
import sys
import cv2
import numpy as np
import multiprocessing

cap = cv2.VideoCapture(0)
hsv_a = np.array([198, 255, 255])
hsv_b = np.array([158, 68, 137])
treshold = 150
lat = [-35.3629722, -35.3629064, -35.3634361, -35.3638474]
lon = [149.1649709, 149.1655721, 149.1657331, 149.1639733]
#vehicle = connect('udp:127.0.0.1:14551',wait_ready=True)
vehicle = connect('udp:127.0.0.1:14551', wait_ready=True)

def arm_and_takeoff(aTargetAltitude):  # arming and takeoff function
    print("Basic pre-arm checks")
    # Don't let the user try to arm until autopilot is ready
    while not vehicle.is_armable:
        print(" Waiting for vehicle to initialise...")
        sleep(1)
    print("Arming motors")
    # Copter should arm in GUIDED mode
    vehicle.mode = VehicleMode("GUIDED")
    vehicle.armed = True
    while not vehicle.armed:
        print(" Waiting for arming...")
        sleep(1)
    print("Taking off!")
    vehicle.simple_takeoff(aTargetAltitude)
    while True:
        print(" Altitude: ", vehicle.location.global_relative_frame.alt)
        # Break and return from function just below target altitude.
        if vehicle.location.global_relative_frame.alt >= aTargetAltitude * 0.95:
            print("Reached target altitude")
            break
        sleep(1)

def dist(a, z):  # a = start point, z = end point
    d_lat = (a.lat - z.lat)**2
    d_long = (a.lon - z.lon)**2
    jarak = (d_lat + d_long)**0.5  # jarak = distance
    return jarak

def gerak_drone():
    for i in range(0, len(lat)):
        print(i)
        wp = LocationGlobalRelative(lat[i], lon[i], 2)
        vehicle.simple_goto(wp)
        sleep(1)
        while dist(vehicle.location.global_relative_frame, wp) >= 0.0001:
            print(str(round(dist(vehicle.location.global_relative_frame, wp)*100000, 2)))
        while True:
            _, frame = cap.read()
            hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            mask = cv2.inRange(hsv, hsv_b, hsv_a)
            cv2.imshow("warna", mask)
            cv2.imshow("hitamPutih", gray)
            cv2.imshow("apa", frame)
            print(cv2.countNonZero(mask))
            if cv2.waitKey(500) == 27 or cv2.countNonZero(mask) > treshold:
                break

if __name__ == "__main__":
    altitude = 2
    lat_distance = 1
    lon_distance = 1
    p1 = multiprocessing.Process(target=arm_and_takeoff, args=(altitude))
    p2 = multiprocessing.Process(target=dist, args=(lat_distance, lon_distance))
    p3 = multiprocessing.Process(target=gerak_drone)
    p1.start()
    p2.start()
    p3.start()
    p1.join()
    p2.join()
    p3.join()
    print("Coming back")
    vehicle.mode = VehicleMode("RTL")
    sleep(20)
    vehicle.mode = VehicleMode("LAND")
Here is my terminal result

Op type not registered 'IO>BigQueryClient' with BigQuery connector on AI Platform

I'm trying to parallelize the training step of my model with TensorFlow's ParameterServerStrategy. I use GCP AI Platform to create the cluster and launch the task.
As my dataset is huge, I use the BigQuery TensorFlow connector included in tensorflow-io.
My script is based on the documentation for the TensorFlow BigQuery reader and the documentation for TensorFlow ParameterServerStrategy.
Locally my script works well, but when I launch it on AI Platform I get the following error:
{"created":"#1633444428.903993309","description":"Error received from peer ipv4:10.46.92.135:2222","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Op type not registered \'IO>BigQueryClient\' in binary running on gke-cml-1005-141531--n1-standard-16-2-644bc3f8-7h8p. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.","grpc_status":5}
The script works with fake data on AI Platform, and works locally with the BigQuery connector.
I suspect that compiling the model with the BigQuery connector and running its ops on the other devices causes the bug, but I don't know how to fix it.
I read that this error happens when devices don't have the same TensorFlow version, so I checked the tensorflow and tensorflow-io versions on each device:
tensorflow : 2.5.0
tensorflow-io : 0.19.1
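For reference, a trivial way to check this on each machine (both packages expose a __version__ attribute):
import tensorflow as tf
import tensorflow_io as tfio

print("tensorflow   :", tf.__version__)
print("tensorflow-io:", tfio.__version__)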
I created a minimal example that reproduces the bug on AI Platform:
import os
from tensorflow_io.bigquery import BigQueryClient
from tensorflow_io.bigquery import BigQueryReadSession
import tensorflow as tf
import multiprocessing
import portpicker
from tensorflow.keras.layers.experimental import preprocessing
from google.cloud import bigquery
from tensorflow.python.framework import dtypes
import numpy as np
import pandas as pd

client = bigquery.Client()

PROJECT_ID = <your_project>
DATASET_ID = 'tmp'
TABLE_ID = 'bq_tf_io'
BATCH_SIZE = 32

# Bigquery requirements
def init_bq_table():
    table = '%s.%s.%s' % (PROJECT_ID, DATASET_ID, TABLE_ID)
    # Create toy_data
    def create_toy_data(N):
        x = np.random.random(size=N)
        y = 0.2 + x + np.random.normal(loc=0, scale=0.3, size=N)
        return x, y
    x, y = create_toy_data(1000)
    df = pd.DataFrame(data={'x': x, 'y': y})
    job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE",)
    job = client.load_table_from_dataframe(df, table, job_config=job_config)
    job.result()

# Create initial data
#init_bq_table()

CSV_SCHEMA = [
    bigquery.SchemaField("x", "FLOAT64"),
    bigquery.SchemaField("y", "FLOAT64"),
]

def transform_row(row_dict):
    # Trim all string tensors
    dataset_x = row_dict
    dataset_x['constant'] = tf.cast(1, tf.float64)
    # Extract feature column
    dataset_y = dataset_x.pop('y')
    # Export as tensor
    dataset_x = tf.stack([dataset_x[column] for column in dataset_x], axis=-1)
    return (dataset_x, dataset_y)

def read_bigquery(table_name):
    tensorflow_io_bigquery_client = BigQueryClient()
    read_session = tensorflow_io_bigquery_client.read_session(
        "projects/" + PROJECT_ID,
        PROJECT_ID, TABLE_ID, DATASET_ID,
        list(field.name for field in CSV_SCHEMA),
        list(dtypes.double if field.field_type == 'FLOAT64'
             else dtypes.string for field in CSV_SCHEMA),
        requested_streams=2)
    dataset = read_session.parallel_read_rows()
    return dataset

def get_data():
    dataset = read_bigquery(TABLE_ID)
    dataset = dataset.map(transform_row, num_parallel_calls=4)
    dataset = dataset.batch(BATCH_SIZE).prefetch(2)
    return dataset

cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

# parameter server and worker just wait for jobs from the coordinator (chief)
if cluster_resolver.task_type in ("worker"):
    worker_config = tf.compat.v1.ConfigProto()
    server = tf.distribute.Server(
        cluster_resolver.cluster_spec(),
        job_name=cluster_resolver.task_type,
        task_index=cluster_resolver.task_id,
        config=worker_config,
        protocol="grpc")
    server.join()
elif cluster_resolver.task_type in ("ps"):
    server = tf.distribute.Server(
        cluster_resolver.cluster_spec(),
        job_name=cluster_resolver.task_type,
        task_index=cluster_resolver.task_id,
        protocol="grpc")
    server.join()
elif cluster_resolver.task_type == 'chief':
    strategy = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver=cluster_resolver)

if cluster_resolver.task_type == 'chief':
    learning_rate = 0.01

    with strategy.scope():
        # model
        model_input = tf.keras.layers.Input(
            shape=(2,), dtype=tf.float64)
        layer_1 = tf.keras.layers.Dense(8, activation='relu')(model_input)
        dense_output = tf.keras.layers.Dense(1)(layer_1)
        model = tf.keras.Model(model_input, dense_output)

        # optimizer
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
        accuracy = tf.keras.metrics.MeanSquaredError()

    @tf.function
    def distributed_train_step(iterator):
        def train_step(x_batch_train, y_batch_train):
            with tf.GradientTape() as tape:
                y_predict = model(x_batch_train, training=True)
                loss_value = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)(y_batch_train, y_predict)
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            accuracy.update_state(y_batch_train, y_predict)
            return loss_value

        x_batch_train, y_batch_train = next(iterator)
        return strategy.run(train_step, args=(x_batch_train, y_batch_train))

    coordinator = tf.distribute.experimental.coordinator.ClusterCoordinator(strategy)

    # test
    def dataset_fn(_):
        def create_toy_data(N):
            x = np.random.random(size=N)
            y = 0.2 + x + np.random.normal(loc=0, scale=0.3, size=N)
            return np.c_[x, y]
        def toy_transform_row(row):
            dataset_x = tf.stack([row[0], tf.cast(1, tf.float64)], axis=-1)
            dataset_y = row[1]
            return dataset_x, dataset_y
        N = 1000
        data = create_toy_data(N)
        dataset = tf.data.Dataset.from_tensor_slices(data)
        dataset = dataset.map(toy_transform_row, num_parallel_calls=4)
        dataset = dataset.batch(BATCH_SIZE)
        dataset = dataset.prefetch(2)
        return dataset

    @tf.function
    def per_worker_dataset_fn():
        return strategy.distribute_datasets_from_function(lambda x: get_data())  # <-- Not working with AI platform
        #return strategy.distribute_datasets_from_function(dataset_fn)  # <-- Working with AI platform

    per_worker_dataset = coordinator.create_per_worker_dataset(per_worker_dataset_fn)

    # Train model
    for epoch in range(5):
        per_worker_iterator = iter(per_worker_dataset)
        accuracy.reset_states()
        for step in range(5):
            coordinator.schedule(distributed_train_step, args=(per_worker_iterator,))
        coordinator.join()
        print("Finished epoch %d, accuracy is %f." % (epoch, accuracy.result().numpy()))
Inside per_worker_dataset_fn() I can either create the dataset with the BigQuery connector (which triggers the bug) or create it on the fly in memory (which works).
AI Platform cluster configuration:
runtimeVersion: "2.5"
pythonVersion: "3.7"
Has anyone run into this issue? The BigQuery connector worked fine with MirroredStrategy on AI Platform. Let me know if I should report the issue somewhere else.
I think this is due to lazy loading of libtensorflow_io.so.
https://github.com/tensorflow/io/commit/85d018ee59ceccfae06914ec2a2f6d6583775ff7
Can you try adding something like this to your code:
import tensorflow_io
tensorflow_io.experimental.oss()
As far as I understand, this happens because when you submit your training job to Cloud AI training, it uses a stock TensorFlow 2.5 environment that doesn't have the tensorflow-io package installed. Therefore it complains that it doesn't know about the 'IO>BigQueryClient' op defined in the tensorflow-io package.
Instead, you can submit your training job using a custom container:
https://cloud.google.com/ai-platform/training/docs/custom-containers-training
You don't need to write a new Dockerfile; you can use
gcr.io/deeplearning-platform-release/tf-cpu.2-5
or
gcr.io/deeplearning-platform-release/tf-gpu.2-5 (if your training job needs GPUs), both of which have the right version of tensorflow-io installed.
You can read more about these containers here:
https://cloud.google.com/tensorflow-enterprise/docs/use-with-deep-learning-containers
Here is my old example showing how to run distributed training on Cloud AI using BigQueryReader: https://github.com/vlasenkoalexey/criteo/blob/master/scripts/train-cloud.sh
It is no longer maintained, but it should give you a general idea of what it should look like.

Is it possible to run python tensorflow code on TPU without using the Estimator API?

I have spent weeks trying to write Python-level TensorFlow code that communicates with TPUs directly. How could I implement a system that runs on a TPU without the Estimator API?
Resources I tried:
All the documentation about the Estimator API and TPUs on https://www.tensorflow.org
Ways I tried:
Initialized a TPUClusterResolver and passed it as an argument to tf.Session(), but it just hung without ever executing session.run() (a sketch of this attempt is at the end of this question).
Also tried sess.run(tpu.initialize_system()), and it got stuck as well.
Looked into the TPUEstimator API too. For reference, the training loop I would need to port is:
def train_model(self, env, episodes=100,
                load_model=False,  # load model from checkpoint if available?
                model_dir='/tmp/pgmodel/', log_freq=10):
    # initialize variables and load model
    init_op = tf.global_variables_initializer()
    self._sess.run(init_op)
    if load_model:
        ckpt = tf.train.get_checkpoint_state(model_dir)
        print(tf.train.latest_checkpoint(model_dir))
        if ckpt and ckpt.model_checkpoint_path:
            savr = tf.train.import_meta_graph(ckpt.model_checkpoint_path + '.meta')
            out = savr.restore(self._sess, ckpt.model_checkpoint_path)
            print("Model restored from ", ckpt.model_checkpoint_path)
        else:
            print('No checkpoint found at: ', model_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    episode = 0
    observation = env.reset()
    xs, rs, ys = [], [], []  # environment info
    running_reward = 0
    reward_sum = 0
    # training loop
    day = 0
    simrors = np.zeros(episodes)
    mktrors = np.zeros(episodes)
    alldf = None
    victory = False
    while episode < episodes and not victory:
        # stochastically sample a policy from the network
        x = observation
        feed = {self._tf_x: np.reshape(x, (1, -1))}
        aprob = self._sess.run(self._tf_aprob, feed)
        aprob = aprob[0, :]  # we live in a batched world :/
        action = np.random.choice(self._num_actions, p=aprob)
        label = np.zeros_like(aprob); label[action] = 1  # make a training 'label'
        # step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        #print observation, reward, done, info
        reward_sum += reward
        # record game history
        xs.append(x)
        ys.append(label)
        rs.append(reward)
        day += 1
        if done:
            running_reward = running_reward * 0.99 + reward_sum * 0.01
            epx = np.vstack(xs)
            epr = np.vstack(rs)
            epy = np.vstack(ys)
            xs, rs, ys = [], [], []  # reset game history
            df = env.env.sim.to_df()
            #pdb.set_trace()
            simrors[episode] = df.bod_nav.values[-1] - 1  # compound returns
            mktrors[episode] = df.mkt_nav.values[-1] - 1
            alldf = df if alldf is None else pd.concat([alldf, df], axis=0)
            feed = {self._tf_x: epx, self._tf_epr: epr, self._tf_y: epy}
            _ = self._sess.run(self._train_op, feed)  # parameter update
            if episode % log_freq == 0:
                log.info('year #%6d, mean reward: %8.4f, sim ret: %8.4f, mkt ret: %8.4f, net: %8.4f', episode,
                         running_reward, simrors[episode], mktrors[episode], simrors[episode] - mktrors[episode])
                save_path = self._saver.save(self._sess, model_dir + 'model.ckpt',
                                             global_step=episode + 1)
                if episode > 100:
                    vict = pd.DataFrame({'sim': simrors[episode-100:episode],
                                         'mkt': mktrors[episode-100:episode]})
                    vict['net'] = vict.sim - vict.mkt
                    if vict.net.mean() > 0.0:
                        victory = True
                        log.info('Congratulations, Warren Buffet! You won the trading game.')
                #print("Model saved in file: {}".format(save_path))
            episode += 1
            observation = env.reset()
            reward_sum = 0
            day = 0
    return alldf, pd.DataFrame({'simror': simrors, 'mktror': mktrors})
Problems I have with the Estimator API implementation:
My code is a policy-gradient-based reinforcement learning program that contains a neural network.
It has two session.run() calls: one runs on every step within an episode, and the other runs at the end of each episode.
tf.train.SessionRunHook is not a suitable fit for my code.
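For completeness, this is roughly what the low-level attempt mentioned above looked like (a minimal sketch using the TF 1.x contrib APIs; the toy computation and the TPU_NAME environment variable are only placeholders, and this is the code path that hangs for me):
import os
import tensorflow as tf
from tensorflow.contrib import tpu
from tensorflow.contrib.cluster_resolver import TPUClusterResolver

def axy_computation(a, x, y):
    # toy computation to be compiled and run on the TPU
    return a * x + y

inputs = [
    3.0,
    tf.ones([3, 3], tf.float32),
    tf.ones([3, 3], tf.float32),
]
tpu_computation = tpu.rewrite(axy_computation, inputs)

# resolve the gRPC endpoint of the TPU worker
tpu_grpc_url = TPUClusterResolver(tpu=os.environ['TPU_NAME']).get_master()

with tf.Session(tpu_grpc_url) as sess:
    sess.run(tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    print(sess.run(tpu_computation))
    sess.run(tpu.shutdown_system())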

A2C is not working because the critic loss is not converging

I'm trying to write my own implementation of the Advantage Actor-Critic (A2C) algorithm using TensorFlow. I used the code in https://github.com/BoYanSTKO/Practical_RL-coursera/blob/master/week5_policy_based/practice_a3c.ipynb as a rough template for how to write the algorithm.
I tried it on the simple CartPole-v0 gym environment, but my implementation fails badly. The critic's loss explodes and becomes far too large, while the actor's loss stays rather low.
I'm not sure what I'm doing wrong here. Any help? :)
I've tried separating the actor and critic into two different networks, which did not help either. I have also tried fine-tuning things like gamma and the learning rate, without any success.
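For reference, the quantities my implementation is supposed to compute are the standard A2C losses with an entropy bonus; the squared-advantage critic loss and the 0.01 entropy weight below match my code:
$$A_t = r_t + \gamma\,(1 - d_t)\,V(s_{t+1}) - V(s_t)$$
$$L_{\text{actor}} = -\frac{1}{N}\sum_t \log\pi(a_t \mid s_t)\,\mathrm{sg}(A_t) \;-\; 0.01\,\frac{1}{N}\sum_t \mathcal{H}\big(\pi(\cdot \mid s_t)\big), \qquad L_{\text{critic}} = \frac{1}{N}\sum_t A_t^{2}$$
where sg(·) denotes stop-gradient and d_t is the episode-done flag. My full implementation follows: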
#!/usr/bin/python
import tensorflow as tf
import numpy as np
import gym
import random
from tensorboardX import SummaryWriter

class ActorCritic():
    def __init__(self, state_dim, n_actions, learning_rate, gamma=0.99):
        with tf.variable_scope("ActorCritic"):
            self.states_ph = tf.placeholder(tf.float32, (None, state_dim), name="states")
            self.action_ph = tf.placeholder(tf.int32, (None,), name="actions")
            self.n_actions = n_actions
            self.reward_ph = tf.placeholder(tf.float32, (None,), name="rewards")
            self.next_state_values = tf.placeholder(tf.float32, (None,), name="rewards")
            self.is_done_ph = tf.placeholder(tf.float32, (None,), name="rewards")

            net = tf.layers.dense(self.states_ph, 24, activation=tf.nn.relu)
            self.logits = tf.layers.dense(net, n_actions, activation=None)
            self.state_values = tf.layers.dense(net, 1, activation=None)
            self.action_probs = tf.nn.softmax(self.logits)
            self.log_prob = tf.nn.log_softmax(self.logits)
            self.entropy = -tf.reduce_sum(self.action_probs*self.log_prob, axis=-1, name="entropy")
            self.logp_actions = tf.reduce_sum(self.log_prob*tf.one_hot(self.action_ph, depth=n_actions), axis=-1)

            self.target_state_values = self.reward_ph + gamma*(1.0-self.is_done_ph)*self.next_state_values
            self.advantage = self.target_state_values - self.state_values

            self.actor_loss = -tf.reduce_mean(self.logp_actions * tf.stop_gradient(self.advantage)) - 0.01*tf.reduce_mean(self.entropy)
            self.critic_loss = tf.reduce_mean(self.advantage**2.0)
            self.train_opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.actor_loss+self.critic_loss)

    def train(self, states, actions, rewards, is_done, nxt_state_values_batch):
        sess = tf.get_default_session()
        return sess.run([self.critic_loss, self.actor_loss, self.train_opt], feed_dict={
            self.next_state_values: nxt_state_values_batch,
            self.states_ph: states,
            self.action_ph: actions,
            self.reward_ph: rewards,
            self.is_done_ph: is_done})

    def predict_state_values(self, states):
        sess = tf.get_default_session()
        return sess.run(self.state_values, feed_dict={self.states_ph: states})

    def sample_actions(self, states):
        sess = tf.get_default_session()
        action_probs = sess.run(self.action_probs, {self.states_ph: states})
        return [np.random.choice(range(self.n_actions), p=action_prob) for action_prob in action_probs]

class EnvBatch():
    def __init__(self, env_name, n_envs):
        self.envs = [gym.make(env_name) for env in range(n_envs)]
        self.n_actions = self.envs[0].action_space.n
        self.state_dim = self.envs[0].observation_space.shape[0]

    def reset(self):
        return [env.reset().tolist() for env in self.envs]

    def step(self, actions):
        states_batch, rewards_batch, is_done_batch = [], [], []
        for action, env in zip(actions, self.envs):
            s, r, d, _ = env.step(action)
            if d:
                s = env.reset()
            states_batch.append(s)
            rewards_batch.append(r)
            is_done_batch.append(d)
        return np.array(states_batch), np.array(rewards_batch), np.array(is_done_batch)

def evaluate_performance(env_name, agent, nr_runs=10):
    env = gym.make(env_name)
    rewards = []
    for _ in range(nr_runs):
        state = env.reset()
        is_done = False
        acc_reward = 0.0
        while not is_done:
            action = agent.sample_actions([state])
            nxt_state, reward, is_done, _ = env.step(action[0])
            state = nxt_state
            acc_reward += reward
        rewards.append(acc_reward)
    return np.mean(rewards)

tf.reset_default_graph()
env = EnvBatch("CartPole-v0", 10)
agent = ActorCritic(env.state_dim, env.n_actions, learning_rate=0.001)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

state_batch = env.reset()
writer = SummaryWriter()
for i in range(100000):
    actions = agent.sample_actions(state_batch)
    nxt_state_batch, rewards_batch, is_done_batch = env.step(actions)
    nxt_state_values = agent.predict_state_values(nxt_state_batch).ravel()
    critic_loss, actor_loss, _ = agent.train(state_batch, actions, rewards_batch, is_done_batch, nxt_state_values)
    writer.add_scalar("actor_loss", actor_loss, i)
    writer.add_scalar("critic_loss", critic_loss, i)
    if i % 50 == 0:
        test_reward = evaluate_performance("CartPole-v0", agent)
        writer.add_scalar("test_reward", test_reward, i)
        if test_reward > 195:
            print("Done!")
    states_batch = nxt_state_batch
sess.close()
writer.close()

Display Python variable in TensorBoard

I want to display some Python variables in TensorBoard, but I can't get it working.
My code so far only shows a flat line in TensorBoard when I use the lines with the static number. If I use the commented-out lines instead, it does not work; it prints:
ValueError: Shapes () and (?,) are not compatible
Does anyone have an idea?
import tensorflow as tf

step = 0
session = tf.Session()
tensorboardVar = tf.Variable(0, "tensorboardVar")
pythonVar = tf.placeholder("int32", [None])
#update_tensorboardVar = tensorboardVar.assign(pythonVar)
update_tensorboardVar = tensorboardVar.assign(4)
tf.scalar_summary("myVar", update_tensorboardVar)
merged = tf.merge_all_summaries()
sum_writer = tf.train.SummaryWriter('/tmp/train/c/', session.graph)
session.run(tf.initialize_all_variables())
for i in range(100):
    _, result = session.run([update_tensorboardVar, merged])
    #_, result = session.run([update_tensorboardVar, merged], feed_dict={pythonVar: i})
    sum_writer.add_summary(result, step)
    step += 1
This is working:
import tensorflow as tf
import numpy as np

step = 0
session = tf.Session()
tensorboardVar = tf.Variable(0, "tensorboardVar")
pythonVar = tf.placeholder("int32", [])
update_tensorboardVar = tensorboardVar.assign(pythonVar)
tf.scalar_summary("myVar", update_tensorboardVar)
merged = tf.merge_all_summaries()
sum_writer = tf.train.SummaryWriter('/tmp/train/c/', session.graph)
session.run(tf.initialize_all_variables())
for i in range(100):
    #_, result = session.run([update_tensorboardVar, merged])
    j = np.array(i)
    _, result = session.run([update_tensorboardVar, merged], feed_dict={pythonVar: j})
    sum_writer.add_summary(result, step)
    step += 1
An alternative approach can be found in the second answer to "Computing exact moving average over multiple batches in tensorflow", which shows how to create custom summaries.
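For completeness, a minimal sketch of that custom-summary approach: a tf.Summary protobuf is built directly from a plain Python value and written with the summary writer, so no placeholder or assign op is needed (the tag and log directory are illustrative, using the same pre-1.0 API as the code above):
import tensorflow as tf

sum_writer = tf.train.SummaryWriter('/tmp/train/c/')
for step in range(100):
    python_value = step * 2  # any plain Python number computed outside the graph
    summary = tf.Summary(value=[tf.Summary.Value(tag="myVar", simple_value=python_value)])
    sum_writer.add_summary(summary, step)
sum_writer.flush()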