Tensorflow: running code on multiple nodes

import json
import tensorflow as tf

num_devices = 3

with open('clusterspec.json', 'r') as f:
    clusterspec = json.load(f)
cluster = tf.train.ClusterSpec(clusterspec)

a = tf.constant(3)
b = tf.constant(2)

for i in range(num_devices):
    with tf.device("/job:worker/task:%d" % i):
        if i == 0:
            x = tf.mul(a, b)
        if i == 1:
            y = tf.mul(a, b)
        if i == 2:
            z = tf.mul(a, b)

server = tf.train.Server(cluster, job_name="master", task_index=0)

with tf.Session(server.target) as sess:
    print("here")
    print(sess.run([x, y, z]))
Could someone please suggest a better way to run this simple example on multiple machines? I don't want three different tensors x, y, and z; I want only a single tensor, without any branching inside the graph at all.
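A minimal sketch of one way to avoid the branching (assuming the same clusterspec.json and a three-worker cluster as above, untested against the asker's setup): place one multiply op per worker inside the loop, collect the results in a list, and stack them into a single tensor, so the graph needs no if/else at all.

import json
import tensorflow as tf

num_devices = 3

with open('clusterspec.json', 'r') as f:
    clusterspec = json.load(f)
cluster = tf.train.ClusterSpec(clusterspec)

a = tf.constant(3)
b = tf.constant(2)

# One op per worker, collected in a list instead of separate x, y, z tensors.
products = []
for i in range(num_devices):
    with tf.device("/job:worker/task:%d" % i):
        products.append(tf.multiply(a, b))  # tf.mul in older TF releases

result = tf.stack(products)  # a single tensor holding one value per worker

server = tf.train.Server(cluster, job_name="master", task_index=0)
with tf.Session(server.target) as sess:
    print(sess.run(result))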

Related

Splitting a Tensorflow protobuf into two separate models

As part of my team's security measures, I need to split a TensorFlow protobuf model into two parts. The idea is that the two protobuf splits can be stored separately, and when the end user needs the model, the original model can be restored from the splits.
My current approach is to load the .pb file, split the model into two graphs and then save each graph.
import copy

import six
import tensorflow as tf
from tensorflow.core.framework import graph_pb2
from tensorflow.python.framework.graph_util_impl import (
    _assert_nodes_are_present, _bfs_for_reachable_nodes, _extract_graph_summary)


def extract_sub_graph(graph_def, dest_nodes):
    if not isinstance(graph_def, graph_pb2.GraphDef):
        raise TypeError("graph_def must be a graph_pb2.GraphDef proto.")
    if isinstance(dest_nodes, six.string_types):
        raise TypeError("dest_nodes must be a list.")
    name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(graph_def)
    _assert_nodes_are_present(name_to_node, dest_nodes)
    nodes_to_keep = _bfs_for_reachable_nodes(dest_nodes, name_to_input_name)
    nodes_to_keep_copy = copy.deepcopy(nodes_to_keep)
    for node in nodes_to_keep_copy:
        if node not in dest_nodes:
            nodes_to_keep.remove(node)
    nodes_to_keep_list = sorted(
        list(nodes_to_keep), key=lambda n: name_to_seq_num[n])
    # Now construct the output GraphDef
    out = graph_pb2.GraphDef()
    for n in nodes_to_keep_list:
        out.node.extend([copy.deepcopy(name_to_node[n])])
    out.library.CopyFrom(graph_def.library)
    out.versions.CopyFrom(graph_def.versions)
    return out


def split_model(graph_def):
    subgraphs = []
    graph_nodes = [n for n in graph_def.node]
    node_names = []
    for t in graph_nodes:
        node_names.append(t.name)
    middle_node_index = int(len(graph_nodes) / 2)
    subgraph_1_nodes = []
    subgraph_2_nodes = []
    for i in range(middle_node_index, len(graph_nodes)):
        subgraph_1_nodes.append(node_names[i])
    for i in range(0, middle_node_index):
        subgraph_2_nodes.append(node_names[i])
    subgraph_1 = extract_sub_graph(graph_def, subgraph_1_nodes)
    subgraph_2 = extract_sub_graph(graph_def, subgraph_2_nodes)
    subgraphs = [subgraph_1, subgraph_2]
    return subgraphs


if __name__ == "__main__":
    weights_path = "model.pb"
    pbtxt_path = "protobuf_text.pbtxt"
    with tf.gfile.FastGFile(weights_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name='')
    subgraphs = split_model(graph_def)
A modified version of extract_sub_graph() is taken from tensorflow.python.framework.graph_util_impl.
I am struggling to save the graphs as protobuf files. I tried tf.io.write_file() and tf.keras.models.save_model(), but neither worked. What is the proper way to save a graph_pb2.GraphDef proto to a file?
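A minimal sketch of one way to do this (assuming TF 1.x, matching the tf.gfile usage above): a GraphDef is an ordinary protocol buffer, so it can simply be serialized to bytes and written with tf.gfile, rather than with tf.io.write_file() or the Keras model-saving APIs.

import tensorflow as tf

def save_graph_def(graph_def, path):
    # Serialize the GraphDef proto to bytes and write it as a binary .pb file.
    with tf.gfile.GFile(path, "wb") as f:
        f.write(graph_def.SerializeToString())

# e.g. for the two splits returned by split_model():
# save_graph_def(subgraphs[0], "subgraph_1.pb")
# save_graph_def(subgraphs[1], "subgraph_2.pb")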

How to calculate cosine similarity given sparse matrix data in TensorFlow?

I'm supposed to change part of a Python script from GitHub. The code implements an attention-based similarity measure, but I want to turn it into cosine similarity.
The respective code is in the layers.py file (inside the call method).
Attention-Based:
def __call__(self, inputs):
    x = inputs
    # dropout
    if self.sparse_inputs:
        x = sparse_dropout(x, 1 - self.dropout, self.num_features_nonzero)
    else:
        x = tf.nn.dropout(x, 1 - self.dropout)
    # graph learning
    h = dot(x, self.vars['weights'], sparse=self.sparse_inputs)
    N = self.num_nodes
    edge_v = tf.abs(tf.gather(h, self.edge[0]) - tf.gather(h, self.edge[1]))
    edge_v = tf.squeeze(self.act(dot(edge_v, self.vars['a'])))
    sgraph = tf.SparseTensor(indices=tf.transpose(self.edge), values=edge_v, dense_shape=[N, N])
    sgraph = tf.sparse_softmax(sgraph)
    return h, sgraph
I edited the above code to what I believe matches my requirements (cosine similarity). However, when I run the following code:
def __call__(self, inputs):
    x = inputs
    # dropout
    if self.sparse_inputs:
        x = sparse_dropout(x, 1 - self.dropout, self.num_features_nonzero)
    else:
        x = tf.nn.dropout(x, 1 - self.dropout)
    # graph learning
    h = dot(x, self.vars['weights'], sparse=self.sparse_inputs)
    N = self.num_nodes
    h_norm = tf.nn.l2_normalize(h)
    edge_v = tf.matmul(h_norm, tf.transpose(h_norm))
    h_norm_1 = tf.norm(h_norm)
    edge_v /= h_norm_1 * h_norm_1
    edge_v = dot(edge_v, self.vars['a'])  # It causes an error when I add this line
    zero = tf.constant(0, dtype=tf.float32)
    where = tf.not_equal(edge_v, zero)
    indices = tf.where(where)
    values = tf.gather_nd(edge_v, indices)
    sgraph = tf.SparseTensor(indices, values, dense_shape=[N, N])
    return h, sgraph
The script shows some runtime errors:
Screenshot of error message
I suspect the error here is related to line 226:
edge_v = dot(edge_v, self.vars['a']) # It causes an error when I add this line
Any advice on how to accomplish this successfully?
Link of the script on GitHub:
https://github.com/jiangboahu/GLCN-tf
Note: I don't want to use built-in functions, because I don't think they are precise enough for this job.
ETA: There appear to be some existing answers, but as far as I understood them they tackle different problems.
Thanks a bunch in advance
What is dot here? Have you imported the method?
It should be either:
edge_v = tf.keras.backend.dot(edge_v, self.vars['a'])
or
edge_v = tf.tensordot(edge_v, self.vars['a'], axes=1)
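Beyond the dot call, a minimal sketch of the cosine-similarity part itself (assuming h has shape [N, d]): L2-normalize each row along axis 1, after which the matrix product of the normalized rows already gives the pairwise cosine similarities, so no extra division by a global norm is needed.

import tensorflow as tf

def cosine_similarity_matrix(h):
    # Normalize each row of h to unit length (axis=1); then h_norm @ h_norm^T
    # yields an [N, N] matrix whose (i, j) entry is cos(h_i, h_j).
    h_norm = tf.nn.l2_normalize(h, axis=1)
    return tf.matmul(h_norm, h_norm, transpose_b=True)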

How can I update tensor (weight value) trying to use two separate network?

I've been trying to build an AI for blackjack using RL. Now I'm trying to use two separate networks, as is done in DQN. I searched the web, found an approach, and tried to use it, but it failed.
This error has occurred:
TypeError: Using a tf.Tensor as a Python bool is not allowed. Use if t is not None: instead of if t: to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.
Code:
import gym
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

def one_hot(x):
    s = np.identity(600)
    b = s[x[0] * 20 + x[1] * 2 + x[2]]
    return b.reshape(1, 600)

def boolstr_to_floatstr(v):
    if v == True:
        return 1
    elif v == False:
        return 0

env = gym.make('Blackjack-v0')
learning_rate = 0.5
state_number = 600
action_number = 2
#######################################
X = tf.placeholder(tf.float32, shape=[1, state_number], name='input_data')
W1 = tf.Variable(tf.random_uniform([state_number, 128], 0, 0.01))  # network for update
layer1 = tf.nn.tanh(tf.matmul(X, W1))
W2 = tf.Variable(tf.random_uniform([128, 256], 0, 0.01))
layer2 = tf.nn.tanh(tf.matmul(layer1, W2))
W3 = tf.Variable(tf.random_uniform([256, action_number], 0, 0.01))
Qpred = tf.matmul(layer2, W3)  # Q prediction
#####################################################################
X1 = tf.placeholder(shape=[1, state_number], dtype=tf.float32)
W4 = tf.Variable(tf.random_uniform([state_number, 128], 0, 0.01))  # network for target
layer3 = tf.nn.tanh(tf.matmul(X1, W4))
W5 = tf.Variable(tf.random_uniform([128, 256], 0, 0.01))
layer4 = tf.nn.tanh(tf.matmul(layer3, W5))
W6 = tf.Variable(tf.random_uniform([256, action_number], 0, 0.01))
target = tf.matmul(layer4, W6)  # target
#################################################################
update1 = W4.assign(W1)
update2 = W5.assign(W2)
update3 = W6.assign(W3)

Y = tf.placeholder(shape=[1, action_number], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(Y - Qpred))  # cost(W) = (Ws - y)^2
train = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

num_episodes = 1000
dis = 0.99  # discount factor
rList = []  # record the reward
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):  # run the episodes
        s = env.reset()
        rALL = 0
        done = False
        e = 1. / ((i / 100) + 1)  # constant for the exploit-or-explore trade-off
        total_loss = []
        while not done:
            s = np.asarray(s)
            s[2] = boolstr_to_floatstr(s[2])
            #print(np.shape(one_hot(s)))
            #print(one_hot(s))
            Qs = sess.run(Qpred, feed_dict={X: one_hot(s).astype(np.float32)})
            if np.random.rand(1) < e:  # explore: try a random action
                a = env.action_space.sample()
            else:
                a = np.argmax(Qs)  # exploit: pick the action with the highest known Q value
            s1, reward, done, _ = env.step(a)
            s1 = np.asarray(s1)
            s1[2] = boolstr_to_floatstr(s1[2])
            if done:
                Qs[0, a] = reward
            else:
                Qs1 = sess.run(target, feed_dict={X1: one_hot(s1)})
                Qs[0, a] = reward + dis * np.max(Qs1)  # optimal Q
            sess.run(train, feed_dict={X: one_hot(s), Y: Qs})
            if i % 10 == 0:  # update the target network from the prediction network
                sess.run(update1, update2, update3)
            if reward == 1:
                rALL += reward
            else:
                rALL += 0
            s = s1
        rList.append(rALL)

print('success rate: ' + str(sum(rList) / num_episodes))
print("Final Q-table values")
I need to print the success rate at the end. Before DQN it was around 38%. If there is something wrong in my code with respect to the DQN algorithm, please tell me.
If you want to share the weights between different networks, simply create the layers with the same names inside a scope opened with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):, and the weights will then be shared between the networks automatically.
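As a side note on the reported TypeError: sess.run(update1, update2, update3) passes update2 as the feed_dict argument, which is likely what triggers the "Tensor as a Python bool" message. A minimal sketch of copying the trained network's weights into the target network by running the assign ops as a single list (variable names taken from the question's code):

# Copy ops: one assign per weight pair (target <- online network).
online_vars = [W1, W2, W3]   # network being trained
target_vars = [W4, W5, W6]   # target network
copy_ops = [t.assign(o) for t, o in zip(target_vars, online_vars)]

# Inside the training loop, e.g. every 10 episodes:
# sess.run(copy_ops)   # one list of fetches, not separate positional arguments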

How to use `sparse_softmax_cross_entropy_with_logits`: without getting Incompatible Shapes Error

I would like to use sparse_softmax_cross_entropy_with_logits with the Julia TensorFlow wrapper.
The operation is defined in the code here.
Basically, as I understand it, the first argument should be the logits that would normally be fed to softmax to turn them into category probabilities (~one-hot output).
And the second should be the correct labels as label ids.
I have adjusted the example code from the TensorFlow.jl readme
See below:
using Distributions
using TensorFlow

# Generate some synthetic data
x = randn(100, 50)
w = randn(50, 10)
y_prob = exp(x*w)
y_prob ./= sum(y_prob, 2)

function draw(probs)
    y = zeros(size(probs))
    for i in 1:size(probs, 1)
        idx = rand(Categorical(probs[i, :]))
        y[i, idx] = 1
    end
    return y
end

y = draw(y_prob)

# Build the model
sess = Session(Graph())

X = placeholder(Float64)
Y_obs = placeholder(Float64)
Y_obs_lbl = indmax(Y_obs, 2)

variable_scope("logisitic_model", initializer=Normal(0, .001)) do
    global W = get_variable("weights", [50, 10], Float64)
    global B = get_variable("bias", [10], Float64)
end

L = X*W + B
Y = nn.softmax(L)
#costs = log(Y).*Y_obs # Dense (original) way
costs = nn.sparse_softmax_cross_entropy_with_logits(L, Y_obs_lbl+1) # sparse way
Loss = -reduce_sum(costs)

optimizer = train.AdamOptimizer()
minimize_op = train.minimize(optimizer, Loss)
saver = train.Saver()

# Run training
run(sess, initialize_all_variables())
cur_loss, _ = run(sess, [Loss, minimize_op], Dict(X=>x, Y_obs=>y))
When I run it however, I get an error:
Tensorflow error: Status: Incompatible shapes: [1,100] vs. [100,10]
[[Node: gradients/SparseSoftmaxCrossEntropyWithLogits_10_grad/mul = Mul[T=DT_DOUBLE, _class=[], _device="/job:localhost/replica:0/task:0/cpu:0"](gradients/SparseSoftmaxCrossEntropyWithLogits_10_grad/ExpandDims, SparseSoftmaxCrossEntropyWithLogits_10:1)]]
in check_status(::TensorFlow.Status) at /home/ubuntu/.julia/v0.5/TensorFlow/src/core.jl:101
in run(::TensorFlow.Session, ::Array{TensorFlow.Port,1}, ::Array{Any,1}, ::Array{TensorFlow.Port,1}, ::Array{Ptr{Void},1}) at /home/ubuntu/.julia/v0.5/TensorFlow/src/run.jl:96
in run(::TensorFlow.Session, ::Array{TensorFlow.Tensor,1}, ::Dict{TensorFlow.Tensor,Array{Float64,2}}) at /home/ubuntu/.julia/v0.5/TensorFlow/src/run.jl:143
This only happens when I try to train it.
If I don't include the optimizer/minimize op, it works fine.
So I am doing something that breaks the gradient math.
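For reference, a minimal Python sketch of the shapes this op expects (not the Julia fix itself): logits of shape [batch, num_classes] and integer label ids of shape [batch], giving a per-example loss of shape [batch].

import numpy as np
import tensorflow as tf

logits = tf.constant(np.random.randn(100, 10), dtype=tf.float32)          # [batch, num_classes]
labels = tf.constant(np.random.randint(0, 10, size=100), dtype=tf.int64)  # [batch], 0-based ids

# Per-example cross entropy of shape [batch]; reduce it to a scalar loss.
costs = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
loss = tf.reduce_sum(costs)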

Display python variable in tensorboard

I want to display some Python variables in TensorBoard, but I haven't managed to get it working.
My code so far only displays a line in TensorBoard for the version with the static number; if I use the commented-out lines instead, it does not work. It then prints:
ValueError: Shapes () and (?,) are not compatible
Does anyone have an idea?
import tensorflow as tf

step = 0
session = tf.Session()

tensorboardVar = tf.Variable(0, "tensorboardVar")
pythonVar = tf.placeholder("int32", [None])
#update_tensorboardVar = tensorboardVar.assign(pythonVar)
update_tensorboardVar = tensorboardVar.assign(4)
tf.scalar_summary("myVar", update_tensorboardVar)
merged = tf.merge_all_summaries()

sum_writer = tf.train.SummaryWriter('/tmp/train/c/', session.graph)
session.run(tf.initialize_all_variables())

for i in range(100):
    _, result = session.run([update_tensorboardVar, merged])
    #_, result = session.run([update_tensorboardVar, merged], feed_dict={pythonVar: i})
    sum_writer.add_summary(result, step)
    step += 1
This works:
import tensorflow as tf
import numpy as np

step = 0
session = tf.Session()

tensorboardVar = tf.Variable(0, "tensorboardVar")
pythonVar = tf.placeholder("int32", [])
update_tensorboardVar = tensorboardVar.assign(pythonVar)
tf.scalar_summary("myVar", update_tensorboardVar)
merged = tf.merge_all_summaries()

sum_writer = tf.train.SummaryWriter('/tmp/train/c/', session.graph)
session.run(tf.initialize_all_variables())

for i in range(100):
    #_, result = session.run([update_tensorboardVar, merged])
    j = np.array(i)
    _, result = session.run([update_tensorboardVar, merged], feed_dict={pythonVar: j})
    sum_writer.add_summary(result, step)
    step += 1
An alternative way can be found in the second answer to Computing exact moving average over multiple batches in tensorflow. There it is shown how you can create custom summaries.
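A minimal sketch of that custom-summary route (using the later TF 1.x summary API rather than the older tf.scalar_summary/tf.train.SummaryWriter calls above): a tf.Summary protobuf can be built directly from the Python value, so no placeholder or assign op is needed.

import tensorflow as tf

writer = tf.summary.FileWriter('/tmp/train/c/')
for step in range(100):
    # Build the summary protobuf directly from the Python value.
    summary = tf.Summary(value=[tf.Summary.Value(tag="myVar", simple_value=step)])
    writer.add_summary(summary, step)
writer.close()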