how to use conda accelerate / benchmarks? - numpy

I'm attempting to use Conda Accelerate to speed up some data preprocessing, but initial benchmarks indicate that either I'm not using it correctly or it has no effect on FFT and linear-algebra execution times in numpy and librosa. Re-reading the literature: does this mean I'm supposed to decorate and recode every ndarray operation, as in the batch-matmul example for NumbaPro? I'd assumed I could simply install it and it would make numpy faster, but this doesn't appear to be the case.
Benchmarks and code are below. I've installed accelerate via conda install accelerate and also imported it for good measure.
Result - negligible difference before and after conda install accelerate
Total time was 25.356
Total load time was 1.6743
Total math time was 22.1599
Total save time was 1.5139
Total stft math time was 12.9219
Total other numpy math time was 9.1886
Relevant code:
# Benchmark script: for 30 (v0, v1) file pairs, time the audio load, the three
# STFT calls, the magnitude/phase math and the pickle write, then print totals.
# NOTE(review): indentation was lost in this paste; the loop body presumably
# runs from `v0_fn = v0_list[i]` down to `tl3 = time.time()` -- confirm.
# NOTE(review): nothing visible here appends to `loads`, `maths` or `saves`
# (e.g. tl1-tl0, tl2-tl1, tl3-tl2), and `t0` is not assigned in this snippet;
# those lines appear to be missing.
loads, maths, saves = [], [], []
stfts, nps = [], []
# now we have a dict of all source files grouped by voice
for i in range(30):
v0_fn = v0_list[i]
v1_fn = v1_list[i]
tl0 = time.time()
# Process v0 & v1 file
v0_fn = signal_dir+v0_fn
v0, fs_s = librosa.load(v0_fn, sr=None)
v1_fn = signal_dir+v1_fn
v1, fs_s = librosa.load(v1_fn, sr=None)
tl1 = time.time()
# Element-wise sum of the two signals.
# NOTE(review): assumes v0 and v1 have the same length -- confirm.
mix = v0 + v1
# Capture the magnitude and phase of signal and signal + noise
# NOTE(review): the loaded sample rate is bound to `fs_s`, but the frame and
# hop sizes below are computed from `fs` -- verify the two agree.
tm0 = time.time()
v0_stft = librosa.stft(v0, int(frame_size*fs), int(step_size*fs)).transpose()
tm1 = time.time()
# tm1..tm2: plain numpy math (magnitude, phase, stacking) for v0.
v0_mag = (v0_stft.real**2 + v0_stft.imag**2)**0.5
v0_pha = np.arctan2(v0_stft.imag, v0_stft.real)
v0_rtheta = np.stack((v0_mag, v0_pha), axis=0)
tm2 = time.time()
v1_stft = librosa.stft(v1, int(frame_size*fs), int(step_size*fs)).transpose()
tm3 = time.time()
# tm3..tm4: same numpy math for v1.
v1_mag = (v1_stft.real**2 + v1_stft.imag**2)**0.5
v1_pha = np.arctan2(v1_stft.imag, v1_stft.real)
v1_rtheta = np.stack((v1_mag, v1_pha), axis=0)
tm4 = time.time()
mix_stft = librosa.stft(mix, int(frame_size*fs), int(step_size*fs)).transpose()
tm5 = time.time()
# tm5..tm6: same numpy math for the mixture.
mix_mag = (mix_stft.real**2 + mix_stft.imag**2)**0.5
mix_pha = np.arctan2(mix_stft.imag, mix_stft.real)
mix_rtheta = np.stack((mix_mag, mix_pha), axis=0)
tm6 = time.time()
# Three STFT timings and three numpy-math timings are recorded per iteration.
stfts += [tm1-tm0, tm3-tm2, tm5-tm4]
nps += [tm2-tm1, tm4-tm3, tm6-tm5]
data['sig_rtheta'] = v0_rtheta
data['noi_rtheta'] = v1_rtheta
data['mix_rtheta'] = mix_rtheta
tl2 = time.time()
# NOTE(review): results are stored in `data` but `all_info` is what gets
# pickled; the relationship between the two is not visible here.
# NOTE(review): protocol=-1 selects a binary pickle protocol while the file is
# opened in text mode ('w'); 'wb' is the portable choice.
with open(write_name, 'w') as f:
cPickle.dump(all_info, f, protocol=-1)
tl3 = time.time()
t1 = time.time()
print 'Total time was %.3f' % (t1-t0)
print 'Total load time was %.4f' % np.sum(loads)
print 'Total math time was %.4f' % np.sum(maths)
print 'Total save time was %.4f' % np.sum(saves)
print 'Total stft math was %.4f' % np.sum(stfts)
print 'Total other numpy math time was %.4f' % np.sum(nps)


Convert Tensorflow 1.x code with custom loss into 2.x

Suppose I have the following code written in Tensorflow 1.x where I define custom loss function. I wish to remove .compat.v1., Session, placeholder etc. and convert it into Tensorflow 2.x.
How to do so?
import DGM
import tensorflow as tf
import numpy as np
import scipy.stats as spstats
import matplotlib.pyplot as plt
from tqdm.notebook import trange
# Option parameters
phi = 10
n = 0.01
T = 4
# Solution parameters (domain on which to solve PDE)
t_low = 0.0 - 1e-10
x_low = 0.0 + 1e-10
x_high = 1.0
# neural network parameters
num_layers = 3
nodes_per_layer = 50
# Training parameters
sampling_stages = 2500 # number of times to resample new time-space domain points
steps_per_sample = 20 # number of SGD steps to take before re-sampling
# Sampling parameters
nsim_interior = 100
nsim_boundary_1 = 50
nsim_boundary_2 = 50
nsim_initial = 50
x_multiplier = 1.1 # multiplier for oversampling i.e. draw x from [x_low, x_high * x_multiplier]
def sampler(nsim_interior, nsim_boundary_1, nsim_boundary_2, nsim_initial):
    ''' Sample time-space points from the function's domain; points are sampled
        uniformly on the interior of the domain, at the initial/terminal time points
        and along the spatial boundary at different time points.

    Reads the module-level settings t_low, T, x_low, x_high and x_multiplier.

    Args:
        nsim_interior:   number of space points in the interior of U
        nsim_boundary_1: number of space points in the boundary of U
        nsim_boundary_2: number of space points in the boundary of U_x
        nsim_initial:    number of space points at the initial time

    Returns:
        Tuple of eight NumPy arrays, each of shape (count, 1), ordered as
        (t_interior, x_interior, t_boundary_1, x_boundary_1,
         t_boundary_2, x_boundary_2, t_initial, x_initial).
    '''

    # Sampler #1: domain interior
    t_interior = np.random.uniform(low=t_low, high=T, size=[nsim_interior, 1])
    x_interior = np.random.uniform(low=x_low, high=x_high*x_multiplier, size=[nsim_interior, 1])

    # Sampler #2: spatial boundary 1 (x fixed at 1)
    t_boundary_1 = np.random.uniform(low=t_low, high=T, size=[nsim_boundary_1, 1])
    x_boundary_1 = np.ones((nsim_boundary_1, 1))

    # Sampler #3: spatial boundary 2 (x fixed at 0)
    t_boundary_2 = np.random.uniform(low=t_low, high=T, size=[nsim_boundary_2, 1])
    x_boundary_2 = np.zeros((nsim_boundary_2, 1))

    # Sampler #4: initial condition (t fixed at 0)
    t_initial = np.zeros((nsim_initial, 1))
    x_initial = np.random.uniform(low=x_low, high=x_high*x_multiplier, size=[nsim_initial, 1])

    return (
        t_interior, x_interior,
        t_boundary_1, x_boundary_1,
        t_boundary_2, x_boundary_2,
        t_initial, x_initial
    )
def loss(
    t_interior, x_interior,
    t_boundary_1, x_boundary_1,
    t_boundary_2, x_boundary_2,
    t_initial, x_initial
):
    ''' Compute total loss for training.

    The network is taken from the module-level `model` object, and the PDE
    coefficients from the module-level `phi` and `n`.
    NOTE(review): the original docstring listed `model` as an argument although
    it does not appear in the visible parameter list; confirm whether it was
    meant to be passed in explicitly.

    Args:
        t_interior, x_interior:     sampled time / space points in the interior of U
        t_boundary_1, x_boundary_1: sampled time / space points in the boundary of U
        t_boundary_2, x_boundary_2: sampled time / space points in the boundary of U_x
        t_initial, x_initial:       sampled time / space points at the initial time

    Returns:
        (L1, L2): the mean squared PDE residual, and the mean squared
        boundary/initial-condition error.
    '''

    # Loss term #1: PDE
    # compute function value and derivatives at current sampled points
    # (tf.gradients differentiates symbolically, so this must run in graph mode)
    u = model(t_interior, x_interior)
    u_t = tf.gradients(ys=u, xs=t_interior)[0]
    u_x = tf.gradients(ys=u, xs=x_interior)[0]
    u_xx = tf.gradients(ys=u_x, xs=x_interior)[0]
    diff_u = u_t - u_xx + phi**2 * (tf.nn.relu(u) + 1e-10)**n

    # compute average L2-norm for the PDE
    L1 = tf.reduce_mean(input_tensor=tf.square(diff_u))

    # Loss term #2: First b. c.
    u = model(t_boundary_1, x_boundary_1)
    bc1_error = u - 1

    # Loss term #3: Second b. c.
    u = model(t_boundary_2, x_boundary_2)
    u_x = tf.gradients(ys=u, xs=x_boundary_2)[0]
    bc2_error = u_x - 0

    # Loss term #4: Initial condition
    u = model(t_initial, x_initial)
    init_error = u - 1

    # compute average L2-norm for the initial/boundary conditions
    # NOTE(review): the three errors are added element-wise *before* squaring,
    # so the three batches must have matching shapes and errors of opposite
    # sign can cancel -- confirm this is intended rather than a sum of three
    # separate mean-squared terms.
    L2 = tf.reduce_mean(input_tensor=tf.square(bc1_error + bc2_error + init_error))

    return L1, L2
# TF1-style graph construction and training loop for the DGM model.
# NOTE(review): this paste lost its indentation and several short lines
# (closing parentheses, the arguments of exponential_decay, the call that
# consumes the fetch list / feed_dict, and the try/except around the checkpoint
# messages); the notes below mark each gap.
# initialize DGM model (last input: space dimension = 1)
model = DGM.DGMNet(nodes_per_layer, num_layers, 1)
# tensor placeholders (_tnsr suffix indicates tensors)
# inputs (time, space domain interior, space domain at initial time)
t_interior_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_interior_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
t_boundary_1_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_boundary_1_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
t_boundary_2_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_boundary_2_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
t_initial_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_initial_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
# loss
# NOTE(review): the closing parenthesis of this call is missing.
L1_tnsr, L2_tnsr = loss(
t_interior_tnsr, x_interior_tnsr,
t_boundary_1_tnsr, x_boundary_1_tnsr,
t_boundary_2_tnsr, x_boundary_2_tnsr,
t_initial_tnsr, x_initial_tnsr
loss_tnsr = L1_tnsr + L2_tnsr
# set optimizer
starting_learning_rate = 3e-4
global_step = tf.Variable(0, trainable=False)
# NOTE(review): the arguments to exponential_decay and its closing parenthesis
# are missing; presumably they used starting_learning_rate and global_step.
lr = tf.compat.v1.train.exponential_decay(
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr).minimize(loss_tnsr)
# initialize variables
init_op = tf.compat.v1.global_variables_initializer()
# open session
sess = tf.compat.v1.Session()
# NOTE(review): `init_op` is never run in the visible code, and both messages
# below print unconditionally; the surrounding control flow appears missing.
print("Loading from checkpoint.")
print("Checkpoint not found.")
# for each sampling stage
for i in trange(sampling_stages):
# sample uniformly from the required regions
# NOTE(review): the closing parenthesis of the sampler(...) call is missing.
t_interior, x_interior, \
t_boundary_1, x_boundary_1, \
t_boundary_2, x_boundary_2, \
t_initial, x_initial = sampler(
nsim_interior, nsim_boundary_1, nsim_boundary_2, nsim_initial
# for a given sample, take the required number of SGD steps
for _ in range(steps_per_sample):
# NOTE(review): the callee that takes the fetch list and feed_dict is missing
# here (presumably a session run call); the closing braces are missing too.
# NOTE(review): assigning to `loss` rebinds the name of the loss() function.
loss, L1, L2, _ =
[loss_tnsr, L1_tnsr, L2_tnsr, optimizer],
feed_dict = {
t_interior_tnsr: t_interior,
x_interior_tnsr: x_interior,
t_boundary_1_tnsr: t_boundary_1,
x_boundary_1_tnsr: x_boundary_1,
t_boundary_2_tnsr: t_boundary_2,
x_boundary_2_tnsr: x_boundary_2,
t_initial_tnsr: t_initial,
x_initial_tnsr: x_initial,
# Report the most recent loss values every 10th sampling stage.
if i % 10 == 0:
print(f"Loss: {loss:.5f},\t L1: {L1:.5f},\t L2: {L2:.5f},\t iteration: {i}")
I tried searching how to implement custom loss functions with model as an argument, but couldn't implement it.
`model.compile` takes a `loss` argument through which you pass the loss function. It may be a string (the name of a built-in loss function) or a `tf.keras.losses.Loss` instance. For example
If you have written your own loss function, you can pass that function itself to the `loss` argument. For example
def my_loss_fn(y_true, y_pred):
    """Return the mean squared difference of y_true and y_pred over the last axis."""
    squared_difference = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_difference, axis=-1)
model.compile(optimizer='adam', loss=my_loss_fn)
Thank You.

Is it possible to run python tensorflow code on TPU without using the Estimator API?

I have spent weeks now trying to write Python-level TensorFlow code that communicates with TPUs directly. How can I implement a system that runs on a TPU without the Estimator API?
Resources I tried:
All the documentation about the Estimator API, TPU on
Ways I tried:
Initialized a TPUClusterResolver and passed that as an argument for tf.Session() and it was just hanging without executing the
Also tried and it got stuck as well
Tried looking into the TPUEstimator API as there
# Policy-gradient training loop: samples one action per environment step, and
# at the end of each episode stacks the recorded history and feeds it to a
# parameter update. Returns the concatenated per-episode simulation frames and
# a DataFrame of per-episode returns.
# NOTE(review): this paste lost its indentation and several expressions (the
# callee in `aprob =,feed)` and `_ =,feed)`, the logging calls, the save call,
# the condition in `if > 0.0:`, and apparently an `else:`); they cannot be
# recovered from what is visible here.
def train_model(self, env, episodes=100,
load_model = False, # load model from checkpoint if available:?
model_dir = '/tmp/pgmodel/', log_freq=10 ) :
# initialize variables and load model
# NOTE(review): init_op is created but never run in the visible code.
init_op = tf.global_variables_initializer()
if load_model:
ckpt = tf.train.get_checkpoint_state(model_dir)
print tf.train.latest_checkpoint(model_dir)
if ckpt and ckpt.model_checkpoint_path:
savr = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta')
out = savr.restore(self._sess, ckpt.model_checkpoint_path)
print("Model restored from ",ckpt.model_checkpoint_path)
# NOTE(review): presumably the `else` branch of the checkpoint test above.
print('No checkpoint found at: ',model_dir)
# NOTE(review): the body of this `if` is not visible.
if not os.path.exists(model_dir):
episode = 0
observation = env.reset()
xs,rs,ys = [],[],[] # environment info
running_reward = 0
reward_sum = 0
# training loop
day = 0
simrors = np.zeros(episodes)
mktrors = np.zeros(episodes)
alldf = None
victory = False
while episode < episodes and not victory:
# stochastically sample a policy from the network
x = observation
# The observation is flattened into a single-row batch.
feed = {self._tf_x: np.reshape(x, (1,-1))}
# NOTE(review): the expression producing `aprob` is missing.
aprob =,feed)
aprob = aprob[0,:] # we live in a batched world :/
action = np.random.choice(self._num_actions, p=aprob)
label = np.zeros_like(aprob) ; label[action] = 1 # make a training 'label'
# step the environment and get new measurements
observation, reward, done, info = env.step(action)
#print observation, reward, done, info
reward_sum += reward
# record game history
# NOTE(review): no visible line appends to xs, rs or ys, yet they are
# stacked with np.vstack below; the appends appear to be missing.
day += 1
if done:
# Exponential moving average of the per-episode reward sum.
running_reward = running_reward * 0.99 + reward_sum * 0.01
epx = np.vstack(xs)
epr = np.vstack(rs)
epy = np.vstack(ys)
xs,rs,ys = [],[],[] # reset game history
df = env.env.sim.to_df()
simrors[episode]=df.bod_nav.values[-1]-1 # compound returns
# NOTE(review): mktrors is never written in the visible code.
alldf = df if alldf is None else pd.concat([alldf,df], axis=0)
feed = {self._tf_x: epx, self._tf_epr: epr, self._tf_y: epy}
# NOTE(review): the expression for the update op is missing.
_ =,feed) # parameter update
# NOTE(review): the logging call name is missing on the next line.
if episode % log_freq == 0:'year #%6d, mean reward: %8.4f, sim ret: %8.4f, mkt ret: %8.4f, net: %8.4f', episode,
running_reward, simrors[episode],mktrors[episode], simrors[episode]-mktrors[episode])
# NOTE(review): the save call (callee and first argument) is missing.
save_path =, model_dir+'model.ckpt',
# Victory check compares sim and market returns over the trailing 100 episodes.
if episode > 100:
vict = pd.DataFrame( { 'sim': simrors[episode-100:episode],
'mkt': mktrors[episode-100:episode] } )
vict['net'] = vict.sim - vict.mkt
# NOTE(review): the left-hand side of this comparison is missing.
if > 0.0:
victory = True'Congratulations, Warren Buffet! You won the trading game.')
#print("Model saved in file: {}".format(save_path))
episode += 1
observation = env.reset()
reward_sum = 0
day = 0
return alldf, pd.DataFrame({'simror':simrors,'mktror':mktrors})
Problems I have with the Estimator API implementation:
I have a policy gradient based reinforcement learning code that contains a neural network
I have two `sess.run` calls during my execution: one runs on every step within the episode, the other runs at the end of the episode.
tf.train.SessionRunHook is not a suitable implementation for my code

Tensorflow: sparse_tensor_dense_matmul slower than regular matmul

I have 2 scenarios:
scenario 1:
op: sparse_tensor_dense_matmul
A: 1000x1000 sparsity = 90%
B: 1000x1000 sparsity = 0%
scenario 2:
op: matmul
A: 1000x1000 sparsity = 0%
B: 1000x1000 sparsity = 0%
I understand that GPUs do not compute sparse matrix multiplication well, but I would certainly expect them to perform it at least as well as they perform non-sparse matrix multiplication. In my code, sparse_tensor_dense_matmul is 10x slower!
import tensorflow as tf
import numpy as np
import time
import itertools
# Benchmark: time `itrs` session runs of sparse_tensor_dense_matmul (sparse x
# dense) against the same number of runs of a dense matmul of equal size.
rate = 0.1   # fraction of entries of the sparse operand that are non-zero
N = 1000     # every operand is N x N
itrs = 1000  # number of timed session runs per op

num = int(rate * N * N)

# Pick `num` distinct cells of the N x N grid. Sorting the flat (row-major)
# positions before converting them to (row, col) pairs yields the same
# lexicographic order as sorting the pairs themselves, without materialising
# all N*N index pairs in Python first.
_flat = np.sort(np.random.choice(N * N, size=num, replace=False))
_idxs = np.stack(np.unravel_index(_flat, (N, N)), axis=1)

_vals = np.float32(np.random.rand(num))
_y = np.random.uniform(low=-1., high=1., size=(N, N))
_z = np.random.uniform(low=-1., high=1., size=(N, N))

x = tf.SparseTensor(indices=_idxs, values=_vals, dense_shape=(N, N))
y = tf.Variable(_y, dtype=tf.float32)
z = tf.Variable(_z, dtype=tf.float32)

sparse_dot = tf.sparse_tensor_dense_matmul(x, y)
dot = tf.matmul(z, y)

sess = tf.InteractiveSession()
# y and z are Variables, so they must be initialised before the first run.
sess.run(tf.global_variables_initializer())

# Time the sparse x dense product.
start = time.time()
for i in range(itrs):
    [_sparse_dot] = sess.run([sparse_dot], feed_dict={})
total = time.time() - start
print (total)

# Time the dense x dense product.
start = time.time()
for i in range(itrs):
    [_dot] = sess.run([dot], feed_dict={})
total = time.time() - start
print (total)

If I don't want to train in batches and my state is a vector, what should my tensors have for a shape?

I'm trying to use tensorflow to solve a reinforcement learning problem. I created a gym environment of my own. The state is a one-dimensional array (size 224) and there are 170 actions to choose from (0...169). I do not want to train in batches. What I want is to get the simplest possible version of the RL problem running with tensorflow.
My main problem is, I guess, the dimensions. I would assume that TF would allow me to input the state as a 1D tensor. But then I get an error when I want to calculate W*input=action. The dimension errors make it hard to know what's right. Also, examples on the web focus on training from images, in batches.
In general, I started from this tutorial, but there the state is encoded differently, which again makes it hard to follow (especially since I'm not really familiar with python).
import gym
import numpy as np
import random
import tensorflow as tf
# Minimal single-layer Q-network trained one transition at a time.
# A "batch" is always exactly one state, so every tensor keeps a leading
# dimension of 1: states are [1, n_state] and Q-values are [1, n_action].
env = gym.make('MyOwnEnv-v0')
n_state = 224
n_action = 170

# Implementing the network itself
inputs1 = tf.placeholder(shape=[1, n_state], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([n_state, n_action], 0, 0.01))
# [1, n_state] x [n_state, n_action] -> [1, n_action]: one Q-value per action.
Qout = tf.matmul(inputs1, W)
# Index of the largest Q-value in the single row -> shape [1].
predict = tf.argmax(Qout, 1)

#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1, n_action], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

# Training the network
init = tf.global_variables_initializer()
print("input: ", inputs1.get_shape()
      , "\nW: ", W.get_shape()
      , "\nQout: ", Qout.get_shape()
      , "\npredict:", predict.get_shape()
      , "\nnextQ: ", nextQ.get_shape()
      , "\nloss: ", loss.get_shape())

# Set learning parameters
y = .99
e = 0.1
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
    # W must be initialised inside the session that is used for training.
    sess.run(init)
    for i in range(num_episodes):
        #Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        #The Q-Network
        while j < 99:
            j += 1
            # The environment returns a flat vector; feed it as a single row.
            s_row = np.reshape(s, (1, n_state))
            #Choose an action by greedily (with e chance of random action) from the Q-network
            a, allQ = sess.run([predict, Qout], feed_dict={inputs1: s_row})
            action = int(a[0])
            if np.random.rand(1) < e:
                action = env.action_space.sample()
            #Get new state and reward from environment
            s1, r, d, _ = env.step(action)
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout, feed_dict={inputs1: np.reshape(s1, (1, n_state))})
            #Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, action] = r + y*maxQ1
            #Train our network using target and predicted Q values
            _, W1 = sess.run([updateModel, W], feed_dict={inputs1: s_row, nextQ: targetQ})
            rAll += r
            s = s1
            if d == True:
                #Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break
        jList.append(j)
        rList.append(rAll)
print('Percent of succesful episodes: ' + str(sum(rList)/num_episodes) + '%')

how to use Apache Commons Math Optimization in Jython?

I want to port some Matlab code to Jython, and found that Matlab's fminsearch might be replaced by the optimizers in Apache Commons Math.
I'm coding in the Mango medical-image script manager, which uses Jython 2.5.3 as its scripting language. The Commons Math version is 3.6.1.
Here is my code:
def f(x,y):
    """Return x squared plus y squared."""
    # `**` is exponentiation; `^` is bitwise XOR and fails outright on floats.
    return x**2+y**2
from org.apache.commons.math3.analysis import MultivariateFunction
from org.apache.commons.math3.optim.nonlinear.scalar.noderiv import NelderMeadSimplex,SimplexOptimizer
from org.apache.commons.math3.optim.nonlinear.scalar import ObjectiveFunction
from org.apache.commons.math3.optim import MaxEval,InitialGuess
from org.apache.commons.math3.optimization import GoalType
opt=SimplexOptimizer(2**(-6), 2**(-10))
skewParameters2 = solution.getPointRef()
print skewParameters2;
And I got the error below:
TypeError: optimize(): 1st arg can't be coerced to org.apache.commons.math3.optim.OptimizationData[]
I'm quite confused about how to use the optimizer from Jython, since all the examples are written in Java.
I've given up on this plan and found another way to perform fminsearch in Jython. Below is the Jython version of the code:
import sys
sys.path.append('.../jnumeric-2.5.1_ra0.1.jar') #add the jnumeric path
import Numeric as np
def nelder_mead(f, x_start,
                step=0.1, no_improve_thr=10e-6,
                no_improv_break=10, max_iter=0,
                alpha=1., gamma=2., rho=-0.5, sigma=0.5):
    """Minimise `f` with the Nelder-Mead simplex method.

    Parameters:
        f (function): function to optimize, must return a scalar score
            and operate over an array of the same dimensions as x_start
        x_start (float list): initial position
        step (float): look-around radius in initial step
        no_improve_thr, no_improv_break (float, int): break after
            no_improv_break iterations with an improvement lower than
            no_improve_thr
        max_iter (int): always break after this number of iterations.
            Set it to 0 to loop indefinitely.
        alpha, gamma, rho, sigma (floats): parameters of the algorithm
            (reflection, expansion, contraction, reduction)

    Returns:
        two-element list [best parameter array, best score]
    """
    # init
    dim = len(x_start)
    prev_best = f(x_start)
    no_improv = 0
    # Work on float copies so that `step` is not truncated for integer input.
    start = [float(v) for v in x_start]
    res = [[np.array(start), prev_best]]

    # Remaining vertices of the initial simplex: nudge one coordinate each.
    for i in range(dim):
        x = np.array(start)
        x[i] = x[i] + step
        score = f(x)
        res.append([x, score])

    # simplex iter
    iters = 0
    while 1:
        # order vertices from best (lowest score) to worst
        res.sort(key=lambda x: x[1])
        best = res[0][1]

        # break after max_iter
        if max_iter and iters >= max_iter:
            return res[0]
        iters += 1

        # break after no_improv_break iterations with no improvement
        # (written so it prints identically as a Python 2 statement)
        print(' so far: %s' % (best,))

        if best < prev_best - no_improve_thr:
            no_improv = 0
            prev_best = best
        else:
            no_improv += 1

        if no_improv >= no_improv_break:
            return res[0]

        # centroid of every vertex except the worst
        x0 = [0.] * dim
        for tup in res[:-1]:
            for i, c in enumerate(tup[0]):
                x0[i] += c / float(len(res)-1)
        x0 = np.array(x0)

        # reflection
        xr = x0 + alpha*(x0 - res[-1][0])
        rscore = f(xr)
        if res[0][1] <= rscore < res[-2][1]:
            del res[-1]
            res.append([xr, rscore])
            continue

        # expansion
        if rscore < res[0][1]:
            xe = x0 + gamma*(x0 - res[-1][0])
            escore = f(xe)
            del res[-1]
            if escore < rscore:
                res.append([xe, escore])
            else:
                res.append([xr, rscore])
            continue

        # contraction (rho is negative, so this moves towards the worst vertex)
        xc = x0 + rho*(x0 - res[-1][0])
        cscore = f(xc)
        if cscore < res[-1][1]:
            del res[-1]
            res.append([xc, cscore])
            continue

        # reduction: shrink every vertex towards the best one
        x1 = res[0][0]
        nres = []
        for tup in res:
            redx = x1 + sigma*(tup[0] - x1)
            score = f(redx)
            nres.append([redx, score])
        res = nres
And the test example is as below:
def f(x):
    """Return the sum of squares of the first three components of x."""
    return x[0]**2+x[1]**2+x[2]**2
print nelder_mead(f,[3.4,2.3,2.2])
Actually, the original version is for python, and the link below is the source: