weird numba behavior when assigning to an array - numpy

I have a function I'm jitting with @jit(nopython=True).
Inside, it has a loop that does a bunch of stuff, calculates a correlation, and then assigns that to a preallocated output array. Both the target array and the correlation have the same type (np.float32), but for some reason the assignment makes the function take 100x as long.
To make things even stranger, if I instead assign a meaningless float, np.float32(i*1.01), in place of my correlation value, the function runs at an appropriate speed.
Given that everything is the same type, they should both run at the same speed, no?
corrs = np.zeros(a.shape[0], dtype=np.float32)
for i in range(lb, a.shape[0]):
    # a bunch of calculations happens here
    correl = np.float32(covar/(a_std*b_std))
    testval = np.float32(i*1.01)
    # doing this makes the function take FOREVER
    # corrs[i] = correl
    # but doing this runs very quickly, even though it is also a np.float32
    # corrs[i] = testval
Here is a runnable example. I added an argument called "assign" that, if True, will assign what I want to assign, and if False, will assign my useless test value.
import numpy as np
from numba import jit

@jit(nopython=True)
def hist_corr_loop(a, b, lb=1000, assign=True):
    flb = np.float32(lb)
    a_mu, b_mu = a[0], b[0]
    for i in range(1, lb):
        a_mu += a[i]
        b_mu += b[i]
    a_mu = a_mu/flb
    b_mu = b_mu/flb
    a_var, b_var = np.float32(0.0), np.float32(0.0)
    for i in range(lb):
        a_var += np.square(a[i] - a_mu)
        b_var += np.square(b[i] - b_mu)
    a_var = a_var/flb
    b_var = b_var/flb
    corrs = np.zeros(a.shape[0], dtype=np.float32)
    for i in range(lb, a.shape[0]):
        # calculate new means and stdevs
        _a_mu = a_mu
        _b_mu = b_mu
        a_mu = _a_mu + (a[i] - a[i-lb])/flb
        b_mu = _b_mu + (b[i] - b[i-lb])/flb
        a_var += (a[i] - a[i-lb])*(a[i] - a_mu + a[i-lb] - _a_mu)/flb
        b_var += (b[i] - b[i-lb])*(b[i] - b_mu + b[i-lb] - _b_mu)/flb
        a_std = np.sqrt(a_var)  # **0.5
        b_std = np.sqrt(b_var)  # **0.5
        covar = np.float32(0.0)
        for j in range(i-lb+1, i+1):
            covar += (a[j] - a_mu)*(b[j] - b_mu)
        covar = covar/flb
        correl = np.float32(covar/(a_std*b_std))
        testval = np.float32(i*1.01)
        if assign:
            corrs[i] = correl
        else:
            corrs[i] = testval
    return corrs
to run:
n = 10000000
a = np.random.random(n)
b = np.random.random(n)
%timeit hist_corr_loop(a,b,1000, True)
%timeit hist_corr_loop(a,b, 1000, False)
I get
%timeit hist_corr_loop(a,b,1000, True)
10.5 s ± 52.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit hist_corr_loop(a,b, 1000, False)
220 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
10 seconds vs 220 ms.
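One hedged hypothesis (not confirmed anywhere in this question) is that the gap has less to do with the store itself than with what the compiler is allowed to discard: if correl is never written anywhere, LLVM can drop the inner covar loop as dead code, so the assign=False branch may not be doing the O(lb) inner work at all. A minimal, self-contained sketch of that effect (the function inner_loop_demo and its toy workload are invented for illustration):

import numpy as np
from numba import jit

@jit(nopython=True)
def inner_loop_demo(a, use_result):
    # Toy stand-in for hist_corr_loop: an expensive inner loop whose result
    # is either stored (forcing the work) or discarded (which may let LLVM
    # remove the inner loop entirely as dead code).
    out = np.zeros(a.shape[0], dtype=np.float32)
    for i in range(a.shape[0]):
        s = np.float32(0.0)
        for j in range(1000):
            s += a[i] * np.float32(j)
        if use_result:
            out[i] = s              # inner loop result is used
        else:
            out[i] = np.float32(i)  # s is unused on this path
    return out

a = np.random.random(100_000).astype(np.float32)
_ = inner_loop_demo(a, True)  # trigger compilation before timing
%timeit inner_loop_demo(a, True)
%timeit inner_loop_demo(a, False)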

Related

count rows with multiple criterion in pandas [duplicate]

I am trying to modify a DataFrame df to only contain rows for which the values in the column closing_price are between 99 and 101 and trying to do this with the code below.
However, I get the error
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()
and I am wondering if there is a way to do this without using loops.
df = df[(99 <= df['closing_price'] <= 101)]
Consider also Series.between():
df = df[df['closing_price'].between(99, 101)]
You should use () to group your boolean vector to remove ambiguity.
df = df[(df['closing_price'] >= 99) & (df['closing_price'] <= 101)]
There is a nicer alternative: use the query() method:
In [58]: df = pd.DataFrame({'closing_price': np.random.randint(95, 105, 10)})
In [59]: df
Out[59]:
closing_price
0 104
1 99
2 98
3 95
4 103
5 101
6 101
7 99
8 95
9 96
In [60]: df.query('99 <= closing_price <= 101')
Out[60]:
closing_price
1 99
5 101
6 101
7 99
UPDATE: answering the comment:

I like the syntax here but fell down when trying to combine with expression; df.query('(mean + 2 *sd) <= closing_price <= (mean + 2 *sd)')
In [161]: qry = "(closing_price.mean() - 2*closing_price.std())" +\
...: " <= closing_price <= " + \
...: "(closing_price.mean() + 2*closing_price.std())"
...:
In [162]: df.query(qry)
Out[162]:
closing_price
0 97
1 101
2 97
3 95
4 100
5 99
6 100
7 101
8 99
9 95
newdf = df.query('closing_price.mean() <= closing_price <= closing_price.std()')
or
mean = df['closing_price'].mean()
std = df['closing_price'].std()
newdf = df.query('@mean <= closing_price <= @std')
If one has to call pd.Series.between(l,r) repeatedly (for different bounds l and r), a lot of work is repeated unnecessarily. In this case, it's beneficial to sort the frame/series once and then use pd.Series.searchsorted(). I measured a speedup of up to 25x, see below.
def between_indices(x, lower, upper, inclusive=True):
    """
    Return slice bounds (i, j) such that lower <= x[k] <= upper
    holds for all k in range(i, j), under the assumption that x is sorted.
    """
    i = x.searchsorted(lower, side="left" if inclusive else "right")
    j = x.searchsorted(upper, side="right" if inclusive else "left")
    return i, j

# Sort x once before repeated calls of between()
x = x.sort_values().reset_index(drop=True)
# x = x.sort_values(ignore_index=True)  # for pandas>=1.0
ret1 = between_indices(x, lower=0.1, upper=0.9)
ret2 = between_indices(x, lower=0.2, upper=0.8)
ret3 = ...
Benchmark
This measures repeated evaluations (n_reps=100) of pd.Series.between() as well as the method based on pd.Series.searchsorted(), for different arguments lower and upper. On my MacBook Pro 2015 with Python v3.8.0 and pandas v1.0.3, the code below produces the following output:
# pd.Series.searchsorted()
# 5.87 ms ± 321 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# pd.Series.between(lower, upper)
# 155 ms ± 6.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# Logical expressions: (x>=lower) & (x<=upper)
# 153 ms ± 3.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

import numpy as np
import pandas as pd

def between_indices(x, lower, upper, inclusive=True):
    # Assumption: x is sorted.
    i = x.searchsorted(lower, side="left" if inclusive else "right")
    j = x.searchsorted(upper, side="right" if inclusive else "left")
    return i, j

def between_fast(x, lower, upper, inclusive=True):
    """
    Equivalent to pd.Series.between() under the assumption that x is sorted.
    """
    i, j = between_indices(x, lower, upper, inclusive)
    if True:
        return x.iloc[i:j]
    else:
        # Mask creation is slow.
        mask = np.zeros_like(x, dtype=bool)
        mask[i:j] = True
        mask = pd.Series(mask, index=x.index)
        return x[mask]

def between(x, lower, upper, inclusive=True):
    mask = x.between(lower, upper, inclusive=inclusive)
    return x[mask]

def between_expr(x, lower, upper, inclusive=True):
    if inclusive:
        mask = (x >= lower) & (x <= upper)
    else:
        mask = (x > lower) & (x < upper)
    return x[mask]

def benchmark(func, x, lowers, uppers):
    for l, u in zip(lowers, uppers):
        func(x, lower=l, upper=u)

n_samples = 1000
n_reps = 100
x = pd.Series(np.random.randn(n_samples))
# Sort the Series.
# For pandas>=1.0:
# x = x.sort_values(ignore_index=True)
x = x.sort_values().reset_index(drop=True)

# Assert equivalence of the different methods.
assert between_fast(x, 0, 1, True ).equals(between(x, 0, 1, True))
assert between_expr(x, 0, 1, True ).equals(between(x, 0, 1, True))
assert between_fast(x, 0, 1, False).equals(between(x, 0, 1, False))
assert between_expr(x, 0, 1, False).equals(between(x, 0, 1, False))

# Benchmark repeated evaluations of between().
uppers = np.linspace(0, 3, n_reps)
lowers = -uppers
%timeit benchmark(between_fast, x, lowers, uppers)
%timeit benchmark(between, x, lowers, uppers)
%timeit benchmark(between_expr, x, lowers, uppers)
Instead of this:
df = df[(99 <= df['closing_price'] <= 101)]
you should use this:
df = df[(df['closing_price'] >= 99) & (df['closing_price'] <= 101)]
We have to use NumPy's bitwise logic operators |, &, ~, ^ to compound queries.
Also, the parentheses are important for operator precedence.
For more info, you can visit the link: Comparisons, Masks, and Boolean Logic
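To make the precedence point concrete, here is a minimal sketch (not from the original answer): without parentheses, & binds more tightly than the comparisons, so Python first evaluates 99 & df['closing_price'] and then chains two Series comparisons with an implicit and, which raises the ambiguous-truth-value error.

import numpy as np
import pandas as pd

df = pd.DataFrame({'closing_price': np.random.randint(95, 105, 10)})

# Without parentheses this is parsed as a chained comparison around
# (99 & df['closing_price']) and raises:
# ValueError: The truth value of a Series is ambiguous. ...
# mask = df['closing_price'] >= 99 & df['closing_price'] <= 101

# With parentheses, each comparison yields a boolean Series first,
# and & combines them elementwise:
mask = (df['closing_price'] >= 99) & (df['closing_price'] <= 101)
print(df[mask])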
If you're dealing with multiple values and multiple inputs, you could also set up an apply function like this. In this case, filtering a DataFrame for GPS locations that fall within certain ranges.
def filter_values(lat, lon):
    if abs(lat - 33.77) < .01 and abs(lon - -118.16) < .01:
        return True
    elif abs(lat - 37.79) < .01 and abs(lon - -122.39) < .01:
        return True
    else:
        return False

df = df[df.apply(lambda x: filter_values(x['lat'], x['lon']), axis=1)]
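For larger frames, the same filter can usually be expressed as a vectorized boolean mask instead of a row-wise apply. A sketch, assuming the columns are named 'lat' and 'lon' as above:

# Hedged sketch: vectorized equivalent of filter_values() above.
# Each condition is a boolean Series; & combines the latitude and longitude
# tolerances, | combines the two target locations.
mask1 = ((df['lat'] - 33.77).abs() < .01) & ((df['lon'] - -118.16).abs() < .01)
mask2 = ((df['lat'] - 37.79).abs() < .01) & ((df['lon'] - -122.39).abs() < .01)
df = df[mask1 | mask2]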

Cannot make sense of timing of numba-compiled functions

I'm running some simulations, where I use numba to compile my python code to speed up the simulations. I wrote a function that will overwrite one of the input arrays, and therefore I would like to pass in a copy of that array instead. However, this makes the code much slower, and far slower than the time it takes to make the copy.
Here are the timing results:
> population_ = population.copy()
> %timeit _ = run_simulation(population_, Tmax, dt, Nskip = Nskip)
64.6 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
> %timeit _ = run_simulation(population.copy(), Tmax, dt, Nskip = Nskip)
87.4 ms ± 778 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
> %timeit _ = population.copy()
442 ns ± 10.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
So calling run_simulation directly with the result of .copy() as an argument is about 23 milliseconds slower, despite the fact that making the copy only takes about 0.0004 milliseconds. I don't understand why this is the case.
For background, here is the full code:
import numpy as np
from numba import jit, int32, int64, float64

@jit('int32[:,:,:](int32[:,:,:], float64)', nopython=True)
def one_step(population, dt):
    # Hard-coding model parameters here
    beta = 0.55
    tau = 10
    # This probability doesn't depend on the other states
    pIR = 1 - np.exp(-dt/tau)
    # Double for loop over towns and towns
    for i in range(population.shape[0]):
        I = np.sum(population[i,1,:])
        N = np.sum(population[i,:,:])
        # Transition probability from susceptible to infected
        pSI = 1 - np.exp(-dt*beta*I/N)
        for j in range(population.shape[1]):
            # Unpack variables for convenience
            S, I, R = population[i,j,:]
            S2I = np.random.binomial(S, pSI)
            I2R = np.random.binomial(I, pIR)
            # Calculate new values
            S = S - S2I
            I = I + S2I - I2R
            R = R + I2R
            population[i,j,:] = (S, I, R)
    return population

@jit('int32[:,:,:](int32[:,:,:], float64, float64, int64)', nopython=True)
def run_simulation(population, Tmax, dt, Nskip = 10):
    Nt = int(Tmax/dt)
    history = np.zeros((population.shape[0], 3, int((Tmax/dt)/Nskip) + 1), dtype = np.int32)
    history[:,:,0] = np.sum(population, axis = 1)
    t = 0
    for i in range(1, Nt+1):
        population = one_step(population, dt)
        t += dt
        if i % Nskip == 0:
            history[:,:,int(i/Nskip)] = np.sum(population, axis = 1)
    return history

# Initial state
population = np.random.randint(low = 0, high = 1000, size = (10,10,3), dtype = np.int32)
# Run simulation for 100 days
Tmax = 100
dt = 0.01
# Only store once per day
Nskip = int(1/dt)
# Call one timestep to compile numba-decorated functions
# prior to measuring timing
_ = run_simulation(population, 1.0, 1.0, Nskip = 1)
# Run timing
population_ = population.copy()
%timeit _ = run_simulation(population_, Tmax, dt, Nskip = Nskip)
# Run timing
%timeit _ = run_simulation(population.copy(), Tmax, dt, Nskip = Nskip)
# Run timing
%timeit _ = population.copy()
What you are referring to isn't really related to numba.
Consider the following simple example:
import numpy as np

def run_simulation_2(population):
    return population.sum(axis=0)

# Initial state
population = np.random.randint(low = 0, high = 1000, size = (10,10,3), dtype = np.int32)
# Run timing
population_ = population.copy()
%timeit _ = run_simulation_2(population_)
# Run timing
%timeit _ = run_simulation_2(population.copy())
# Run timing
%timeit _ = population.copy()
Timing results are:
3.45 µs ± 193 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
4.34 µs ± 91.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
680 ns ± 23.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
So there is an overhead of about 25% even without numba, which is about the same overhead you saw yourself.
Therefore I think it is not related to numba, but to different "behind the scenes" work that happens when you pass a variable as an argument versus passing the result of a function call.
Unfortunately I can't offer you a good explanation of why it happens, but I hope the fact that it's not related to numba is a good enough explanation for your needs.
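If the practical goal is just to keep run_simulation from mutating the original array without allocating a fresh copy inside the timed call, one possible workaround (a sketch, not part of the original answer) is to reuse a preallocated scratch buffer and refresh it in place with np.copyto:

# Hedged sketch: reuse one scratch buffer instead of allocating a new copy
# on every call. np.copyto overwrites 'scratch' in place with the original
# data, so run_simulation can modify 'scratch' while 'population' stays intact.
scratch = np.empty_like(population)

def run_on_scratch(population, Tmax, dt, Nskip):
    np.copyto(scratch, population)  # refresh the buffer in place
    return run_simulation(scratch, Tmax, dt, Nskip=Nskip)

%timeit _ = run_on_scratch(population, Tmax, dt, Nskip=Nskip)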

how to optimize these pandas apply functions?

train["gender"] = train.apply(lambda x: 1 if x["gender"] == "F" else 0, axis=1)
train["car"] = train.apply(lambda x: 1 if x["car"] == "Y" else 0, axis=1)
train["reality"] = train.apply(lambda x: 1 if x["reality"] == "Y" else 0, axis=1)
These 3 lines take a lot of time even though each change is simple.
I guess accessing each row 3 times is what makes it inefficient.
So if I could access each row once and have the apply function change all 3 columns, it could be 2~3 times faster than now.
Something like:
# it is my imaginary code; it does not work
train[["gender", "car", "reality"]] = train.apply(
    lambda x: 1 if x["gender"] == "F" else 0, axis=1,
    lambda y: 1 if y["car"] == "Y" else 0, axis=1,
    lambda z: 1 if z["reality"] == "Y" else 0, axis=1)
How can I optimize this code?
You can try 3x np.where() which is generally the fastest option:
train['gender'] = np.where(train.gender == 'F', 1, 0)
train['car'] = np.where(train.car == 'Y', 1, 0)
train['reality'] = np.where(train.reality == 'Y', 1, 0)
Or 2x np.where() which is slightly slower:
train['gender'] = np.where(train.gender == 'F', 1, 0)
train[['car', 'reality']] = np.where(train[['car', 'reality']] == 'Y', 1, 0)
Timings with 10 million rows:

method          %timeit
3x np.where()   152 ms ± 8.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2x np.where()   198 ms ± 39.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3x apply()      8.91 s ± 495 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
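Another vectorized option worth trying (a sketch, not included in the timings above) is to cast the boolean comparison directly to int, which expresses the same 1/0 mapping without np.where:

# Hedged sketch: boolean comparison cast to int gives the same 1/0 encoding.
# Performance should be in the same ballpark as np.where(), but measure it
# on your own data.
train['gender'] = (train['gender'] == 'F').astype(int)
train['car'] = (train['car'] == 'Y').astype(int)
train['reality'] = (train['reality'] == 'Y').astype(int)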

How to test for list equality in a column where cells are lists

I want to be able to test if some cells that are lists are equal to [0] while Id == 4, and set a new column to 1 if this happens. Input and expected output are below.
I made several attempts but only managed it with apply and lambda, and this does not scale well for 50k+ rows. Is there a faster way I'm missing?
Input:
import numpy as np
import pandas as pd
df = pd.DataFrame({'Id': [1,2,3,4],
'Var1': [[0,1],[0],[6,7],[0]],
})
Id Var1
1 [0, 1]
2 [0]
3 [6, 7]
4 [0]
What I've tried:
df['ERR'] = 0
df.loc[(df['Id']==4) & (df['Var1']==[0]) , 'ERR'] = 1 # doesn't work
df.loc[(df['Id']==4) & (df['Var1'].isin([0])) , 'ERR'] = 1 # doesn't work
df['ERR'] = df.apply(lambda x: 1 if x['Id']==4 and x['Var1']==[0] else 0 , axis = 1)
Expected output:
Id Var1 ERR
1 [0, 1] 0
2 [0] 0
3 [6, 7] 0
4 [0] 1
You can compare by tuple or set:
df['ERR1'] = ((df['Id']==4) & (df['Var1'].apply(tuple)==(0, ))).astype(int)
df['ERR2'] = ((df['Id']==4) & ([tuple(x) ==(0, ) for x in df['Var1']])).astype(int)
df['ERR3'] = ((df['Id']==4) & (df['Var1'].apply(set)==set([0]))).astype(int)
df['ERR4'] = ((df['Id']==4) & ([set(x) == set([0]) for x in df['Var1']])).astype(int)
Performance (depends on input data):
df = pd.DataFrame({'Id': [1,2,3,4],
'Var1': [[0,1],[0],[6,7],[0]],
})
df = pd.concat([df] * 10000, ignore_index=True)
In [188]: %timeit df['ERR1'] = ((df['Id']==4) & (df['Var1'].apply(tuple)==(0, ))).astype(int)
13.1 ms ± 318 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [189]: %timeit df['ERR2'] = ((df['Id']==4) & ([tuple(x) ==(0, ) for x in df['Var1']])).astype(int)
8.98 ms ± 266 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [190]: %timeit df['ERR3'] = ((df['Id']==4) & (df['Var1'].apply(set)==set([0]))).astype(int)
17 ms ± 451 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [191]: %timeit df['ERR4'] = ((df['Id']==4) & ([set(x) == set([0]) for x in df['Var1']])).astype(int)
19.4 ms ± 93.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Python numpy matrix multiplication with one diagonal matrix

I have two arrays A (4000,4000) of which only the diagonal is filled with data, and B (4000,5), filled with data. Is there a way to multiply (dot) these arrays that is faster than the numpy.dot(a,b) function?
So far I found that (A * B.T).T should be faster (where A is one dimensional (4000,), filled with the diagonal elements), but it turned out to be roughly twice as slow.
Is there a faster way to calculate B.dot(A) in the case where A is a diagonal array?
You could simply extract the diagonal elements and then perform broadcasted elementwise multiplication.
Thus, a replacement for B*A would be -
np.multiply(np.diag(B)[:,None], A)
and for A.T*B -
np.multiply(A.T,np.diag(B))
Runtime test -
In [273]: # Setup
...: M,N = 4000,5
...: A = np.random.randint(0,9,(M,N)).astype(float)
...: B = np.zeros((M,M),dtype=float)
...: np.fill_diagonal(B, np.random.randint(11,99,(M)))
...: A = np.matrix(A)
...: B = np.matrix(B)
...:
In [274]: np.allclose(B*A, np.multiply(np.diag(B)[:,None], A))
Out[274]: True
In [275]: %timeit B*A
10 loops, best of 3: 32.1 ms per loop
In [276]: %timeit np.multiply(np.diag(B)[:,None], A)
10000 loops, best of 3: 33 µs per loop
In [282]: np.allclose(A.T*B, np.multiply(A.T,np.diag(B)))
Out[282]: True
In [283]: %timeit A.T*B
10 loops, best of 3: 24.1 ms per loop
In [284]: %timeit np.multiply(A.T,np.diag(B))
10000 loops, best of 3: 36.2 µs per loop
It appears that my initial claim of (A * B.T).T being slower was incorrect.
from timeit import default_timer as timer
import numpy as np

##### Case 1
a = np.zeros((4000, 4000))
np.fill_diagonal(a, 10)
b = np.ones((4000, 5))
dot_list = []

def time_dot(a, b):
    start = timer()
    c = np.dot(a, b)
    end = timer()
    return end - start

for i in range(100):
    dot_list.append(time_dot(a, b))
print(np.mean(np.asarray(dot_list)))

##### Case 2
a = np.ones((4000,))
a = a * 10
b = np.ones((4000, 5))
shortcut_list = []

def time_quicker(a, b):
    start = timer()
    c = (a*b.T).T
    end = timer()
    return end - start

for i in range(100):
    shortcut_list.append(time_quicker(a, b))
print(np.mean(np.asarray(shortcut_list)))

##### Case 3
a = np.zeros((4000, 4000))  # diagonal matrix
np.fill_diagonal(a, 10)
b = np.ones((4000, 5))
case3_list = []

def function(a, b):
    start = timer()
    np.multiply(b.T, np.diag(a))
    end = timer()
    return end - start

for i in range(100):
    case3_list.append(function(a, b))
print(np.mean(np.asarray(case3_list)))
results in:
0.119120892431
0.00010633951868
0.00214490709662
So the second method is the fastest.
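As a sanity check that the broadcasting shortcut computes the same product as the full matrix multiplication (a small sketch, not part of the original answers):

import numpy as np

# Full (4000, 4000) diagonal matrix vs. just its 1-D diagonal.
a_full = np.zeros((4000, 4000))
np.fill_diagonal(a_full, 10)
a_diag = np.full(4000, 10.0)
b = np.random.random((4000, 5))

# Both broadcasting forms match np.dot with the full diagonal matrix.
assert np.allclose(np.dot(a_full, b), (a_diag * b.T).T)
assert np.allclose(np.dot(a_full, b), a_diag[:, None] * b)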