Vectorize this for loop in numpy - numpy

I am trying to compute matrix z (defined below) in python with numpy.
Here's my current solution (using 1 for loop)
z = np.zeros((n, k))
for i in range(n):
v = pi * (1 / math.factorial(x[i])) * np.exp(-1 * lamb) * (lamb ** x[i])
numerator = np.sum(v)
c = v / numerator
z[i, :] = c
return z
Is it possible to completely vectorize this computation? I need to do this computation for thousands of iterations, and matrix operations in numpy is much faster than huge for loops.

Here is a vectorized version of E. It replaces the for-loop and scalar arithmetic with NumPy broadcasting and array-based arithmetic:
def alt_E(x):
x = x[:, None]
z = pi * (np.exp(-lamb) * (lamb**x)) / special.factorial(x)
denom = z.sum(axis=1)[:, None]
z /= denom
return z
I ran em.py to get a sense for the typical size of x, lamb, pi, n and k. On data of this size,
alt_E is about 120x faster than E:
In [32]: %timeit E(x)
100 loops, best of 3: 11.5 ms per loop
In [33]: %timeit alt_E(x)
10000 loops, best of 3: 94.7 µs per loop
In [34]: 11500/94.7
Out[34]: 121.43611404435057
This is the setup I used for the benchmark:
import math
import numpy as np
import scipy.special as special
def alt_E(x):
x = x[:, None]
z = pi * (np.exp(-lamb) * (lamb**x)) / special.factorial(x)
denom = z.sum(axis=1)[:, None]
z /= denom
return z
def E(x):
z = np.zeros((n, k))
for i in range(n):
v = pi * (1 / math.factorial(x[i])) * \
np.exp(-1 * lamb) * (lamb ** x[i])
numerator = np.sum(v)
c = v / numerator
z[i, :] = c
return z
n = 576
k = 2
x = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5])
lamb = np.array([ 0.84835141, 1.04025989])
pi = np.array([ 0.5806958, 0.4193042])
assert np.allclose(alt_E(x), E(x))
By the way, E could also be calculated using scipy.stats.poisson:
import scipy.stats as stats
pois = stats.poisson(mu=lamb)
def alt_E2(x):
z = pi * pois.pmf(x[:,None])
denom = z.sum(axis=1)[:, None]
z /= denom
return z
but this does not turn out to be faster, at least for arrays of this length:
In [33]: %timeit alt_E(x)
10000 loops, best of 3: 94.7 µs per loop
In [102]: %timeit alt_E2(x)
1000 loops, best of 3: 278 µs per loop
For larger x, alt_E2 is faster:
In [104]: x = np.random.random(10000)
In [106]: %timeit alt_E(x)
100 loops, best of 3: 2.18 ms per loop
In [105]: %timeit alt_E2(x)
1000 loops, best of 3: 643 µs per loop

Related

KeyError: "None of [Index([...] are in the [columns]

I've got numpy array with shape of (3, 50):
data = np.array([[0, 3, 0, 2, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 0, 0, 0,
1, 1, 2, 0, 0, 2],
[0, 0, 0, 0, 0, 3, 0, 1, 6, 1, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1, 0,
3, 0, 0, 0, 0, 0, 0, 5, 2, 2, 2, 1, 0, 0, 1, 0, 1, 3, 2, 0, 0, 0,
0, 0, 2, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0]])
and the following column names:
new_cols = [f'description_word_{i+1}_count' for i in range(50)]
I'm trying to add new columns in already existing dataframe in such way:
df[new_cols] = data
but get the error:
KeyError: "None of [Index(['description_word_1_count',
'description_word_2_count',\n 'description_word_3_count',
'description_word_4_count',\n 'description_word_5_count',
'description_word_6_count',\n 'description_word_7_count',
'description_word_8_count',\n 'description_word_9_count',
'description_word_10_count',\n 'description_word_11_count',
'description_word_12_count',\n 'description_word_13_count',
'description_word_14_count',\n 'description_word_15_count',
'description_word_16_count',\n 'description_word_17_count',
'description_word_18_count',\n 'description_word_19_count',
'description_word_20_count',\n 'description_word_21_count',
'description_word_22_count',\n 'description_word_23_count',
'description_word_24_count',\n 'description_word_25_count',
'description_word_26_count',\n 'description_word_27_count',
'description_word_28_count',\n 'description_word_29_count',
'description_word_30_count',\n 'description_word_31_count',
'description_word_32_count',\n 'description_word_33_count',
'description_word_34_count',\n 'description_word_35_count',
'description_word_36_count',\n 'description_word_37_count',
'description_word_38_count',\n 'description_word_39_count',
'description_word_40_count',\n 'description_word_41_count',
'description_word_42_count',\n 'description_word_43_count',
'description_word_44_count',\n 'description_word_45_count',
'description_word_46_count',\n 'description_word_47_count',
'description_word_48_count',\n 'description_word_49_count',
'description_word_50_count'],\n dtype='object')] are in the
[columns]"
Also I don't know where it finds a '\n' symbols in my column names.
At the same time creating a new dataframe with the data is OK:
new_df = pd.DataFrame(data=data, columns=new_cols)
Does anyone know what is causing the error?
Suppose you have a df like this:
df = pd.DataFrame({'person': [1,1,1], 'event': ['A','B','C']})
You can add new columns like this:
import pandas as pd
import numpy as np
data = np.array([[0, 3, 0, 2, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 0, 0, 0,
1, 1, 2, 0, 0, 2],
[0, 0, 0, 0, 0, 3, 0, 1, 6, 1, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1, 0,
3, 0, 0, 0, 0, 0, 0, 5, 2, 2, 2, 1, 0, 0, 1, 0, 1, 3, 2, 0, 0, 0,
0, 0, 2, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0]])
new_cols = [f'description_word_{i+1}_count' for i in range(50)]
df[new_cols] = pd.DataFrame(data, index=df.index)
I think the problem is that you are using a syntax to create series, when you actually need to create several series. In other words, a dataframe.

How to print a specific information from value_count()?

import pandas as pd
data = {'qtd': [0, 1, 4, 0, 1, 3, 1, 3, 0, 0,
3, 1, 3, 0, 1, 1, 0, 0, 1, 3,
0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
0, 1, 1, 1, 1, 3, 0, 3, 0, 0,
2, 0, 0, 2, 0, 0, 2, 0, 0, 2,
0, 2, 0, 0, 2, 0, 0, 2, 0, 0,
2, 0, 0, 2, 0, 0, 2, 0, 0, 1,
1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
1, 1, 1, 1, 1]
}
df = pd.DataFrame (data, columns = ['qtd'])
Counting
df['qtd'].value_counts()
0 43
1 34
2 10
3 7
4 1
Name: qtd, dtype: int64
What I want is to print a phrase: "The total with zero occurrencies is 43"
Tried with .head(1) but shows more than I want.
Does this solve your problem? The [0] indicates the index you wish to print, in this case the very first occurrence in your column of a data frame.
print('The total with zero occurences is:', df['qtd'].value_counts()[0])
The output of the code above will be:
The total with zero occurences is: 43
I am not sure if you want this but may be helpful:
import inflect
e = inflect.engine()
(df['qtd'].map(e.number_to_words).radd("The total with ").add(" occurances is ")
.value_counts().astype(str).reset_index().agg(':'.join,1))
0 The total with zero occurances is :43
1 The total with one occurances is :34
2 The total with two occurances is :10
3 The total with three occurances is :7
4 The total with four occurances is :1
dtype: object

Fill values in numpy array that are between a certain value

Let's say I have an array that looks like this:
a = np.array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])
I want to fill the values that are between 1's with 1's.
So this would be the desired output:
a = np.array([0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0])
I have taken a look into this answer, which yields the following:
array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1])
I am sure this answer is really close to the output I want. However, although tried countless times, I can't change this code into making it work the way I want, as I am not that proficient with numpy arrays.
Any help is much appreciated!
Try this
b = ((a == 1).cumsum() % 2) | a
Out[10]:
array([0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0], dtype=int32)
From #Paul Panzer: use ufunc.accumulate with bitwise_xor
b = np.bitwise_xor.accumulate(a)|a
Try this:
import numpy as np
num_lst = np.array(
[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])
i = 0
while i < len(num_lst): # Iterate through the list
if num_lst[i]: # Check if element is 1 at i-th position
if not num_lst[i+1]: # Check if next element is 0
num_lst[i+1] = 1 # Change next element to 1
i += 1 # Continue through loop
else: # Check if next element is 1
i += 2 # Skip next element
else:
i += 1 # Continue through loop
print(num_lst)
This is probably not the most elegant way to execute this, but it should work. Basically, we loop through the list to find any 1s. When we find an element that is 1, we check if the next element is 0. If it is, then we change the next element to 1. If the next element is 1, that means we should stop changing 0s to 1s, so we jump over that element and proceed with the iteration.

How to create a matrix consists of submatrix

I want to create a matrix consists of some submatrix, or elements are defined by some conditions about indices.
e.g.
X = np.array(
[[0, 0, 1, 1, 1],
[0, 0, 1, 1, 1],
[1, 1, 0, 0, 1],
[1, 1, 0, 0, 1],
[1, 1, 1, 1, 0]]
)
where i-row and j-col meets conditions below
0 if 2k ≤ i < 2(k+1) and 2k ≤ j < 2(k+1)
1 otherwise
In the above condition, k is 0, 1, 2... and 2 is also a parameter to change and
So, what I finally need is
0 if nk ≤ i < n(k+1) nk ≤ j < n(k+1)
1 otherwise
I think np.ix_ is good for this demand, but it requires a loop structure (I hate loops).
Is there some nice way to generate this?
One way would be utilizing the outer method of the not_equal operator like so:
N = 17; k = 5
np.not_equal.outer(*2*(np.arange(N)//k,)).view('u1')
# array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=uint8)

Transform a matrix made of binomial vectors to ranges for consecutive zeros

I am trying to figure out how to do this transformation symbolically in theano a matrix of undetermined size
From:
[[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1],
.
.
]
To:
[[1, 2, 3, 0, 1, 2, 3, 4, 5, 0, 0, 1, 0, 1, 2, 3, 0],
[1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 0, 0, 0, 0, 0, 0],
.
.
]
So for every consecutive 0 I want an increasing range and whenever I stumble on a 1 the range resets.
Here's one way to do it, using inefficient scans:
import theano
import theano.tensor as tt
def inner_step(x_t_t, y_t_tm1):
return tt.switch(x_t_t, 0, y_t_tm1 + 1)
def outer_step(x_t):
return theano.scan(inner_step, sequences=[x_t], outputs_info=[0])[0]
def compile():
x = tt.bmatrix()
y = theano.scan(outer_step, sequences=[x])[0]
return theano.function([x], y)
def main():
f = compile()
data = [[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1]]
print f(data)
main()
When run, this prints:
[[1 2 3 0 1 2 3 4 5 0 0 1 0 1 2 3 0]
[1 2 3 4 5 6 7 8 0 1 2 0 0 0 0 0 0]]