randomly choose value between two numpy arrays - numpy

I have two numpy arrays:
left = np.array([2, 7])
right = np.array([4, 7])
right_p1 = right + 1
What I want to do is
rand = np.zeros(left.shape[0])
for i in range(left.shape[0]):
rand[i] = np.random.randint(left[i], right_p1[i])
Is there a way I could do this without using a for loop?

You could try with:
extremes = zip(left, right_p1)
rand = map(lambda x: np.random.randint(x[0], x[1]), extremes)
This way you will end up with a map object. If you need to save memory, you can keep it that way, otherwise you can get the full np.array passing through a list conversion, like this:
rand = np.array(list(map(lambda x: np.random.randint(x[0], x[1]), extremes)))

Related

discrete numpy array to continuous array

I have some discrete data in an array, such that:
arr = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
whose plot looks like:
I also have an index array, such that each unique value in arr is associated with a unique index value, like:
ind = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
What is the most pythonic way of converting arr from discrete values to continuous values, so that the array would look like this when plotted?:
therefore, interpolating between the discrete points to make continuous data
I found a solution to this if anyone has a similar issue. It is maybe not the most elegant so modifications are welcome:
def ref_linear_interp(x, y):
arr = []
ux=np.unique(x) #unique x values
for u in ux:
idx = y[x==u]
try:
min = y[x==u-1][0]
max = y[x==u][0]
except:
min = y[x==u][0]
max = y[x==u][0]
try:
min = y[x==u][0]
max = y[x==u+1][0]
except:
min = y[x==u][0]
max = y[x==u][0]
if min==max:
sub = np.full((len(idx)), min)
arr.append(sub)
else:
sub = np.linspace(min, max, len(idx))
arr.append(sub)
return np.concatenate(arr, axis=None).ravel()
y = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
x = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
z = np.arange(1, 16, 1)
Here is an answer for the symmetric solution that I would expect when reading the question:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# create the data as described
numbers = [1,2,3,2,1]
nblock = 3
df = pd.DataFrame({
"x": np.arange(nblock*len(numbers)),
"y": np.repeat(numbers, nblock),
"label": np.repeat(np.arange(len(numbers)), nblock)
})
Expecting a constant block size of 3, we could use a rolling window:
df['y-smooth'] = df['y'].rolling(nblock, center=True).mean()
# fill NaNs
df['y-smooth'].bfill(inplace=True)
df['y-smooth'].ffill(inplace=True)
plt.plot(df['x'], df['y-smooth'], marker='*')
If the block size is allowed to vary, we could determine the block centers and interpolate piecewise.
centers = df[['x', 'y', 'label']].groupby('label').mean()
df['y-interp'] = np.interp(df['x'], centers['x'], centers['y'])
plt.plot(df['x'], df['y-interp'], marker='*')
Note: You may also try
centers = df[['x', 'y', 'label']].groupby('label').min() to select the left corner of the labelled blocks.

Creating 2d array and filling first columns of each row in numpy

I have written the following code for creating a 2D array and filing the first element of each row. I am new to numpy. Is there a better way to do this?
y=np.zeros(N*T1).reshape(N,T1)
x = np.linspace(0,L,num = N)
for k in range(0,N):
y[k][0] = np.sin(PI*x[k]/L)
Simply do this:
y[:, 0] = np.sin(PI*x/L)

How to concatenate two tensors with intervals in tensorflow?

I want to concatenate two tensors checkerboard-ly in tensorflow2, like examples showed below:
example 1:
a = [[1,1],[1,1]]
b = [[0,0],[0,0]]
concated_a_and_b = [[1,0,1,0],[0,1,0,1]]
example 2:
a = [[1,1,1],[1,1,1],[1,1,1]]
b = [[0,0,0],[0,0,0],[0,0,0]]
concated_a_and_b = [[1,0,1,0,1,0],[0,1,0,1,0,1],[1,0,1,0,1,0]]
Is there a decent way in tensorflow2 to concatenate them like this?
A bit of background for this:
I first split a tensor c with a checkerboard mask into two halves a and b. A after some transformation I have to concat them back into oringnal shape and order.
What I mean by checkerboard-ly:
Step 1: Generate a matrix with alternated values
You can do this by first concatenating into [1, 0] pairs, and then by applying a final reshape.
Step 2: Reverse some rows
I split the matrix into two parts, reverse the second part and then rebuild the full matrix by picking alternatively from the first and second part
Code sample:
import math
import numpy as np
import tensorflow as tf
a = tf.ones(shape=(3, 4))
b = tf.zeros(shape=(3, 4))
x = tf.expand_dims(a, axis=-1)
y = tf.expand_dims(b, axis=-1)
paired_ones_zeros = tf.concat([x, y], axis=-1)
alternated_values = tf.reshape(paired_ones_zeros, [-1, a.shape[1] + b.shape[1]])
num_samples = alternated_values.shape[0]
middle = math.ceil(num_samples / 2)
is_num_samples_odd = middle * 2 != num_samples
# Gather first part of the matrix, don't do anything to it
first_elements = tf.gather_nd(alternated_values, [[index] for index in range(middle)])
# Gather second part of the matrix and reverse its elements
second_elements = tf.reverse(tf.gather_nd(alternated_values, [[index] for index in range(middle, num_samples)]), axis=[1])
# Pick alternatively between first and second part of the matrix
indices = np.concatenate([[[index], [index + middle]] for index in range(middle)], axis=0)
if is_num_samples_odd:
indices = indices[:-1]
output = tf.gather_nd(
tf.concat([first_elements, second_elements], axis=0),
indices
)
print(output)
I know this is not a decent way as it will affect time and space complexity. But it solves the above problem
def concat(tf1, tf2):
result = []
for (index, (tf_item1, tf_item2)) in enumerate(zip(tf1, tf2)):
item = []
for (subitem1, subitem2) in zip(tf_item1, tf_item2):
if index % 2 == 0:
item.append(subitem1)
item.append(subitem2)
else:
item.append(subitem2)
item.append(subitem1)
concated_a_and_b.append(item)
return concated_a_and_b

In numpy, what is the efficient way to find the maximum values and their indices of a 3D ndarray across two axis?

How to find the correlation-peak values and coordinates of a set of 2D cross-correlation functions?
Given an 3D ndarray that contains a set of 2D cross-correlation functions. What is the efficient way to find the maximum(peak) values and their coordinates(x and y indices)?
The code below do the work but I think it is inefficient.
import numpy as np
import numpy.matlib
ccorr = np.random.rand(7,5,5)
xind = ccorr.argmax(axis=-1)
mccorr = ccorr[np.matlib.repmat(np.arange(0,7)[:,np.newaxis],1,5),np.matlib.repmat(np.arange(0,5)[np.newaxis,:],7,1), xind]
yind = mccorr.argmax(axis=-1)
xind = xind[np.arange(0,7),yind]
values = mccorr[np.arange(0,7),yind]
print("cross-correlation functions (z,y,x)")
print(ccorr)
print("x and y indices of the maximum values")
print(xind,yind)
print("Maximum values")
print(values)
You'll want to flatten the dimensions you're searching over and then use unravel_index and take_along_axis to get the coordinates and values, respectively.
ccorr = np.random.rand(7,5,5)
cc_rav = ccorr.reshape(ccorr.shape[0], -1)
idx = np.argmax(cc_rav, axis = -1)
indices_2d = np.unravel_index(idx, ccorr.shape[1:])
vals = np.take_along_axis(ccorr, indices = indices_2d, axis = 0)
if you're using numpy version <1.15:
vals = cc_rav[np.arange(ccorr.shape[0]), idx]
or:
vals = ccorr[np.arange(ccorr.shape[0]),
indices_2d[0], indices_2d[1]]

apply generic function in a vectorized fashion using numpy/pandas

I am trying to vectorize my code and, thanks in large part to some users (https://stackoverflow.com/users/3293881/divakar, https://stackoverflow.com/users/625914/behzad-nouri), I was able to make huge progress. Essentially, I am trying to apply a generic function (in this case max_dd_array_ret) to each of the bins I found (see vectorize complex slicing with pandas dataframe for details on date vectorization and Start, End and Duration of Maximum Drawdown in Python for the rationale behind max_dd_array_ret). the problem is the following: I should be able to obtain the result df_2 and, to some degree, ranged_DD(asd_1.values, starts, ends+1) is what I am looking for, except for the tragic effect that it's as if the first two bins are merged and the last one is missing as it can be gauged by looking at the results.
any explanation and fix is very welcomed
import pandas as pd
import numpy as np
from time import time
from scipy.stats import binned_statistic
def max_dd_array_ret(xs):
xs = (xs+1).cumprod()
i = np.argmax(np.maximum.accumulate(xs) - xs) # end of the period
j = np.argmax(xs[:i])
max_dd = abs(xs[j]/xs[i] -1)
return max_dd if max_dd is not None else 0
def get_ranges_arr(starts,ends):
# Taken from https://stackoverflow.com/a/37626057/3293881
counts = ends - starts
counts_csum = counts.cumsum()
id_arr = np.ones(counts_csum[-1],dtype=int)
id_arr[0] = starts[0]
id_arr[counts_csum[:-1]] = starts[1:] - ends[:-1] + 1
return id_arr.cumsum()
def ranged_DD(arr,starts,ends):
# Get all indices and the IDs corresponding to same groups
idx = get_ranges_arr(starts,ends)
id_arr = np.repeat(np.arange(starts.size),ends-starts)
slice_arr = arr[idx]
return binned_statistic(id_arr, slice_arr, statistic=max_dd_array_ret)[0]
asd_1 = pd.Series(0.01 * np.random.randn(500), index=pd.date_range('2011-1-1', periods=500)).pct_change()
index_1 = pd.to_datetime(['2011-2-2', '2011-4-3', '2011-5-1','2011-7-2', '2011-8-3', '2011-9-1','2011-10-2', '2011-11-3', '2011-12-1','2012-1-2', '2012-2-3', '2012-3-1',])
index_2 = pd.to_datetime(['2011-2-15', '2011-4-16', '2011-5-17','2011-7-17', '2011-8-17', '2011-9-17','2011-10-17', '2011-11-17', '2011-12-17','2012-1-17', '2012-2-17', '2012-3-17',])
starts = asd_1.index.searchsorted(index_1)
ends = asd_1.index.searchsorted(index_2)
df_2 = pd.DataFrame([max_dd_array_ret(asd_1.loc[i:j]) for i, j in zip(index_1, index_2)], index=index_1)
print(df_2[0].values)
print(ranged_DD(asd_1.values, starts, ends+1))
results:
df_2
[ 1.75893509 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085
1.43863472 1.85322338 1.84767224 1.32605754 1.48688414 5.44786663]
ranged_DD(asd_1.values, starts, ends+1)
[ 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085 1.43863472
1.85322338 1.84767224 1.32605754 1.48688414]
which are identical except for the first two:
[ 1.75893509 6.08002911 vs [ 6.08002911
and the last two
1.48688414 5.44786663] vs 1.48688414]
p.s.:while looking in more detail at the docs (http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html) I found that this might be the problem
"All but the last (righthand-most) bin is half-open. In other words,
if bins is [1, 2, 3, 4], then the first bin is [1, 2) (including 1,
but excluding 2) and the second [2, 3). The last bin, however, is [3,
4], which includes 4. New in version 0.11.0."
problem is I don't how to reset it.