How to convert a list of indices into a cell list (numpy array of lists) in numpy with vectorized implementation? - numpy

Cell list is a data structure that maintains lists of data points in an N-D meshgrid. For example, the following list of 2d indices:
ind = [(0, 1), (1, 0), (0, 1), (0, 0), (0, 0), (0, 0), (1, 1)]
is converted to the following 2x2 cell list:
cell = [[[3, 4, 5], [0, 2]],
[[1, ], [6, ]]
]
using an O(n) algorithm:
# create an empty 2x2 cell list
cell = [[[] for _ in range(2)] for _ in range(2)]
for i in range(len(ind)):
cell[ind[i][0], ind[i][1]].append(i)
Is there a vectorized way in numpy that can convert the list of indices (ind) into the cell structure described above?

I don't think there is a good pure numpy but you can either use pythran or---if you don't want to touch a compiler---scipy.sparse cf. this Q&A which is essentially a 1D version of your problem.
[stb_pthr.py] simplified from Most efficient way to sort an array into bins specified by an index array?
import numpy as np
#pythran export sort_to_bins(int[:], int)
def sort_to_bins(idx, mx=-1):
if mx==-1:
mx = idx.max() + 1
cnts = np.zeros(mx + 1, int)
for i in range(idx.size):
cnts[idx[i] + 1] += 1
for i in range(1, cnts.size):
cnts[i] += cnts[i-1]
res = np.empty_like(idx)
for i in range(idx.size):
res[cnts[idx[i]]] = i
cnts[idx[i]] += 1
return res, cnts[:-1]
Compile: pythran stb_pthr.py
Main script:
import numpy as np
try:
from stb_pthr import sort_to_bins
HAVE_PYTHRAN = True
except:
HAVE_PYTHRAN = False
from scipy import sparse
def fallback(flat, maxind):
res = sparse.csr_matrix((np.zeros_like(flat),flat,np.arange(len(flat)+1)),
(len(flat), maxind)).tocsc()
return res.indices, res.indptr[1:-1]
if not HAVE_PYTHRAN:
sort_to_bins = fallback
def to_cell(data, shape=None):
data = np.asanyarray(data)
if shape is None:
*shape, = (1 + c.max() for c in data.T)
flat = np.ravel_multi_index((*data.T,), shape)
reord, bnds = sort_to_bins(flat, np.prod(shape))
return np.frompyfunc(np.split(reord, bnds).__getitem__, 1, 1)(
np.arange(np.prod(shape)).reshape(shape))
ind = [(0, 1), (1, 0), (0, 1), (0, 0), (0, 0), (0, 0), (1, 1)]
print(to_cell(ind))
from timeit import timeit
ind = np.transpose(np.unravel_index(np.random.randint(0, 100, (1_000_000)), (10, 10)))
if HAVE_PYTHRAN:
print(timeit(lambda: to_cell(ind), number=10)*100)
sort_to_bins = fallback # !!! MUST REMOVE THIS LINE AFTER TESTING
print(timeit(lambda: to_cell(ind), number=10)*100)
Sample run, output is answer to OP's toy example and timings (in ms) for the pythran and scipy solutions on a 1,000,000 points example:
[[array([3, 4, 5]) array([0, 2])]
[array([1]) array([6])]]
11.411489499732852
29.700406698975712

Related

Python: create (sparse) stacked diagonal block matrix

I need to create a matrix with the form
M=[
[a1, 0, 0],
[0, b1, 0],
[0, 0, c1],
[a2, 0, 0],
[0, b2, 0],
[0, 0, c2],
[a3, 0, 0],
[0, b3, 0],
[0, 0, c3],
...]
where a(i), b(i) and c(i) are [1xp] blocks. The resulting matrix M has the form [3m x 3p]. I am given the input data in the form of 3 matrices [m x p]:
A = [[a1.T, a2.T, a3.T, ...]].T
B = [[b1.T, b2.T, b3.T, ...]].T
C = [[c1.T, c2.T, c3.T, ...]].T
How can I create the matrix M? Ideally it would be sparse using the scipy.sparse library but I am even struggling creating it as a dense matrix using numpy. Is there no way around a loop or at least list comprehension in this case?
No need to make it complicated. For your scale, the following executes in less than a second.
import numpy as np
import scipy.sparse
from numpy.random import default_rng
rand = default_rng(seed=0)
m = 70_000
p = 20
abc = rand.random((3, m, p))
M_dense = np.zeros((m, 3, 3*p))
for i in range(3):
M_dense[:, i, i*p:(i+1)*p] = abc[i, ...]
M_sparse = scipy.sparse.csr_matrix(M_dense.reshape((-1, 3*p)))
print(M_sparse.shape)
(210000, 60)
Far better, though, is to construct the sparse matrix directly. Note the permuted shape of abc.
abc = rand.random((m, 3, p))
data = abc.ravel()
indices = np.tile(np.arange(3*p), m)
indptr = np.arange(0, data.size+1, p)
M_sparse = scipy.sparse.csr_matrix((data, indices, indptr))

How do I input a Time Series in spmvg nfoursid

I want to use this algorithm for n4sid model estimation. However, in the Documentation, there is an input DataFrame generated from Random Samples, where I want to input a Time Series Dataframe. Calling the nfoursid method leads to an Type Error or Value Error.
Documentation:
https://github.com/spmvg/nfoursid/blob/master/examples/Overview.ipynb
Imported libs:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nfoursid.kalman import Kalman
from nfoursid.nfoursid import NFourSID
from nfoursid.state_space import StateSpace
import time
import datetime
import math
import scipy as sp
My input Time Series as Data Frame (flawless):
import yfinance as yfin
yfin.pdr_override()
spy = pdr.get_data_yahoo('AAPL',start='2022-08-23',end='2022-10-24')
spy['Log Return'] = np.log(spy['Adj Close']/spy['Adj Close'].shift(1))
AAPL=pd.DataFrame((spy['Log Return']))
The input DataFrame as proposed in the documentation:
state_space = StateSpace(A, B, C, D)
for _ in range(NUM_TRAINING_DATAPOINTS):
input_state = np.random.standard_normal((INPUT_DIM, 1))
noise = np.random.standard_normal((OUTPUT_DIM, 1)) * NOISE_AMPLITUDE
state_space.step(input_state, noise)
The call using the input proposed in the documentation:
#---->libs already imported
pd.set_option('display.max_columns', None)
np.random.seed(0) # reproducible results
NUM_TRAINING_DATAPOINTS = 1000
# create a training-set by simulating a state-space model with this many datapoints
NUM_TEST_DATAPOINTS = 20 # same for the test-set
INPUT_DIM = 3 #---->this probably needs to adapted to the AAPL dimensions
OUTPUT_DIM = 2
INTERNAL_STATE_DIM = 4 # actual order of the state-space model in the training- and test-set
NOISE_AMPLITUDE = .1 # add noise to the training- and test-set
FIGSIZE = 8
# define system matrices for the state-space model of the training- and test-set
A = np.array([
[1, .01, 0, 0],
[0, 1, .01, 0],
[0, 0, 1, .02],
[0, -.01, 0, 1],
]) / 1.01
B = np.array([
[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[0, 1, 1],
]
) / 3
C = np.array([
[1, 0, 1, 1],
[0, 0, 1, -1],
])
D = np.array([
[1, 0, 1],
[0, 1, 0]
]) / 10
)
#---->maybe I have to input the DataFrame already here at the state-space model:
state_space = StateSpace(A, B, C, D)
for _ in range(NUM_TRAINING_DATAPOINTS):
input_state = np.random.standard_normal((INPUT_DIM, 1))
noise = np.random.standard_normal((OUTPUT_DIM, 1)) * NOISE_AMPLITUDE
state_space.step(input_state, noise)
#----
#---->This is the method with the input DF, in this case the random state-space model
nfoursid = NFourSID(
state_space.to_dataframe(), # the state-space model can summarize inputs and outputs as a dataframe
output_columns=state_space.y_column_names,
input_columns=state_space.u_column_names,
num_block_rows=10
)
nfoursid.subspace_identification()
Pasting my DF at the call of the method nfoursid which leads to an error:
df2 = pd.DataFrame()
nfoursid = NFourSID(
output_columns=df2,
input_columns=AAPL,
num_block_rows=10
)
TypeError: NFourSID.init() missing 1 required positional argument: 'dataframe'
Pasting DF in the state_space led to:
ValueError: Dimensions of u (43, 1) are inconsistent. Expected (3, 1).
and
TypeError: 'DataFrame' object is not callable

How to plot ROC-AUC figures without using scikit-learn

I have the following list containing multiple tuples of (TP, FP, FN):
[(12, 0, 0), (5, 2, 2), (10, 0, 1), (7, 1, 1), (13, 0, 0), (7, 2, 2), (11, 0, 2)]
each tuple represents the scores for a single image. This means I have 7 images and I have calculated the scores for a object detection task. Now I calculate precision and recall for each image(tuple) using the following function:
def calculate_recall_precision(data):
precisions_bundle = []
recalls_bundle = []
for tp, fp, fn in data:
precision = tp / (tp + fp)
recall = tp / (tp + fn)
precisions_bundle.append(precision)
recalls_bundle.append(recall)
return (precisions_bundle, recalls_bundle)
This function returns a tuple which contains two lists. The first one is precision values for each image and the second one is recall values for each image.
Now my main goal is to plot ROC and AUC curves using only matplotlib. Please note that I do not want to use scikit-learn library.
You can simply use matplotlib.pyplot.plot method. For example:
import numpy as np
import matplotlib.pyplot as plt
def plot_PR(precision_bundle, recall_bundle, save_path:Path=None):
line = plt.plot(recall_bundle, precision_bundle, linewidth=2, markersize=6)
line = plt.title('Precision/Recall curve', size =18, weight='bold')
line = plt.ylabel('Precision', size=15)
line = plt.xlabel('Recall', size=15 )
random_classifier_line_x = np.linspace(0, 1, 10)
random_classifier_line_y = np.linspace(1, 0, 10)
_ = plt.plot(random_classifier_line_x, random_classifier_line_y, color='firebrick', linestyle='--')
if save_path:
outname = save_path / 'PR_curve_thresh_opt.png'
_ = plt.savefig(outname, dpi = 100, bbox_inches='tight' )
return line
and then just use it as plot_PR(precision_bundle, recall_bundle).
Note: here I also added a dashed line for a random classifier and the possibility to save the figure in case you want to

How to transpose each element of a Numpy Matrix [duplicate]

I'm starting off with a numpy array of an image.
In[1]:img = cv2.imread('test.jpg')
The shape is what you might expect for a 640x480 RGB image.
In[2]:img.shape
Out[2]: (480, 640, 3)
However, this image that I have is a frame of a video, which is 100 frames long. Ideally, I would like to have a single array that contains all the data from this video such that img.shape returns (480, 640, 3, 100).
What is the best way to add the next frame -- that is, the next set of image data, another 480 x 640 x 3 array -- to my initial array?
A dimension can be added to a numpy array as follows:
image = image[..., np.newaxis]
Alternatively to
image = image[..., np.newaxis]
in #dbliss' answer, you can also use numpy.expand_dims like
image = np.expand_dims(image, <your desired dimension>)
For example (taken from the link above):
x = np.array([1, 2])
print(x.shape) # prints (2,)
Then
y = np.expand_dims(x, axis=0)
yields
array([[1, 2]])
and
y.shape
gives
(1, 2)
You could just create an array of the correct size up-front and fill it:
frames = np.empty((480, 640, 3, 100))
for k in xrange(nframes):
frames[:,:,:,k] = cv2.imread('frame_{}.jpg'.format(k))
if the frames were individual jpg file that were named in some particular way (in the example, frame_0.jpg, frame_1.jpg, etc).
Just a note, you might consider using a (nframes, 480,640,3) shaped array, instead.
Pythonic
X = X[:, :, None]
which is equivalent to
X = X[:, :, numpy.newaxis] and
X = numpy.expand_dims(X, axis=-1)
But as you are explicitly asking about stacking images,
I would recommend going for stacking the list of images np.stack([X1, X2, X3]) that you may have collected in a loop.
If you do not like the order of the dimensions you can rearrange with np.transpose()
You can use np.concatenate() use the axis parameter to specify the dimension that should be concatenated. If the arrays being concatenated do not have this dimension, you can use np.newaxis to indicate where the new dimension should be added:
import numpy as np
movie = np.concatenate((img1[:,np.newaxis], img2[:,np.newaxis]), axis=3)
If you are reading from many files:
import glob
movie = np.concatenate([cv2.imread(p)[:,np.newaxis] for p in glob.glob('*.jpg')], axis=3)
Consider Approach 1 with reshape method and Approach 2 with np.newaxis method that produce the same outcome:
#Lets suppose, we have:
x = [1,2,3,4,5,6,7,8,9]
print('I. x',x)
xNpArr = np.array(x)
print('II. xNpArr',xNpArr)
print('III. xNpArr', xNpArr.shape)
xNpArr_3x3 = xNpArr.reshape((3,3))
print('IV. xNpArr_3x3.shape', xNpArr_3x3.shape)
print('V. xNpArr_3x3', xNpArr_3x3)
#Approach 1 with reshape method
xNpArrRs_1x3x3x1 = xNpArr_3x3.reshape((1,3,3,1))
print('VI. xNpArrRs_1x3x3x1.shape', xNpArrRs_1x3x3x1.shape)
print('VII. xNpArrRs_1x3x3x1', xNpArrRs_1x3x3x1)
#Approach 2 with np.newaxis method
xNpArrNa_1x3x3x1 = xNpArr_3x3[np.newaxis, ..., np.newaxis]
print('VIII. xNpArrNa_1x3x3x1.shape', xNpArrNa_1x3x3x1.shape)
print('IX. xNpArrNa_1x3x3x1', xNpArrNa_1x3x3x1)
We have as outcome:
I. x [1, 2, 3, 4, 5, 6, 7, 8, 9]
II. xNpArr [1 2 3 4 5 6 7 8 9]
III. xNpArr (9,)
IV. xNpArr_3x3.shape (3, 3)
V. xNpArr_3x3 [[1 2 3]
[4 5 6]
[7 8 9]]
VI. xNpArrRs_1x3x3x1.shape (1, 3, 3, 1)
VII. xNpArrRs_1x3x3x1 [[[[1]
[2]
[3]]
[[4]
[5]
[6]]
[[7]
[8]
[9]]]]
VIII. xNpArrNa_1x3x3x1.shape (1, 3, 3, 1)
IX. xNpArrNa_1x3x3x1 [[[[1]
[2]
[3]]
[[4]
[5]
[6]]
[[7]
[8]
[9]]]]
a = np.expand_dims(a, axis=-1)
or
a = a[:, np.newaxis]
or
a = a.reshape(a.shape + (1,))
There is no structure in numpy that allows you to append more data later.
Instead, numpy puts all of your data into a contiguous chunk of numbers (basically; a C array), and any resize requires allocating a new chunk of memory to hold it. Numpy's speed comes from being able to keep all the data in a numpy array in the same chunk of memory; e.g. mathematical operations can be parallelized for speed and you get less cache misses.
So you will have two kinds of solutions:
Pre-allocate the memory for the numpy array and fill in the values, like in JoshAdel's answer, or
Keep your data in a normal python list until it's actually needed to put them all together (see below)
images = []
for i in range(100):
new_image = # pull image from somewhere
images.append(new_image)
images = np.stack(images, axis=3)
Note that there is no need to expand the dimensions of the individual image arrays first, nor do you need to know how many images you expect ahead of time.
You can use stack with the axis parameter:
img.shape # h,w,3
imgs = np.stack([img1,img2,img3,img4], axis=-1) # -1 = new axis is last
imgs.shape # h,w,3,nimages
For example: to convert grayscale to color:
>>> d = np.zeros((5,4), dtype=int) # 5x4
>>> d[2,3] = 1
>>> d3.shape
Out[30]: (5, 4, 3)
>>> d3 = np.stack([d,d,d], axis=-2) # 5x4x3 -1=as last axis
>>> d3[2,3]
Out[32]: array([1, 1, 1])
I followed this approach:
import numpy as np
import cv2
ls = []
for image in image_paths:
ls.append(cv2.imread('test.jpg'))
img_np = np.array(ls) # shape (100, 480, 640, 3)
img_np = np.rollaxis(img_np, 0, 4) # shape (480, 640, 3, 100).
This worked for me:
image = image[..., None]
This will help you add axis anywhere you want
import numpy as np
signal = np.array([[0.3394572666491664, 0.3089068053925853, 0.3516359279582483], [0.33932706934615525, 0.3094755563319447, 0.3511973743219001], [0.3394407172182317, 0.30889042266755573, 0.35166886011421256], [0.3394407172182317, 0.30889042266755573, 0.35166886011421256]])
print(signal.shape)
#(4,3)
print(signal[...,np.newaxis].shape) or signal[...:none]
#(4, 3, 1)
print(signal[:, np.newaxis, :].shape) or signal[:,none, :]
#(4, 1, 3)
there is three-way for adding new dimensions to ndarray .
first: using "np.newaxis" (something like #dbliss answer)
np.newaxis is just given an alias to None for making it easier to
understand. If you replace np.newaxis with None, it works the same
way. but it's better to use np.newaxis for being more explicit.
import numpy as np
my_arr = np.array([2, 3])
new_arr = my_arr[..., np.newaxis]
print("old shape", my_arr.shape)
print("new shape", new_arr.shape)
>>> old shape (2,)
>>> new shape (2, 1)
second: using "np.expand_dims()"
Specify the original ndarray in the first argument and the position
to add the dimension in the second argument axis.
my_arr = np.array([2, 3])
new_arr = np.expand_dims(my_arr, -1)
print("old shape", my_arr.shape)
print("new shape", new_arr.shape)
>>> old shape (2,)
>>> new shape (2, 1)
third: using "reshape()"
my_arr = np.array([2, 3])
new_arr = my_arr.reshape(*my_arr.shape, 1)
print("old shape", my_arr.shape)
print("new shape", new_arr.shape)
>>> old shape (2,)
>>> new shape (2, 1)

How to use gather on 1D vector?

Have been trying to use cntk.ops.gather on 1D vectors. Here is a snippet illustrating what doesn't work:
import cntk
import numpy as np
def main():
xx = cntk.input_variable(shape=(1))
yy = cntk.input_variable(shape=(1))
zz = cntk.sequence.gather(xx, yy)
xx_value = np.arange(15, dtype=np.float64)
yy_value = np.array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1], dtype=np.float64)
aa = zz.eval({xx: xx_value.reshape(-1, 1), yy: yy_value.reshape(-1, 1)})
print(aa)
if __name__ == "__main__":
main()
The reason for this is that cntk expects a batch of examples to be provided.
When it sees, a (15,1) array it converts it to a batch of 15 examples each of length 1.
Then when gather is applied cntk is unhappy because some examples in the minibatch produce empty sequences (those for which there is a 0 in yy_value).
You can solve your problem by specifying the fact that you only have one example in the minibatch in a couple different ways.
you can provide the values in lists like this
aa = zz.eval({xx: [xx_value.reshape(-1, 1)], yy: [yy_value.reshape(-1, 1)]})
you can provide the values in a tensor of shape (1,15,1) like this:
aa = zz.eval({xx: xx_value.reshape(1, -1, 1), yy: yy_value.reshape(1, -1, 1)})
The latter works only if all the sequences in a minibatch have the same length.