ValueError: setting an array element with a sequence - least-squares

This Python code:
import numpy, math
import scipy.optimize as optimization
import matplotlib.pyplot as plt

# Create toy data for curve_fit.
zo  = numpy.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
mu  = numpy.array([0.1, 0.9, 2.2, 2.8, 3.9, 5.1])
sig = numpy.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Define hubble function.
def Hubble(x, a, b):
    return H0 * m.sqrt(a*(1+x)**2 + 1/2 * a * (1+b)**3)

# Define
def Distancez(x, a, b):
    return c * (1+x) * np.asarray(quad(lambda tmp: 1/Hubble(a, b, tmp), 0, x))

def mag(x, a, b):
    return 5*np.log10(Distancez(x, a, b)) + 25
    #return a+b*x

# Compute chi-square manifold.
Steps = 101                                  # grid size
Chi2Manifold = numpy.zeros([Steps, Steps])   # allocate grid
amin = 0.2   # minimal value of a covered by grid
amax = 0.3   # maximal value of a covered by grid
bmin = 0.3   # minimal value of b covered by grid
bmax = 0.6   # maximal value of b covered by grid
for s1 in range(Steps):
    for s2 in range(Steps):
        # Current values of (a, b) at grid position (s1, s2).
        a = amin + (amax - amin)*float(s1)/(Steps-1)
        b = bmin + (bmax - bmin)*float(s2)/(Steps-1)
        # Evaluate chi-squared.
        chi2 = 0.0
        for n in range(len(xdata)):
            residual = (mu[n] - mag(zo[n], a, b))/sig[n]
            chi2 = chi2 + residual*residual
        Chi2Manifold[Steps-1-s2, s1] = chi2  # write result to grid.
Throws this error message:
ValueError                                Traceback (most recent call last)
<ipython-input-136-d0ef47a881a7> in <module>()
     36             residual = (mu[n] - mag(zo[n], a, b))/sig[n]
     37             chi2 = chi2 + residual*residual
---> 38         Chi2Manifold[Steps-1-s2, s1] = chi2  # write result to grid.

ValueError: setting an array element with a sequence.
Note: If I define a simple mag function such as (a+b*x), I do not get any error message.
In fact all three functions Hubble, Distancez and mag have to be functions of the redshift z, which is an array.
Do you think I need to redefine all these functions so that they output arrays? I mean, first create an array of redshifts, so that the output of the functions automatically becomes an array?
I need the output of the Distancez() and mag() functions to be arrays. I managed to do it simply by changing the upper limit of the integral in the Distancez function from x to x.any(). Now I have an array, and this is what I want. However, now I see that the output value of, for example, Distancez(0.25, 0.5, 0.3) is different from when I just put x in the upper limit of the integral. Any help would be appreciated.

The ValueError is saying that it cannot assign an element of the array Chi2Manifold a value that is a sequence. chi2 is probably a NumPy array, because residual is a NumPy array, because your mag() function returns a NumPy array, all because your Distancez() function returns a NumPy array (you are telling it to do this with that np.asarray()).
If Distancez() returned a scalar floating-point value you'd probably be set. Do you need the np.asarray() in Distancez()? Is that actually a 1-element array, or do you perhaps intend to reduce it somehow to a scalar? I don't know what your Hubble() function is supposed to do and I'm not an astronomer, but in my experience distances are often scalars ;).
If chi2 is meant to be a sequence or numpy array, you probably want to set an appropriately-sized range of values in Chi2Manifold to chi2.
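For what it's worth, here is a minimal sketch of that scalar version, assuming H0 and c are defined elsewhere and that you only want the value of the integral: scipy.integrate.quad returns a (value, abserr) tuple, so keeping only the first element gives a plain float. (As an aside, the snippet calls Hubble(a, b, tmp) while the function signature is Hubble(x, a, b); the sketch passes the integration variable first.)
import math
from scipy.integrate import quad

def Hubble(x, a, b):
    return H0 * math.sqrt(a*(1 + x)**2 + 1/2 * a * (1 + b)**3)

def Distancez(x, a, b):
    # quad returns (value, abserr); keep only the value so the result is a scalar
    integral, _ = quad(lambda tmp: 1/Hubble(tmp, a, b), 0, x)
    return c * (1 + x) * integral
With a scalar Distancez, mag() and residual become scalars too, and the assignment into Chi2Manifold works. As for the x.any() change you mention: .any() returns a boolean, so the upper limit of the integral becomes 0 or 1 instead of the redshift, which is why the values come out different.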

Related

Is nx.eigenvector_centrality_numpy() using the Arnoldi iteration instead of the basic power method?

Since nx.eigenvector_centrality_numpy() uses ARPACK, does that mean nx.eigenvector_centrality_numpy() uses the Arnoldi iteration instead of the basic power method?
I ask because when I try to compute it manually using the basic power method, my result is different from the result of nx.eigenvector_centrality_numpy(). Can someone explain this to me?
To make it clearer, here is my code, the result I got from the function, and the result when I compute it manually.
import networkx as nx
G = nx.DiGraph()
G.add_edge('a', 'b', weight=4)
G.add_edge('b', 'a', weight=2)
G.add_edge('b', 'c', weight=2)
G.add_edge('b','d', weight=2)
G.add_edge('c','b', weight=2)
G.add_edge('d','b', weight=2)
centrality = nx.eigenvector_centrality_numpy(G, weight='weight')
centrality
The result:
{'a': 0.37796447300922725,
'b': 0.7559289460184545,
'c': 0.3779644730092272,
'd': 0.3779644730092272}
Below is code from Power Method Python Program, which I modified a little bit:
# Power Method to Find Largest Eigen Value and Eigen Vector
# Importing NumPy Library
import numpy as np
import sys

# Reading order of matrix
n = int(input('Enter order of matrix: '))

# Making numpy array of n x n size and initializing
# to zero for storing matrix
a = np.zeros((n, n))

# Reading matrix
print('Enter Matrix Coefficients:')
for i in range(n):
    for j in range(n):
        a[i][j] = float(input('a['+str(i)+']['+str(j)+']='))

# Making numpy array n x 1 size and initializing to zero
# for storing initial guess vector
x = np.zeros((n))

# Reading initial guess vector
print('Enter initial guess vector: ')
for i in range(n):
    x[i] = float(input('x['+str(i)+']='))

# Reading tolerable error
tolerable_error = float(input('Enter tolerable error: '))

# Reading maximum number of steps
max_iteration = int(input('Enter maximum number of steps: '))

# Power Method Implementation
lambda_old = 1.0
condition = True
step = 1
while condition:
    # Multiplying a and x
    ax = np.matmul(a, x)
    # Finding new Eigen value and Eigen vector
    x = ax/np.linalg.norm(ax)
    lambda_new = np.vdot(ax, x)
    # Displaying Eigen value and Eigen Vector
    print('\nSTEP %d' % (step))
    print('----------')
    print('Eigen Value = %0.5f' % (lambda_new))
    print('Eigen Vector: ')
    for i in range(n):
        print('%0.5f\t' % (x[i]))
    # Checking maximum iteration
    step = step + 1
    if step > max_iteration:
        print('Not convergent in given maximum iteration!')
        break
    # Calculating error
    error = abs(lambda_new - lambda_old)
    print('errror=' + str(error))
    lambda_old = lambda_new
    condition = error > tolerable_error
I used the same matrix, and here is the result:
STEP 99
----------
Eigen Value = 3.70328
Eigen Vector:
0.51640
0.77460
0.25820
0.25820
errror=0.6172133998483682
STEP 100
----------
Eigen Value = 4.32049
Eigen Vector:
0.71714
0.47809
0.35857
0.35857
Not convergent in given maximum iteration!
I have tried to compute it with my calculator too, and I know it's not convergent because |lambda1| = |lambda2| = 4. I need to understand the theory behind nx.eigenvector_centrality_numpy() properly so I can describe it correctly in my thesis. Help me, please.
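For reference: in the NetworkX versions I have checked, nx.eigenvector_centrality_numpy() calls ARPACK (scipy.sparse.linalg.eigs) on the transposed weighted adjacency matrix, so yes, it uses an Arnoldi-type iteration rather than the plain power method, and the returned vector is normalized to unit Euclidean length. A minimal sketch (using the same graph as above) that reproduces the posted numbers with a dense eigendecomposition instead of the power method:
import numpy as np
import networkx as nx

G = nx.DiGraph()
G.add_edge('a', 'b', weight=4)
G.add_edge('b', 'a', weight=2)
G.add_edge('b', 'c', weight=2)
G.add_edge('b', 'd', weight=2)
G.add_edge('c', 'b', weight=2)
G.add_edge('d', 'b', weight=2)

nodes = list(G)
A = nx.to_numpy_array(G, nodelist=nodes, weight='weight')

# Dominant eigenvector of A.T: a node's score comes from its incoming edges.
vals, vecs = np.linalg.eig(A.T)
v = vecs[:, np.argmax(vals.real)].real
v = np.abs(v) / np.linalg.norm(v)   # fix the sign and L2-normalize

print(dict(zip(nodes, v)))
# should be close to nx.eigenvector_centrality_numpy(G, weight='weight')
This also explains the behaviour you saw: as you note, this adjacency matrix has two eigenvalues of equal magnitude (+4 and -4), so the plain power method oscillates and never converges, while a full eigendecomposition (or ARPACK) has no such problem.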

Why do numpy and pytorch give different results after mean and variance normalization?

I am working on a problem in which a matrix has to be mean-var normalized row-wise. It is also required that the normalization is applied after splitting each row into tiny batches.
The code seems to work with NumPy, but fails with PyTorch (which is required for training).
It seems the PyTorch and NumPy results differ. Any help will be greatly appreciated.
Example code:
import numpy as np
import torch

def normalize(x, bsize, eps=1e-6):
    nc = x.shape[1]
    if nc % bsize != 0:
        raise Exception(f'Number of columns must be a multiple of bsize')
    x = x.reshape(-1, bsize)
    m = x.mean(1).reshape(-1, 1)
    s = x.std(1).reshape(-1, 1)
    n = (x - m) / (eps + s)
    n = n.reshape(-1, nc)
    return n

# numpy
a = np.float32(np.random.randn(8, 8))
n1 = normalize(a, 4)

# torch
b = torch.tensor(a)
n2 = normalize(b, 4)
n2 = n2.numpy()

print(abs(n1 - n2).max())
In the first example you are calling normalize with a, a numpy.ndarray, while in the second you call normalize with b, a torch.Tensor.
According to the documentation page of torch.std, Bessel's correction is used by default when computing the standard deviation. As such, the default behavior of numpy.ndarray.std and torch.Tensor.std differs:
torch.std(input, dim, unbiased, keepdim=False, *, out=None) → Tensor
Parameters
input (Tensor) – the input tensor.
unbiased (bool) – whether to use Bessel's correction (δN = 1).
If unbiased is True, Bessel's correction will be used. Otherwise, the sample deviation is calculated, without any correction.
You can try yourself:
>>> a.std(), b.std(unbiased=True), b.std(unbiased=False)
(0.8364538, tensor(0.8942), tensor(0.8365))
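One way to make the two code paths agree (a sketch, not the only option) is to use the population (biased) estimator in both libraries, for example by passing unbiased=False to the torch call; equivalently, you could pass ddof=1 to NumPy's std if you prefer the sample estimator everywhere.
import numpy as np
import torch

def normalize(x, bsize, eps=1e-6):
    nc = x.shape[1]
    if nc % bsize != 0:
        raise Exception('Number of columns must be a multiple of bsize')
    x = x.reshape(-1, bsize)
    m = x.mean(1).reshape(-1, 1)
    # Use the biased (population) std in both libraries so the results match.
    if isinstance(x, torch.Tensor):
        s = x.std(1, unbiased=False).reshape(-1, 1)  # matches NumPy's default ddof=0
    else:
        s = x.std(1).reshape(-1, 1)                  # NumPy's default is already biased
    return ((x - m) / (eps + s)).reshape(-1, nc)

a = np.float32(np.random.randn(8, 8))
n1 = normalize(a, 4)
n2 = normalize(torch.tensor(a), 4).numpy()
print(abs(n1 - n2).max())   # now down to float32 round-off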

Adding a third dimension to my 2D array in a for loop

I have a for loop that gives me a 16 x 8 2D array per entry in the loop. I want to stack all of these 2D arrays along the z-axis in a 3D array, so I can determine the variance over the z-axis. I have tried multiple commands, such as np.dstack, matrix3D[p,:,:] = ..., and np.newaxis, both inside and outside the loop. However, the closest I've come to my desired output is just a repetition of the last array stacked on top of itself, and the dimensions were way off; I need to keep the original 16 x 8 format. By now I'm in a bit too deep and could use a nudge in the right direction!
My code:
excludedElectrodes = [1, a.numberOfColumnsInArray, a.numberOfElectrodes - a.numberOfColumnsInArray + 1, a.numberOfElectrodes]
matrixEA = np.full([a.numberOfRowsInArray, a.numberOfColumnsInArray], np.nan)
for iElectrode in range(a.numberOfElectrodes):
    if a.numberOfDeflectionsPerElectrode[iElectrode] != 0:
        matrixEA[iElectrode // a.numberOfColumnsInArray][iElectrode % a.numberOfColumnsInArray] = 0
for iElectrode in range(a.numberOfElectrodes):
    if iElectrode + 1 not in excludedElectrodes:
        """Preprocessing"""
        # Loop over heartbeats
        for p in range(1, len(iLAT)):
            # Calculate parameters, store them in right row-col combo (electrode number)
            matrixEA[iElectrode // a.numberOfColumnsInArray][iElectrode % a.numberOfColumnsInArray] = (np.trapz(abs(correctedElectrogram[limitA[0]:limitB[0]] - totalBaseline[limitA[0]:limitB[0]])) / 1000)
# Stack all matrixEA arrays along z axis
matrix3D = np.dstack(matrixEA)
This example snippet does what you want, although I suspect your errors have more to do with things outside of the concatenation step. Here, we use None indexing on the arrays to create a new empty dimension (along which we concatenate the 2D arrays).
import numpy as np

# Function that creates a dummy (16, 8) array
def foo(a):
    return np.random.random((16, 8)) + a

arrays2D = []
# Your loop
for i in range(10):
    # Calculate your (16, 8) array
    f = foo(i)
    # And append it to the list
    arrays2D.append(f)

# Stack arrays along a new dimension
array3D = np.concatenate([i[..., None] for i in arrays2D], axis=-1)
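If all the arrays share the same (16, 8) shape, np.stack is an equivalent, slightly shorter way to add the new axis, and the variance over it then follows directly:
# Equivalent: stack along a new last axis, then take the variance over it.
array3D = np.stack(arrays2D, axis=-1)   # shape (16, 8, 10)
variance = array3D.var(axis=-1)         # shape (16, 8)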

Difficulty with numpy broadcasting

I have two 2D point clouds (oldPts and newPts) which I wish to combine. They are m x 2 and n x 2 numpy integer arrays, with m and n of order 2000. newPts contains many duplicates or near duplicates of oldPts and I need to remove these before combining.
So far I have used the histogram2d function to produce a 2D representation of oldPts (H). I then compare each new point to an N x N area of H and, if it is empty, I accept the point. This last part I am currently doing with a Python loop which I would like to remove. Can anybody show me how to do this with broadcasting, or perhaps suggest a completely different method of going about the problem? The working code is below.
npzfile = np.load(path + datasetNo + '\\temp.npz')
arrs = npzfile.files
oldPts = npzfile[arrs[0]]
newPts = npzfile[arrs[1]]

# remove all the negative values
oldPts = oldPts[oldPts.min(axis=1) >= 0, :]
newPts = newPts[newPts.min(axis=1) >= 0, :]

# round to integers
oldPts = np.around(oldPts).astype(int)
newPts = newPts.astype(int)

# put the oldPts into 2d array
H, xedg, yedg = np.histogram2d(oldPts[:, 0], oldPts[:, 1],
                               bins=[xMax, yMax],
                               range=[[0, xMax], [0, yMax]])

finalNewList = []
N = 5
for pt in newPts:
    if not H[max(0, pt[0]-N):min(xMax, pt[0]+N),
             max(0, pt[1]-N):min(yMax, pt[1]+N)].any():
        finalNewList.append(pt)
finalNew = np.array(finalNewList)
The right way to do this is to compute the distance between each pair of points (2-long vectors) and then accept only the new points that are "different enough" from every old point, using scipy.spatial.distance.cdist:
import numpy as np
oldPts = np.random.randn(1000, 2)
newPts = np.random.randn(2000, 2)

from scipy.spatial.distance import cdist
dist = cdist(oldPts, newPts)
print(dist.shape)  # (1000, 2000)

# keep a new point only if its distance to *every* old point exceeds the threshold
okIndex = np.min(dist, axis=0) > 5
print(np.sum(okIndex))   # number of accepted new points
finalNew = newPts[okIndex, :]
print(finalNew.shape)    # (number of accepted points, 2)
Above I use a Euclidean distance of 5 as the threshold for "too close": any point in newPts that is farther than 5 from all points in oldPts is accepted into finalNew. You will have to look at the range of values in dist to find a good threshold, but your histogram can guide you in picking the best one.
(One good way to visualize dist is to use matplotlib.pyplot.imshow(dist).)
This is a more refined version of what you were doing with the histogram. In fact, you ought to be able to get essentially the same answer as the histogram by passing metric='chebyshev' to cdist (Chebyshev distance is the maximum coordinate difference, which corresponds to your square N x N window), assuming your histogram bin widths are the same in both dimensions and using 5 again as the threshold.
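For illustration, a sketch of that window-equivalent variant, assuming oldPts, newPts and N are defined as in the question:
from scipy.spatial.distance import cdist

# Chebyshev distance = max(|dx|, |dy|), so "closer than N" means the old point
# lies inside the +/-N square window around the new point.
dist = cdist(oldPts, newPts, metric='chebyshev')
keep = dist.min(axis=0) >= N   # no old point inside the window
finalNew = newPts[keep, :]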
(PS. If you're interested in another useful function in scipy.spatial.distance, check out my answer that uses pdist to find unique rows/columns in an array.)

Build a numpy array from a random distribution until the last column exceeds a threshold

I want to build a 2d numpy array from a random distribution so that each of the values in the last column of each row exceeds a threshold.
Here's the working code I have now. Is there a cleaner way to build numpy arrays with an arbitrary condition?
from typing import Callable
import numpy as np

def new_array(
        num_rows: int,
        dist: Callable[[int], np.ndarray],
        min_hours: int) -> np.ndarray:
    # Get the 40th percentile as a reasonable guess for how many samples we need.
    # Use a lower percentile to increase num_cols and avoid looping in most cases.
    p40_val = np.quantile(dist(20), 0.4)
    # Generate at least 10 columns each time.
    num_cols = max(int(min_hours / p40_val), 10)

    def create_starts() -> np.ndarray:
        return dist(num_rows * num_cols).reshape((num_rows, num_cols)).cumsum(axis=1)

    max_iters = 20
    starts = create_starts()
    for _ in range(max_iters):
        if np.min(starts[:, -1]) >= min_hours:
            # All the last columns exceed min_hours.
            break
        last_col_vals = starts[:, -1].repeat(num_cols).reshape(starts.shape)
        next_starts = create_starts() + last_col_vals
        starts = np.append(starts, next_starts, axis=1)
    else:
        # We didn't break out of the for loop, so we hit the max iterations.
        raise AssertionError('Failed to create enough samples to exceed '
                             'sim duration for all columns')
    # Only keep columns up to the column where each value > min_hours.
    mins_per_col = np.min(starts, axis=0)
    cols_exceeding_sim_duration = np.nonzero(mins_per_col > min_hours)[0]
    cols_to_keep = cols_exceeding_sim_duration[0]
    return np.delete(starts, np.s_[cols_to_keep:], axis=1)
new_array(5, lambda size: np.random.normal(3, size=size), 7)
# Example output
array([[1.47584632, 4.04034105, 7.19592256],
[3.10804306, 6.46487043, 9.74177227],
[1.03633165, 2.62430309, 6.92413189],
[3.46100139, 6.53068143, 7.37990547],
[2.70152742, 6.09488369, 9.58376664]])
I simplified several things and replaced them with NumPy's logical indexing. The for loop is now a while loop, and there is no need to handle the error case, as it just runs until there are enough rows.
Is this still working as you expect it to?
def new_array(num_rows, dist, min_hours):
    # Get the 40th percentile as a reasonable guess for how many samples we need.
    # Use a lower percentile to increase num_cols and avoid looping in most cases.
    p40_val = np.quantile(dist(20), 0.4)
    # Generate at least 10 columns each time.
    num_cols = max(int(min_hours / p40_val), 10)

    # no need to reshape here, size can be a shape tuple
    def create_starts() -> np.ndarray:
        return dist((num_rows, num_cols)).cumsum(axis=1)

    # append to a list and stack it into a Numpy array once at the end;
    # faster than numpy.append due to Numpy's pre-allocation, which would
    # slow things down here.
    storage = []
    while True:
        starts = create_starts()
        # boolean / logical array
        is_larger = starts[:, -1] >= min_hours
        # Use Numpy boolean indexing instead to find the rows
        # fitting your condition
        good_rows = starts[is_larger, :]
        # can also be an empty array if none found, but will
        # be skipped later
        storage.append(good_rows)
        # count what is in storage so far; empty arrays will be skipped
        # due to shape (0, x)
        number_of_good_rows = sum([_a.shape[0] for _a in storage])
        print('number_of_good_rows', number_of_good_rows)
        if number_of_good_rows >= num_rows:
            starts = np.vstack(storage)
            print(starts)
            break

    # Only keep columns up to the column where each value > min_hours.
    # also use logical indexing here
    is_something = np.logical_not(np.all(starts > min_hours, axis=0))
    return starts[:, is_something]
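For reference, a quick sketch (assuming numpy is imported as np) showing that the rewritten function is called exactly like the original:
# Same call as in the question; dist receives either an int or a shape tuple,
# both of which np.random.normal accepts as its size argument.
result = new_array(5, lambda size: np.random.normal(3, size=size), 7)
print(result.shape)   # at least 5 rows; columns trimmed by the final mask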