scatter matrix column wise - numpy

I am trying to scatter matrix of size N x N column wise to different processes. Its expected that N % number_of_processes = 0. Input matrix:
A = [[1,2,3,4],
[5,6,7,8],
[9,10,11,12],
[13,14,15,16]]
When I run this with 2 processes, process P0 should receive columns 1-2: [[1,5,9,13], [2,6,10,14]], P1 should receive columns 3-4 [[3,7,11,15], [4,8,12,16]]. I transposed the matrix so each row of the new matrix is consists of columns, but when scattering still receiving rows in the order of original matrix.
from mpi4py import MPI
import numpy as np
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
N = 4
#K = 10
if rank == 0:
A = np.random.random((N,N))/N*2
vector = np.random.random(N)
print("Rank: ", rank)
print("A: ", A)
print("Vector: ", vector)
else:
A=np.empty((N,N), dtype='float64')
vector = np.empty(N, dtype='float64')
# distributing vector to all processes
#comm.Bcast(vector, root = 0)
matrix_columns = np.empty((A.shape[0],A.shape[0]), dtype='float64')
# get columns from matrix
if rank == 0:
matrix_columns = np.transpose(A)
else:
matrix_columns = np.empty((A.shape[0],A.shape[0]), dtype='float64')
if rank == 0:
print("Columns >>>")
print(matrix_columns)
# distribute columns to all processes
received_columns = np.empty((matrix_columns.shape[0]//size,matrix_columns.shape[0]), dtype='float64')
comm.Scatter(matrix_columns, received_columns, root = 0)
print("My rank: ", rank, "received columns: ", received_columns)

Related

Randomly select items from two equally sized tensors

Assume that we have two equally sized tensors of size batch_size * 1. For each index in the batch dimension we want to choose randomly between the two tensors. My solution was to create an indices tensor that contains random 0 or 1 indices of size batch_size and use those to index_select from the concatenation of the two tensors. However, to do so I had the "view" that cat tensor and the solution ended up to be quite "ugly":
import torch
bs = 8
a = torch.zeros(bs, 1)
print("a size", a.size())
b = torch.ones(bs, 1)
c = torch.cat([a, b], dim=-1)
print(c)
print("c size", c.size())
# create bs number of random 0 and 1's
indices = torch.randint(0, 2, [bs])
print("idxs size", indices.size())
print("idxs", indices)
# use `indices` to slice the `cat`ted tensor
d = c.view(1, -1).index_select(-1, indices).view(-1, 1)
print("d size", d.size())
print(d)
I am wondering whether there is a prettier and, more importantly, more efficient solution.
Posting two answers that I got over at the PyTorch forums
import torch
bs = 8
a = torch.zeros(bs, 1)
b = torch.ones(bs, 1)
c = torch.cat([a, b], dim=-1)
choices_flat = c.view(-1)
# index = torch.randint(choices_flat.numel(), (bs,))
# or if replace = False
index = torch.randperm(choices_flat.numel())[:bs]
select = choices_flat[index]
print(select)
import torch
bs = 8
a = torch.zeros(bs, 1)
print("a size", a.size())
b = torch.ones(bs, 1)
idx = torch.randint(2 * bs, (bs,))
d = torch.cat([a, b])[idx] # [bs, 1]

Stratify batch in Tensorflow 2

I have minibatches that I get from an sqlite database with data of integer and float type, x, and a binary label in 0 and 1, y. I am looking for something like X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(y, x, test_size=0.1, random_state=1, stratify=True) from scikit-learn, where a keyword could stratify the data (i.e. the same number of class-0 and class-1 instances).
In Tensorflow 2, stratification seems not straightforwardly possible. My very complicated solution works for me, but takes a lot of time because of all the reshaping and transposing:
def stratify(x, y):
# number of positive instances (the smaller class)
pos = np.sum(y).item() # how many positive bonds there are
x = np.transpose(x)
# number of features
f = np.shape(x)[1]
# filter only class 1
y = tf.transpose(y)
x_pos = tf.boolean_mask(x,
y_pos = tf.boolean_mask(y, y)
# filter only class 1
x_neg = tf.boolean_mask(x, tf.bitwise.invert(y)-254)
x_neg = tf.reshape(x_neg, [f,-1])
y_neg = tf.boolean_mask(y, tf.bitwise.invert(y)-254)
# just take randomy as many class-0 as there are class-1
x_neg = tf.transpose(tf.random.shuffle(tf.transpose(x_neg)))
x_neg = x_neg[:,0:pos]
y_neg = y_neg[0:pos]
# concat the class-1 and class-0 together, then shuffle, and concat back together
x = tf.concat([x_pos,tf.transpose(x_neg)],0)
y = tf.concat([y_pos, tf.transpose(y_neg)],0)
xy = tf.concat([tf.transpose(x), tf.cast(np.reshape(y,[1, -1]), tf.float64)],0)
xy = tf.transpose((tf.random.shuffle(tf.transpose(xy)))) # because there is no axis arg in shuffle
x = xy[0:f,:]
x = tf.transpose(x)
y = xy[f,:]
return x, y
I am happy to see some feedback/improvement on my own function or novel, easier ideas.
Data division is best if it is done in raw format only or before you transform it into tensors. If there is a strong requirement to do it in TensorFlow only, then I will suggest you to make use of tf.data.Dataset class. I have added the demo code with relevant comments explaining the steps.
import tensorflow as tf
import numpy as np
TEST_SIZE = 0.1
DATA_SIZE = 1000
# Create data
X_data = np.random.rand(DATA_SIZE, 28, 28, 1)
y_data = np.random.randint(0, 2, [DATA_SIZE])
samples1 = np.sum(y_data)
print('Percentage of 1 = ', samples1 / len(y_data))
# Create TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((X_data, y_data))
# Gather data with 0 and 1 labels separately
class0_dataset = dataset.filter(lambda x, y: y == 0)
class1_dataset = dataset.filter(lambda x, y: y == 1)
# Shuffle them
class0_dataset = class0_dataset.shuffle(DATA_SIZE)
class1_dataset = class1_dataset.shuffle(DATA_SIZE)
# Split them
class0_test_samples_len = int((DATA_SIZE - samples1) * TEST_SIZE)
class0_test = class0_dataset.take(class0_test_samples_len)
class0_train = class0_dataset.skip(class0_test_samples_len)
class1_test_samples_len = int(samples1 * TEST_SIZE)
class1_test = class1_dataset.take(class1_test_samples_len)
class1_train = class1_dataset.skip(class1_test_samples_len)
print('Train Class 0 = ', len(list(class0_train)), ' Class 1 = ', len(list(class1_train)))
print('Test Class 0 = ', len(list(class0_test)), ' Class 1 = ', len(list(class1_test)))
# Gather datasets
train_dataset = class0_train.concatenate(class1_train).shuffle(DATA_SIZE)
test_dataset = class0_test.concatenate(class1_test).shuffle(DATA_SIZE)
print('Train dataset size = ', len(list(train_dataset)))
print('Test dataset size = ', len(list(test_dataset)))
Sample output:
Percentage of 1 = 0.474
Train Class 0 = 474 Class 1 = 427
Test Class 0 = 52 Class 1 = 47
Train dataset size = 901
Test dataset size = 99

Tensorflow: sparse_tensor_dense_matmul slower than regular matmul

I have 2 scenarios:
scenario 1:
op: sparse_tensor_dense_matmul
A: 1000x1000 sparsity = 90%
B: 1000x1000 sparsity = 0%
scenario 2:
op: matmul
A: 1000x1000 sparsity = 0%
B: 1000x1000 sparsity = 0%
I understand that GPUs do not compute sparse matrix multiplication well but I would certainly expect them to perform it atleast as well as they perform non-sparse matrix mulipliation. In my code I get 10x slower for sparse_tensor_dense_matmul!
import tensorflow as tf
import numpy as np
import time
import itertools
rate = 0.1
N = 1000
itrs = 1000
num = int(rate * N * N)
combs = np.array(list(itertools.product(range(N), range(N))))
choices = range(len(combs))
_idxs = np.random.choice(a=choices, size=num, replace=False).tolist()
_idxs = combs[_idxs]
_idxs = _idxs.tolist()
_idxs = sorted(_idxs)
_vals = np.float32(np.random.rand(num))
_y = np.random.uniform(low=-1., high=1., size=(N, N))
_z = np.random.uniform(low=-1., high=1., size=(N, N))
################################################
x = tf.SparseTensor(indices=_idxs, values=_vals, dense_shape=(N, N))
y = tf.Variable(_y, dtype=tf.float32)
z = tf.Variable(_z, dtype=tf.float32)
sparse_dot = tf.sparse_tensor_dense_matmul(x, y)
dot = tf.matmul(z, y)
################################################
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
tf.local_variables_initializer().run()
start = time.time()
for i in range(itrs):
[_sparse_dot] = sess.run([sparse_dot], feed_dict={})
total = time.time() - start
print (total)
start = time.time()
for i in range(itrs):
[_dot] = sess.run([dot], feed_dict={})
total = time.time() - start
print (total)
################################################
25.357680797576904
2.7684502601623535

SOM kmean optimization ValueError: all the input arrays must have same number of dimensions

I am trying to merge kmeans into SOM finding the best match unit. During clustering points to return the numbers of clusters for each point I encounter this error
"ValueError: all the input arrays must have same number of dimensions"
in line 159
distances_from_center = np.concatenate((distances_from_center, [dist(teacher,nodes)]))
I am trying to optimize the SOM using the fast kmeans approach.
N = 8 # linear size of 2D map
M = 8
n_teacher = 10000 # # of teacher signal
np.random.seed(100)# test seed for random number
def main():
# initialize node vectors
nodes = np.random.rand(N,M,3)# node array. each node has 3-dim weight vector
#nodes = centers_initiation(n_teacher, 4)
#initial out put
#TODO; make out put function to simplify here
plt.imshow(nodes, interpolation='none')
plt.savefig("init.png")
""""""
""" Learning """
""""""
# teacher signal
teachers = np.random.rand(n_teacher,3)
for i in range(n_teacher):
train(nodes, teachers, i)
# intermediate out put
if i%200 ==0 or i< 100: #out put for i<100 or each 1000 iteration
plt.imshow(nodes, interpolation='none')
plt.savefig(str(i)+".png")
#output
plt.imshow(nodes, interpolation='none')
plt.savefig("final.png")
def train(nodes, teachers, i):
bmu = best_matching_unit(nodes, teachers[i])
#print bmu
for x in range(N):
for y in range(M):
c = np.array([x,y])# coordinate of unit
d = np.linalg.norm(c-bmu)
L = learning_ratio(i)
S = learning_radius(i,d)
for z in range(3): #TODO clear up using numpy function
nodes[x,y,z] += L*S*(teachers[i,z] - nodes[x,y,z])
def dist(x, y):
# euclidean distance
if len(x.shape) == 1:
d = np.sqrt(np.sum((x - y) ** 2))
else:
d = np.sqrt(np.sum((x - y) ** 2, axis=1))
return d
def centers_initiation(teacher, number_of_centers):
# initialization of clusters centers as most distant points. return cluster centers (point)
dist_per_point = np.empty((0, 0), int)
dist_for_point = 0
index_of_deleted_point = 0
for point in teacher:
for other_point in np.delete(teacher, index_of_deleted_point, axis=0):
dist_for_point += dist(point, other_point)
dist_per_point = np.append(dist_per_point, dist_for_point)
dist_for_point = 0
index_of_deleted_point += 1
ordered_points_by_min = np.array(
[key for key, value in sorted(enumerate(dist_per_point), key=lambda p: p[1], reverse=True)])
return teacher[ordered_points_by_min[0:number_of_centers]]
def get_cluster_number(teacher, nodes):
# clustering points. return numbers of clusters for each point
distances_from_centers = np.zeros((0, nodes.shape[0]), int)
for point in teacher:
distances_from_center = np.array([])
for center in nodes:
distances_from_center = np.concatenate((distances_from_center, [dist(teacher,nodes)]))
distances_from_centers = np.concatenate((distances_from_centers, [distances_from_center]), axis=0)
nearest_center_number = np.argmin(distances_from_centers, axis=1)
return nearest_center_number
def best_matching_unit(teacher, nodes):
clusters = get_cluster_number(teacher, nodes)
clusters_centers_shift = 1
new_centers = np.zeros(nodes.shape)
counter = 0
while np.sum(clusters_centers_shift) != 0:
counter += 1
for i in xrange(nodes.shape[0]):
new_centers[i] = np.mean(teacher[:][clusters == i], axis=0)
clusters_centers_shift = dist(new_centers, nodes)
clusters = get_cluster_number(teacher, new_centers)
nodes = np.copy(new_centers)
return clusters
def neighbourhood(t):#neighbourhood radious
halflife = float(n_teacher/4) #for testing
initial = float(N/2)
return initial*np.exp(-t/halflife)
def learning_ratio(t):
halflife = float(n_teacher/4) #for testing
initial = 0.1
return initial*np.exp(-t/halflife)
def learning_radius(t, d):
# d is distance from BMU
s = neighbourhood(t)
return np.exp(-d**2/(2*s**2))
main()

Odd-size numpy arrays send/receive

I would like to gather numpy array contents from all processors to one. In case all arrays are of the same size, it works. However I don't see a natural way of doing the same task for arrays of proc-dependent size. Please consider the following code:
from mpi4py import MPI
import numpy
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
if rank >= size/2:
nb_elts = 5
else:
nb_elts = 2
# create data
lst = []
for i in xrange(nb_elts):
lst.append(rank*3+i)
array_lst = numpy.array(lst, dtype=int)
# communicate array
result = []
if rank == 0:
result = array_lst
for p in xrange(1, size):
received = numpy.empty(nb_elts, dtype=numpy.int)
comm.Recv(received, p, tag=13)
result = numpy.concatenate([result, received])
else:
comm.Send(array_lst, 0, tag=13)
My problem is at the "received" allocation. How can I know what is the size to be allocated? Do I have to first send/receive each array size?
Based on a suggestion below, I'll go with
data_array = numpy.ones(rank + 3, dtype=int)
data_array *= rank + 5
print '[{}] data: {} ({})'.format(rank, data_array, type(data_array))
# make all processors aware of data array sizes
all_sizes = {rank: data_array.size}
gathered_all_sizes = comm_py.allgather(all_sizes)
for d in gathered_all_sizes:
all_sizes.update(d)
# prepare Gatherv as described by #francis
nbsum = 0
sendcounts = []
displacements = []
for p in xrange(size):
n = all_sizes[p]
displacements.append(nbsum)
sendcounts.append(n)
nbsum += n
if rank==0:
result = numpy.empty(nbsum, dtype=numpy.int)
else:
result = None
comm_py.Gatherv(data_array,[result, tuple(sendcounts), tuple(displacements), MPI.INT64_T], root=0)
print '[{}] gathered data: {}'.format(rank, result)
In the code you pasted, both Send() and Recv() sends nb_elts elements. The problem is that nb_elts is not the same for every processes... Hence, the number of item received does not match the number of elements that were sent and the program complains:
mpi4py.MPI.Exception: MPI_ERR_TRUNCATE: message truncated
To prevent that, the root process must compute the number of items that the other processes have sent. Hence, in the loop for p in xrange(1, size), nb_elts must be computed according to p, not rank.
The following code based on yours has been corrected. I would add that the natural way to perform this gathering operation is to use Gatherv(). See http://materials.jeremybejarano.com/MPIwithPython/collectiveCom.html and the documentation of mpi4py for instance. I added the corresponding sample code. The only tricky point is that numpy.int is 64bit long. Hence, the Gatherv() uses the MPI type MPI_DOUBLE.
from mpi4py import MPI
import numpy
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
if rank >= size/2:
nb_elts = 5
else:
nb_elts = 2
# create data
lst = []
for i in xrange(nb_elts):
lst.append(rank*3+i)
array_lst = numpy.array(lst, dtype=int)
# communicate array
result = []
if rank == 0:
result = array_lst
for p in xrange(1, size):
if p >= size/2:
nb_elts = 5
else:
nb_elts = 2
received = numpy.empty(nb_elts, dtype=numpy.int)
comm.Recv(received, p, tag=13)
result = numpy.concatenate([result, received])
else:
comm.Send(array_lst, 0, tag=13)
if rank==0:
print "Send Recv, result= "+str(result)
#How to use Gatherv:
nbsum=0
sendcounts=[]
displacements=[]
for p in xrange(0,size):
displacements.append(nbsum)
if p >= size/2:
nbsum+= 5
sendcounts.append(5)
else:
nbsum+= 2
sendcounts.append(2)
if rank==0:
print "nbsum "+str(nbsum)
print "sendcounts "+str(tuple(sendcounts))
print "displacements "+str(tuple(displacements))
print "rank "+str(rank)+" array_lst "+str(array_lst)
print "numpy.int "+str(numpy.dtype(numpy.int))+" "+str(numpy.dtype(numpy.int).itemsize)+" "+str(numpy.dtype(numpy.int).name)
if rank==0:
result2=numpy.empty(nbsum, dtype=numpy.int)
else:
result2=None
comm.Gatherv(array_lst,[result2,tuple(sendcounts),tuple(displacements),MPI.DOUBLE],root=0)
if rank==0:
print "Gatherv, result2= "+str(result2)