Need help unpacking a nested array into a pandas dataframe - pandas

I am running some code that generates an array of shape (18433, 17, N), where N varies from 600 to 885. I need to unpack that into a pandas DataFrame with 17 columns and rows containing data for 18433 entities, each with 600 to 885 time series entries. The code that generates the array is shown below. I am a relative Python newbie and have reached the extent of my skills. I tried unpacking with a for loop, but it takes forever. Are there any libraries or methods that are more efficient?
# Generate full monthly cash flow arrays
# define constant input parameters
eloss = 0
weight = 1.0
prod_wt = 1.0
inv_wt = 1.0
stx_oil = 0.0795
stx_gas = 0.0795
stx_ngl = 0.0795
adval = 0
aban = 150000
# Create function for slicing the volume array and calculating the monthly cash flow
def econ_ncf_iter(r):
    econ_ncf_iter = econ_cf(index = r, uid = prop_list.loc[r, 'PROPNUM'], wi = prop_list.loc[r, 'WI'],
                            nri = prop_list.loc[r, 'NRI'], roy = prop_list.loc[r, 'Royalty'], eloss = eloss,
                            weight = weight, prod_wt = prod_wt, inv_wt = inv_wt,
                            shrink = np.round(prop_list.loc[r, 'SHRINK'] / 100, 6),
                            btu = np.round(prop_list.loc[r, 'BTU'] / 1000, 6),
                            ngl_yield = np.round(prop_list.loc[r, 'NGL/GAS'], 6),
                            pri_oil = np.extract(oilprice[r][0] == prop_list.loc[r, 'PROPNUM'], oilprice[r][1]),
                            pri_gas = np.extract(gasprice[r][0] == prop_list.loc[r, 'PROPNUM'], gasprice[r][1]),
                            paj_oil = prop_list.loc[r, 'PAJ_OIL'],
                            paj_gas = np.extract(gasdiff[r][0] == prop_list.loc[r, 'PROPNUM'], gasdiff[r][1]),
                            paj_ngl = prop_list.loc[r, 'PAJ_NGL'], stx_oil = stx_oil, stx_gas = stx_gas, stx_ngl = stx_ngl,
                            adval = adval, opc_fix = np.round(prop_list.loc[r, 'OPC/T'], 2),
                            opc_oil = np.round(prop_list.loc[r, 'OIL_OPEX'], 2),
                            opc_gas = np.round(prop_list.loc[r, 'GAS_OPEX'], 2),
                            capex = np.round(prop_list.loc[r, 'CAPITAL'] * 1000, 2), aban = aban)
    return econ_ncf_iter
# generate net cash flow array
econ_ncf = lambda r: econ_ncf_iter(r)
vecon_ncf = np.vectorize(econ_ncf_iter, otypes = [object])
ncf_arr_packed = vecon_ncf(R)

I figured it out, and it was pretty easy:
ncf_pd_dflist = []
columns = ['UID', 'Month', 'Grs Oil', 'Grs Gas', 'Net Oil', 'Net Gas', 'Net NGL', 'Oil Revenue', 'Gas Revenue',
           'NGL Revenue', 'Total Revenue', 'Total Tax', 'OPEX', 'Operating Income', 'Cumulative Op CF', 'Net Cashflow',
           'Cumulative Net CF']
pbar = tqdm(total=len(R))  # total= so the progress bar knows how many iterations to expect
for r in R:
    # each packed entry is (17, n_months); transpose so months become rows and the 17 series become columns
    ncf_pd_dflist.append(pd.DataFrame(np.transpose(ncf_arr_packed[r])))
    pbar.update()
ncf_pd = pd.concat(ncf_pd_dflist)
ncf_pd.columns = columns
pbar.close()
Simple code to loop through the array and create a list of pandas DataFrames. After the loop finishes, I concatenate the list into a single DataFrame. This took about 5 seconds to complete.

Although you already figured out a solution, here's a general alternative without an explicit loop. It takes some simple steps:
1. If the desired horizontal axis (your middle axis) is not the last, swap them.
2. Reshape to a 2D array of the horizontal rows.
3. Make the DataFrame with a MultiIndex from the cartesian product of the other axes.
Assuming the array is arr:
x, y, z = arr.shape
df = pd.DataFrame(arr.swapaxes(1, 2).reshape(x*z, -1),
                  pd.MultiIndex.from_product([np.arange(x), np.arange(z)]))
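To make the axis bookkeeping concrete, here is a minimal self-contained sketch on a small rectangular array (the sizes and the index names 'entity'/'month' are invented for illustration; the approach assumes a regular array, so a ragged array like the one in the question would need padding or per-entity handling first):

import numpy as np
import pandas as pd

# toy array: 2 entities, 3 series ("columns"), 4 time steps
arr = np.arange(2 * 3 * 4).reshape(2, 3, 4)

x, y, z = arr.shape
df = pd.DataFrame(arr.swapaxes(1, 2).reshape(x * z, -1),
                  index=pd.MultiIndex.from_product([np.arange(x), np.arange(z)],
                                                   names=['entity', 'month']))
print(df.shape)    # (8, 3): one row per (entity, month), one column per middle-axis entry
print(df.head())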

Related

Remove the requirement to loop through numpy array

Overview
The code below contains a numpy array clusters whose values are compared against each row of a pandas DataFrame using np.where. The SoFunc function takes a value from the clusters array as input and returns the rows where all conditions are True.
Question
I can loop through this array to compare each array element against the respective np.where conditions. How do I remove the requirement to loop but still get the same output?
I appreciate that looping through numpy arrays is inefficient and want to improve this code. The actual dataset will be much larger.
Prepare the reproducible mock data
def genMockDataFrame(days,startPrice,colName,startDate,seed=None):
    periods = days*24
    np.random.seed(seed)
    steps = np.random.normal(loc=0, scale=0.0018, size=periods)
    steps[0]=0
    P = startPrice+np.cumsum(steps)
    P = [round(i,4) for i in P]
    fxDF = pd.DataFrame({
        'ticker':np.repeat( [colName], periods ),
        'date':np.tile( pd.date_range(startDate, periods=periods, freq='H'), 1 ),
        'price':(P)})
    fxDF.index = pd.to_datetime(fxDF.date)
    fxDF = fxDF.price.resample('D').ohlc()
    fxDF.columns = [i.title() for i in fxDF.columns]
    return fxDF
def SoFunc(clust):
    #generate mock data
    df = genMockDataFrame(10,1.1904,'eurusd','19/3/2020',seed=157)
    df["Upper_Band"] = 1.1928
    df.loc["2020-03-27", "Upper_Band"] = 1.2118
    df.loc["2020-03-26", "Upper_Band"] = 1.2200
    df["Level"] = np.where((df["High"] >= clust)
                           & (df["Low"] <= clust)
                           & (df["High"] >= df["Upper_Band"] ),1,np.NaN
                           )
    return df.dropna()
Loop through the clusters array
clusters = np.array([1.1929 , 1.2118 ])
l = []
for i in range(len(clusters)):
    l.append(SoFunc(clusters[i]))
pd.concat(l)
Output
              Open    High     Low   Close  Upper_Band  Level
date
2020-03-19  1.1904  1.1937  1.1832  1.1832      1.1928    1.0
2020-03-25  1.1939  1.1939  1.1864  1.1936      1.1928    1.0
2020-03-27  1.2118  1.2144  1.2039  1.2089      1.2118    1.0
(Edited based on #tdy's comment below)
pandas.merge allows you to make len(clusters) copies of your dataframe and then pare it down according to the conditions in your SoFunc function.
The cross merge creates a dataframe with a copy of df for each record in clusters_df. The overall result ought to be faster for large dataframes than the loop-based approach, provided you have enough memory to temporarily accommodate the merged dataframe (if not, the operation may spill over into swap and slow down drastically).
import numpy as np
import pandas as pd
def genMockDataFrame(days,startPrice,colName,startDate,seed=None):
    ''' identical to the example provided '''
    periods = days*24
    np.random.seed(seed)
    steps = np.random.normal(loc=0, scale=0.0018, size=periods)
    steps[0]=0
    P = startPrice+np.cumsum(steps)
    P = [round(i,4) for i in P]
    fxDF = pd.DataFrame({
        'ticker':np.repeat( [colName], periods ),
        'date':np.tile( pd.date_range(startDate, periods=periods, freq='H'), 1 ),
        'price':(P)})
    fxDF.index = pd.to_datetime(fxDF.date)
    fxDF = fxDF.price.resample('D').ohlc()
    fxDF.columns = [i.title() for i in fxDF.columns]
    return fxDF
# create the base dataframe according to the former SoFunc
df = genMockDataFrame(10,1.1904,'eurusd','19/3/2020',seed=157)
df["Upper_Band"] = 1.1928
df.loc["2020-03-27", "Upper_Band"] = 1.2118  # single .loc assignment; chained indexing may not modify df
df.loc["2020-03-26", "Upper_Band"] = 1.2200
# create a df out of the cluster array
clusters = np.array([1.1929 , 1.2118 ])
clusters_df = pd.DataFrame({"clust": clusters})
# perform the merge, then filter and finally clean up
result_df = (
    pd
    .merge(df.reset_index(), clusters_df, how="cross")  # for each entry in cluster, make a copy of df
    .loc[lambda z: (z.Low <= z.clust) & (z.High >= z.clust) & (z.High >= z.Upper_Band), :]  # filter the copies down
    .drop(columns=["clust"])  # not needed in result
    .assign(Level=1.0)        # to match your result; not really needed
    .set_index("date")        # bring back the old index
)
print(result_df)
I recommend inspecting just the result of pd.merge(df.reset_index(), clusters_df, how="cross") to see how it works.
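For intuition, here is a tiny self-contained illustration of what how="cross" does (the toy frames below are invented purely for demonstration):

import pandas as pd

left = pd.DataFrame({"day": ["Mon", "Tue"], "High": [1.20, 1.25]})
clusters_df = pd.DataFrame({"clust": [1.19, 1.21]})

# every row of `left` is paired with every row of `clusters_df`: 2 x 2 = 4 rows
print(pd.merge(left, clusters_df, how="cross"))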

Optimization Python

I am trying to get the optimal solution.

Input data:

D_name  Vial_size1  Vial_size2  Vial_size3  cost  units_needed
Act            120         400           0    $5           738
dug             80         200         400   $40           262

Data in Excel:

Vials        Price  size
Vial size 1      5   120
Vial size 2      5   400
from pulp import *
import pandas as pd

prob = LpProblem("Dose_Vial", LpMinimize)
df = pd.read_excel(r'C:\Users\*****\Desktop\Vial.xls')
print(df)
# Create a list of the Vial_Size
Vial_Size = list(df['Vials'])
# Create a dictionary of units for all Vial_Size
size = dict(zip(Vial_Size, df['size']))
# Create a dictionary of price for all Vial_Size
Price = dict(zip(Vial_Size, df['Price']))
# print dictionaries
print(Vial_Size)
print(size)
print(Price)
vial_vars = LpVariable.dicts("Vials", size, lowBound=0, cat='Integer')
# start building the LP problem by adding the main objective function
prob += lpSum([Price[i] * vial_vars[i] * size[i] for i in size])
# adding constraints
prob += lpSum([size[f] * vial_vars[f] for f in size]) >= 738
# The status of the solution is printed to the screen
prob.solve()
print("Status:", LpStatus[prob.status])
# In case the problem is ill-formulated or there is not sufficient information,
# the solution may be infeasible or unbounded
for v in prob.variables():
    if v.varValue > 0:
        print(v.name, "=", format(round(v.varValue)))
obj = round((value(prob.objective)))
print("The total cost of optimized vials: ${}".format(round(obj)))

This prints:

Vials_Vial_Size_1 = 3
Vials_Vial_Size_2 = 1
The total cost of optimized vials: $3800
How do I set this up for 2 or more drugs and get the optimal solution?
Here is an approach to solve the first part of the question, finding vial combinations that minimize the waste (I'm not sure what role the price plays):
from pulp import *
import pandas as pd
import csv

drugs_dict = {"D_name": ['Act', 'dug'],
              "Vial_size1": [120, 80],
              "Vial_size2": [400, 200],
              "Vial_size3": [0, 400],
              "cost": [5, 40],
              "units_needed": [738, 262]}
df = pd.DataFrame(drugs_dict)

drugs = list(df['D_name'])
vial_1_size = dict(zip(drugs, drugs_dict["Vial_size1"]))
vial_2_size = dict(zip(drugs, drugs_dict["Vial_size2"]))
vial_3_size = dict(zip(drugs, drugs_dict["Vial_size3"]))
units_needed = dict(zip(drugs, drugs_dict["units_needed"]))

results = []
for drug in drugs:
    print(f"drug = {drug}")
    # setup minimum waste problem
    prob = LpProblem("Minimum Waste Problem", LpMinimize)
    # create decision variables
    vial_1_var = LpVariable("Vial_1", lowBound=0, cat='Integer')
    vial_2_var = LpVariable("Vial_2", lowBound=0, cat='Integer')
    vial_3_var = LpVariable("Vial_3", lowBound=0, cat='Integer')
    units = lpSum([vial_1_size[drug] * vial_1_var +
                   vial_2_size[drug] * vial_2_var +
                   vial_3_size[drug] * vial_3_var])
    # objective function
    prob += units
    # constraints
    prob += units >= units_needed[drug]
    prob.solve()
    print(f"units = {units.value()}")
    for v in prob.variables():
        if v.varValue > 0:
            print(v.name, "=", v.varValue)
    results.append([drug, units.value(), int(vial_1_var.value() or 0), int(vial_2_var.value() or 0), int(vial_3_var.value() or 0)])

with open('vial_results.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['drug', 'units', 'vial_1', 'vial_2', 'vial_3'])
    csv_writer.writerows(results)
Running gives:
drug = Act
units = 760.0
Vial_1 = 3.0
Vial_2 = 1.0
drug = dug
units = 280.0
Vial_1 = 1.0
Vial_2 = 1.0
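If you prefer a single model over the per-drug loop, here is a minimal sketch (my own variation, not part of the original answer) that reuses the dictionaries defined above and creates one integer variable per (drug, vial size) pair. Because the drugs share no constraints, it finds the same per-drug combinations as the loop:

from pulp import LpProblem, LpMinimize, LpVariable, lpSum

# assumes drugs, vial_1_size, vial_2_size, vial_3_size and units_needed from the code above
vial_sizes = {"Vial_1": vial_1_size, "Vial_2": vial_2_size, "Vial_3": vial_3_size}

prob = LpProblem("Minimum_Waste_All_Drugs", LpMinimize)

# one integer decision variable per (drug, vial size) pair
counts = LpVariable.dicts("n",
                          [(d, v) for d in drugs for v in vial_sizes],
                          lowBound=0, cat="Integer")

# objective: total units dispensed across all drugs
# (minimizing this minimizes waste, since units_needed is a constant)
prob += lpSum(vial_sizes[v][d] * counts[(d, v)] for d in drugs for v in vial_sizes)

# each drug must receive at least the units it needs
for d in drugs:
    prob += lpSum(vial_sizes[v][d] * counts[(d, v)] for v in vial_sizes) >= units_needed[d]

prob.solve()
for d in drugs:
    print(d, [(v, counts[(d, v)].value()) for v in vial_sizes])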

Is non-identical not enough to be considered 'distinct' for kmeans centroids?

I have an issue with providing centroids to kmeans clustering. I saw the same problem asked already (K-means: Initial centers are not distinct), but the solution in that post is not working in my case.
I selected the centroids using ClusterR::KMeans_arma. I confirmed that my centroids are not identical using mgcv::uniquecombs, but I still got the "initial centers are not distinct" error.
> dim(t(dat))
[1] 13540 11553
> centroids = ClusterR::KMeans_arma(data = t(dat), centers = 561,
n_iter = 50, seed_mode = "random_subset",
verbose = FALSE, CENTROIDS = NULL)
> dim(centroids)
[1] 561 11553
> x = mgcv::uniquecombs(centroids)
> dim(x)
[1] 561 11553
> res = kmeans(t(dat), centers = centroids, iter.max = 200)
Error in kmeans(t(dat), centers = centroids, iter.max = 200) :
initial centers are not distinct
Any suggestion to resolve this? Thanks!
I replicated the issue you've mentioned with the following data:
cols = 13540
rows = 11553
set.seed(1)
vec_dat = runif(rows * cols)
dat = matrix(vec_dat, nrow = rows, ncol = cols)
dim(dat)
dat = t(dat)
dim(dat)
There is no 'centers' parameter in the 'ClusterR::KMeans_arma()' function, so I've assumed you actually mean 'clusters':
centroids = ClusterR::KMeans_arma(data = dat,
clusters = 561,
n_iter = 50,
seed_mode = "random_subset",
verbose = TRUE,
CENTROIDS = NULL)
str(centroids)
dim(centroids)
The 'centroids' object is a matrix of class "k-means clustering". If your intention is to obtain the cluster assignments, you can use:
clust = ClusterR::predict_KMeans(data = dat,
CENTROIDS = centroids,
threads = 6)
length(unique(clust)) # 561
class(centroids) # "k-means clustering"
If you want to pass the 'centroids' to the base R 'kmeans' function, you have to set the 'class' of the 'centroids' object to NULL. That is because the base R 'kmeans' function internally uses the base R 'duplicated()' function (you can see this by running print(kmeans) in the R console), which does not recognize the 'centroids' object as a matrix or data.frame (it is an object of class "k-means clustering") and performs the duplicate check column-wise rather than row-wise. Therefore, the following should work for your case:
class(centroids) = NULL
dups = duplicated(centroids)
sum(dups) # this should actually give 0
res = kmeans(dat, centers = centroids, iter.max = 200)
I've made a few adjustments to "ClusterR::predict_KMeans()"; in particular, I've added the "threads" parameter and a check for duplicates. Therefore, if you want to compute the clusters using multiple cores, you have to install the package from GitHub using:
remotes::install_github('mlampros/ClusterR',
upgrade = 'always',
dependencies = TRUE,
repos = 'https://cloud.r-project.org/')
The changes will take effect in the next version of the CRAN package, which will be "1.2.2".
UPDATE regarding output and performance (based on your comment):
data(dietary_survey_IBS, package = 'ClusterR')
kmeans_arma = function(data) {
  km_cl = ClusterR::KMeans_arma(data,
                                clusters = 2,
                                n_iter = 10,
                                seed_mode = "random_subset",
                                seed = 1)
  pred_cl = ClusterR::predict_KMeans(data = data,
                                     CENTROIDS = km_cl,
                                     threads = 1)
  return(pred_cl)
}
km_arma = kmeans_arma(data = dietary_survey_IBS)
km_algos = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")
for (algo in km_algos) {
  cat('base-kmeans-algo:', algo, '\n')
  km_base = kmeans(dietary_survey_IBS,
                   centers = 2,
                   iter.max = 10,
                   nstart = 1,            # can be set to 5 or 10 etc.
                   algorithm = algo)
  km_cl = as.vector(km_base$cluster)
  print(table(km_arma, km_cl))
  cat('--------------------------\n')
}
microbenchmark::microbenchmark(kmeans(dietary_survey_IBS,
                                      centers = 2,
                                      iter.max = 10,
                                      nstart = 1,          # can be set to 5 or 10 etc.
                                      algorithm = algo),
                               kmeans_arma(data = dietary_survey_IBS),
                               times = 100)
I don't see any significant difference in the output clusters between the base R 'kmeans' and the 'kmeans_arma' function for any of the available base R 'kmeans' algorithms (you can test this on your own data sets as well). I am not sure which algorithm the 'armadillo' library uses internally, and the base R 'kmeans' also has the 'nstart' parameter (consult the documentation for more info). Regarding performance, you won't see substantial differences for small to medium data sets, but because the armadillo library uses OpenMP internally, on a machine with more than one core the 'ClusterR::KMeans_arma' function should return the 'centroids' faster for big data sets.

avoiding data leakage with timed data and cross validation

I'm using the Kobe Bryant Dataset.
I wish to predict the shot_made_flag with KnnRegressor.
I'm trying to avoid data leakage by grouping the data by season, year, and month.
season is a pre-existing column, and year and month are columns I've added like so:
kobe_data_encoded['year'] = kobe_data_encoded['game_date'].apply(lambda x: int(re.compile('(\d{4})').findall(x)[0]))
kobe_data_encoded['month'] = kobe_data_encoded['game_date'].apply(lambda x: int(re.compile('-(\d+)-').findall(x)[0]))
Here's the full code of my pre-processing code of the features:
import re
# drop unnecessary columns
kobe_data_encoded = kobe_data.drop(columns=['game_event_id', 'game_id', 'lat', 'lon', 'team_id', 'team_name', 'matchup', 'shot_id'])
# use HotEncoding for action_type, combined_shot_type, shot_zone_area, shot_zone_basic, opponent
kobe_data_encoded = pd.get_dummies(kobe_data_encoded, prefix_sep="_", columns=['action_type'])
kobe_data_encoded = pd.get_dummies(kobe_data_encoded, prefix_sep="_", columns=['combined_shot_type'])
kobe_data_encoded = pd.get_dummies(kobe_data_encoded, prefix_sep="_", columns=['shot_zone_area'])
kobe_data_encoded = pd.get_dummies(kobe_data_encoded, prefix_sep="_", columns=['shot_zone_basic'])
kobe_data_encoded = pd.get_dummies(kobe_data_encoded, prefix_sep="_", columns=['opponent'])
# convert season to years
kobe_data_encoded['season'] = kobe_data_encoded['season'].apply(lambda x: int(re.compile('(\d+)-').findall(x)[0]))
# convert shot_type to numeric representation
kobe_data_encoded['shot_type'] = kobe_data_encoded['shot_type'].apply(lambda x: int(re.compile('(\d)PT').findall(x)[0]))
# add year and month using game_date
kobe_data_encoded['year'] = kobe_data_encoded['game_date'].apply(lambda x: int(re.compile('(\d{4})').findall(x)[0]))
kobe_data_encoded['month'] = kobe_data_encoded['game_date'].apply(lambda x: int(re.compile('-(\d+)-').findall(x)[0]))
kobe_data_encoded = kobe_data_encoded.drop(columns=['game_date'])
# convert shot_zone_range to numeric representation
kobe_data_encoded.loc[kobe_data_encoded['shot_zone_range'] == 'Back Court Shot', 'shot_zone_range'] = 4
kobe_data_encoded.loc[kobe_data_encoded['shot_zone_range'] == '24+ ft.', 'shot_zone_range'] = 3
kobe_data_encoded.loc[kobe_data_encoded['shot_zone_range'] == '16-24 ft.', 'shot_zone_range'] = 2
kobe_data_encoded.loc[kobe_data_encoded['shot_zone_range'] == '8-16 ft.', 'shot_zone_range'] = 1
kobe_data_encoded.loc[kobe_data_encoded['shot_zone_range'] == 'Less Than 8 ft.', 'shot_zone_range'] = 0
# transform game_date to date time object
# kobe_data_encoded['game_date'] = pd.to_numeric(kobe_data_encoded['game_date'].str.replace('-',''))
kobe_data_encoded.head()
Then I've scaled the data using MinMaxScaler:
# scaling
min_max_scaler = preprocessing.MinMaxScaler()
scaled_features_df = kobe_data_encoded.copy()
column_names = ['loc_x', 'loc_y', 'minutes_remaining', 'period',
'seconds_remaining', 'shot_distance', 'shot_type', 'shot_zone_range']
scaled_features = min_max_scaler.fit_transform(scaled_features_df[column_names])
scaled_features_df[column_names] = scaled_features
And grouped by the season, year, and month like stated above:
seasons_date = scaled_features_df.groupby(['season', 'year', 'month'])
I've been tasked with using KFold to find the best K using roc_auc score.
Here's my implementation:
neighbors = [x for x in range(1,50) if x % 2 != 0]
cv_scores = []
for k in neighbors:
    print('k: ', k)
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    scores = []
    accumelated_X = pd.DataFrame()
    accumelated_y = pd.Series()
    for group_name, group in seasons_date:
        print(group_name)
        group = group.drop(columns=['season', 'year', 'month'])
        not_classified_df = group[group['shot_made_flag'].isnull()]
        classified_df = group[group['shot_made_flag'].notnull()]
        X = classified_df.drop(columns=['shot_made_flag'])
        y = classified_df['shot_made_flag']
        accumelated_X = pd.concat([accumelated_X, X])
        accumelated_y = pd.concat([accumelated_y, y])
        cv = StratifiedKFold(n_splits=10, shuffle=True)
        scores.append(cross_val_score(knn, accumelated_X, accumelated_y, cv=cv, scoring='roc_auc'))
    # scores is a list of per-group score arrays, so use np.mean instead of list.mean()
    cv_scores.append(np.mean(scores))
#graphical view
#misclassification error
MSE = [1-x for x in cv_scores]
#optimal K
optimal_k_index = MSE.index(min(MSE))
optimal_k = neighbors[optimal_k_index]
print(optimal_k)
# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()
I'm not sure if I'm dealing with the data leakage correctly in this situation, because if I accumulate the previous seasons' data and then pass it to cross_val_score, I might still end up with data leakage: the CV can split the data so that newer season data is fitted on and earlier season data is tested on. Am I right here?
If so, I would like to know how to approach this situation, where I want to use K-Fold to find the best k on this timed data without data leakage.
Is it even sensible to use K-Fold to split the data, rather than splitting by game date, to avoid data leakage?
In short, since what you want to do sounds like a time series problem, you cannot use standard k-fold cross-validation: you would be using data from the future to predict the past, which is not allowed.
A good approach can be found here: https://stats.stackexchange.com/questions/14099/using-k-fold-cross-validation-for-time-series-model-selection
fold 1 : training [1], test [2]
fold 2 : training [1 2], test [3]
fold 3 : training [1 2 3], test [4]
fold 4 : training [1 2 3 4], test [5]
fold 5 : training [1 2 3 4 5], test [6]
where the numbers are in chronological order of your data (a runnable sketch using scikit-learn's TimeSeriesSplit is shown below).
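As a concrete illustration of that expanding-window scheme, here is a minimal sketch using scikit-learn's TimeSeriesSplit. It assumes the rows of X and y are already sorted chronologically; the toy data and the choice of k are invented purely for illustration:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# stand-ins for the real feature matrix and shot_made_flag, sorted oldest first
rng = np.random.default_rng(0)
X = rng.normal(size=(600, 5))
y = rng.integers(0, 2, size=600)

# each fold trains on all earlier blocks and tests on the next one,
# matching the fold listing above
tscv = TimeSeriesSplit(n_splits=5)
knn = KNeighborsClassifier(n_neighbors=11)

scores = cross_val_score(knn, X, y, cv=tscv, scoring='roc_auc')
print(scores, scores.mean())

You could then repeat this for each candidate k and pick the one with the highest mean AUC, instead of relying on a shuffled StratifiedKFold.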

Odd-size numpy arrays send/receive

I would like to gather numpy array contents from all processors to one. In case all arrays are of the same size, it works. However I don't see a natural way of doing the same task for arrays of proc-dependent size. Please consider the following code:
from mpi4py import MPI
import numpy
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
if rank >= size/2:
    nb_elts = 5
else:
    nb_elts = 2
# create data
lst = []
for i in xrange(nb_elts):
    lst.append(rank*3+i)
array_lst = numpy.array(lst, dtype=int)
# communicate array
result = []
if rank == 0:
    result = array_lst
    for p in xrange(1, size):
        received = numpy.empty(nb_elts, dtype=numpy.int)
        comm.Recv(received, p, tag=13)
        result = numpy.concatenate([result, received])
else:
    comm.Send(array_lst, 0, tag=13)
My problem is at the "received" allocation. How can I know what is the size to be allocated? Do I have to first send/receive each array size?
Based on a suggestion below, I'll go with
data_array = numpy.ones(rank + 3, dtype=int)
data_array *= rank + 5
print '[{}] data: {} ({})'.format(rank, data_array, type(data_array))
# make all processors aware of data array sizes
all_sizes = {rank: data_array.size}
gathered_all_sizes = comm_py.allgather(all_sizes)
for d in gathered_all_sizes:
    all_sizes.update(d)
# prepare Gatherv as described by #francis
nbsum = 0
sendcounts = []
displacements = []
for p in xrange(size):
    n = all_sizes[p]
    displacements.append(nbsum)
    sendcounts.append(n)
    nbsum += n
if rank == 0:
    result = numpy.empty(nbsum, dtype=numpy.int)
else:
    result = None
comm_py.Gatherv(data_array, [result, tuple(sendcounts), tuple(displacements), MPI.INT64_T], root=0)
print '[{}] gathered data: {}'.format(rank, result)
In the code you pasted, both Send() and Recv() handle nb_elts elements. The problem is that nb_elts is not the same for every process... Hence, the number of items received does not match the number of elements that were sent, and the program complains:
mpi4py.MPI.Exception: MPI_ERR_TRUNCATE: message truncated
To prevent that, the root process must compute the number of items that the other processes have sent. Hence, in the loop for p in xrange(1, size), nb_elts must be computed according to p, not rank.
The following code, based on yours, has been corrected. I would add that the natural way to perform this gathering operation is to use Gatherv(). See http://materials.jeremybejarano.com/MPIwithPython/collectiveCom.html and the mpi4py documentation, for instance. I added the corresponding sample code. The only tricky point is that numpy.int is 64 bits long, so the Gatherv() call below uses an 8-byte MPI datatype (MPI_DOUBLE here; MPI.INT64_T, as in your snippet above, is the more precise match).
from mpi4py import MPI
import numpy
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
if rank >= size/2:
    nb_elts = 5
else:
    nb_elts = 2
# create data
lst = []
for i in xrange(nb_elts):
    lst.append(rank*3+i)
array_lst = numpy.array(lst, dtype=int)
# communicate array
result = []
if rank == 0:
    result = array_lst
    for p in xrange(1, size):
        if p >= size/2:
            nb_elts = 5
        else:
            nb_elts = 2
        received = numpy.empty(nb_elts, dtype=numpy.int)
        comm.Recv(received, p, tag=13)
        result = numpy.concatenate([result, received])
else:
    comm.Send(array_lst, 0, tag=13)
if rank == 0:
    print "Send Recv, result= " + str(result)
# How to use Gatherv:
nbsum = 0
sendcounts = []
displacements = []
for p in xrange(0, size):
    displacements.append(nbsum)
    if p >= size/2:
        nbsum += 5
        sendcounts.append(5)
    else:
        nbsum += 2
        sendcounts.append(2)
if rank == 0:
    print "nbsum " + str(nbsum)
    print "sendcounts " + str(tuple(sendcounts))
    print "displacements " + str(tuple(displacements))
print "rank " + str(rank) + " array_lst " + str(array_lst)
print "numpy.int " + str(numpy.dtype(numpy.int)) + " " + str(numpy.dtype(numpy.int).itemsize) + " " + str(numpy.dtype(numpy.int).name)
if rank == 0:
    result2 = numpy.empty(nbsum, dtype=numpy.int)
else:
    result2 = None
comm.Gatherv(array_lst, [result2, tuple(sendcounts), tuple(displacements), MPI.DOUBLE], root=0)
if rank == 0:
    print "Gatherv, result2= " + str(result2)