Fast binning of geographical data with negative values - scipy.stats

I'm trying to bin some geolocated data using scipy.stats.binned_statistic_2d, but it seems the data cannot contain any negative values. Is there a way to do this accurately and fast?
import numpy as np
from scipy import stats
ilats = np.linspace(90,-90, 4000)
ilons = np.linspace(-180, 179.955, 8000)
values = np.array([17,-14, -7,-8,-11,-8,-7,-8,-10,-5,-3,
-12,-5, -6,21,30, 2, 4,-8, 6, 4, 7,
3,-6,-13, 21, 4, 5,11,-6, 8,-5,-6,
9,8,-8, -2,-16,-5,-5,-9,-4,-6,33,
-8,-5,-14,-8,-11,21,24,-7,-13,12,-6,
5,7,8,-3,-3,-4, 4, 9,-3, 9,-11,
-8,6,4, 8,-6,-6,-4,-3, 4, 5,11,
-3,-6,-4,-8,-4,12,-9,-8,15,-10,-5,
-4,12,5,-4, 4, 7,-13, 5,-4,-4,-5,
-8,-10,-9,-7.])
lats = np.array([ 6.7427, 42.7027, 42.6963, 10.3688, 37.5713, 37.5798,
-12.1563, 42.7127, 41.7457, 37.8122, 37.66 , 41.7456,
41.7457, 38.4462, 8.5418, -12.7309, -10.9395, -10.9464,
38.0641, -10.9507, -12.7316, -10.9313, -12.7235, 37.6469,
38.1234, 20.3964, -12.0847, -12.0844, 10.3794, 38.1302,
10.3627, 38.1582, 38.1463, 22.6466, 20.4246, 38.1401,
-36.6505, 38.2352, 37.8795, 40.2281, 37.8125, 42.323 ,
37.8775, 9.3717, 38.732 , 38.7202, 38.2688, 38.9148,
38.9414, -4.8618, -4.8525, 39.0108, 38.8187, -6.5067,
38.009 , -6.5174, -6.5101, -6.51 , 37.7243, 37.7512,
37.7215, -6.4902, -6.5113, 37.5409, 1.9481, 37.6398,
-6.5073, 37.8037, -11.133 , 9.0896, 38.177 , 9.089 ,
37.8708, 38.3848, -3.553 , 9.4345, -3.5343, -3.5769,
37.6847, 37.6045, 37.8857, 38.32 , 8.1673, 37.8822,
37.9113, 8.6278, 37.5652, 37.8236, 37.8593, 8.6219,
-3.5614, 37.924 , 37.7845, 37.8436, 37.8666, 37.6804,
37.639 , 40.7691, 40.7744, 37.8029, 42.9793, 8.207 ,
39.302 ])
lons = np.array([ 60.8964, -96.1017, -96.1049, 71.595 , -97.0008, -97.0126,
57.4887, -96.109 , -95.1058, -97.1088, -96.6413, -95.1054,
-95.1062, -95.2395, 58.3938, -73.7145, -70.626 , -70.5864,
-95.5678, -70.5914, -73.7525, -70.6048, -73.753 , -96.7662,
-95.504 , 100.3965, -70.7921, -70.7905, 71.5499, -95.4816,
71.5457, -95.326 , -95.3355, 96.8339, 100.2684, -95.8697,
39.1031, -95.4456, -96.3814, -94.5726, -96.3782, -95.4554,
-96.3797, -66.7449, -95.1513, -95.1465, -95.0972, -95.2498,
-95.2054, 84.2004, 84.21 , -94.5695, -94.9174, 114.0945,
-95.942 , 114.0592, 114.0956, 114.0873, -96.4689, -96.4599,
-96.463 , 114.0741, 114.0975, -96.582 , 117.2901, -96.572 ,
114.0561, -96.5539, -74.9417, 71.3391, -95.4253, 71.2452,
-96.5511, -95.065 , 107.5832, 71.3906, 107.6005, 107.4975,
-96.9722, -96.9307, -96.2627, -95.1745, 72.5249, -96.2632,
-96.3324, 57.9562, -96.9309, -96.5123, -96.589 , 57.9627,
107.6405, -96.2711, -96.5737, -96.2344, -96.2099, -96.5062,
-96.5248, -94.8421, -94.8522, -96.5873, -97.1523, 72.4707,
-95.0489])
ret = stats.binned_statistic_2d(lons, lats, values, 'count', bins=[ilons, ilats])
I'm trying to examine the ungridded data in values by gridding it onto a coarse grid (ilats, ilons) and plotting the counts first, then the mean later on. But the call above produces:
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [52], in <cell line: 1>()
----> 1 ret = stats.binned_statistic_2d(lons, lats, ka, 'count', bins=[ilons, ilats])
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:352, in binned_statistic_2d(x, y, values, statistic, bins, range, expand_binnumbers)
349 xedges = yedges = np.asarray(bins, float)
350 bins = [xedges, yedges]
--> 352 medians, edges, binnumbers = binned_statistic_dd(
353 [x, y], values, statistic, bins, range,
354 expand_binnumbers=expand_binnumbers)
356 return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:571, in binned_statistic_dd(sample, values, statistic, bins, range, expand_binnumbers, binned_statistic_result)
569 if binned_statistic_result is None:
570 nbin, edges, dedges = _bin_edges(sample, bins, range)
--> 571 binnumbers = _bin_numbers(sample, nbin, edges, dedges)
572 else:
573 edges = binned_statistic_result.bin_edges
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:752, in _bin_numbers(sample, nbin, edges, dedges)
750 if dedges_min == 0:
751 raise ValueError('The smallest edge difference is numerically 0.')
--> 752     decimal = int(-np.log10(dedges_min)) + 6
753 # Find which points are on the rightmost edge.
754 on_edge = np.where((sample[:, i] >= edges[i][-1]) &
755 (np.around(sample[:, i], decimal) ==
756 np.around(edges[i][-1], decimal)))[0]
ValueError: cannot convert float NaN to integer
It looks like there is a log operation and I don't see a way around it.
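It may not be the negative data values at all: the traceback points at np.log10(dedges_min), where dedges_min is the smallest difference between consecutive bin edges. Since ilats runs from 90 down to -90, those differences are negative, their log10 is NaN, and the int() call then fails. Below is a minimal sketch of a possible workaround, assuming ascending bin edges are acceptable (lons, lats and values are the arrays defined above):
from scipy import stats
import numpy as np

ilats_asc = np.linspace(-90, 90, 4000)        # ascending latitude edges
ilons_asc = np.linspace(-180, 179.955, 8000)  # longitude edges are already ascending

ret = stats.binned_statistic_2d(lons, lats, values, 'count',
                                bins=[ilons_asc, ilats_asc])
counts = ret.statistic        # shape (len(ilons_asc) - 1, len(ilats_asc) - 1)
counts = counts[:, ::-1]      # flip the latitude axis if the original 90 -> -90 ordering is needed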

Optimization Python

I am trying to get the optimal solution
D_name  Vial_size1  Vial_size2  Vial_size3  cost  units_needed
Act     120         400         0           $5    738
dug     80          200         400         $40   262
Data in Excel:
Vials        price  size
Vial size 1  5      120
Vial size 2  5      400
from pulp import *
prob = LpProblem("Dose_Vial", LpMinimize)
import pandas as pd
df = pd.read_excel (r'C:\Users\*****\Desktop\Vial.xls')
print (df)
# Create a list of the Vial_Size
Vial_Size = list(df['Vials'])
# Create a dictionary of units for all Vial_Size
size = dict(zip(Vial_Size,df['size']))
# Create a dictionary of price for all Vial_Size
Price = dict(zip(Vial_Size,df['Price']))
# print dictionaries
print(Vial_Size)
print(size)
print(Price)
vial_vars = LpVariable.dicts("Vials",size,lowBound=0,cat='Integer')
# start building the LP problem by adding the main objective function
prob += lpSum([Price[i]*vial_vars[i]*size[i] for i in size])
# adding constraints
prob += lpSum([size[f] * vial_vars[f] for f in size]) >= 738
# The status of the solution is printed to the screen
prob.solve()
print("Status:", LpStatus[prob.status])
# In case the problem is ill-formulated or there is not sufficient information,
# the solution may be infeasible or unbounded
for v in prob.variables():
    if v.varValue > 0:
        print(v.name, "=", format(round(v.varValue)))
Vials_Vial_Size_1 = 3
Vials_Vial_Size_2 = 1
obj =round((value(prob.objective)))
print("The total cost of optimized vials: ${}".format(round(obj)))
The total cost of optimized vials: $3800
How do I set it up for 2 or more drugs and get the optimal solution?
Here is an approach to solve the first part of the question, finding vial combinations that minimize the waste (I'm not sure what role the price plays):
from pulp import *
import pandas as pd
import csv
drugs_dict = {"D_name": ['Act', 'dug'],
              "Vial_size1": [120, 80],
              "Vial_size2": [400, 200],
              "Vial_size3": [0, 400],
              "cost": [5, 40],
              "units_needed": [738, 262]}
df = pd.DataFrame(drugs_dict)
drugs = list(df['D_name'])
vial_1_size = dict(zip(drugs, drugs_dict["Vial_size1"]))
vial_2_size = dict(zip(drugs, drugs_dict["Vial_size2"]))
vial_3_size = dict(zip(drugs, drugs_dict["Vial_size3"]))
units_needed = dict(zip(drugs, drugs_dict["units_needed"]))
results = []
for drug in drugs:
    print(f"drug = {drug}")
    # setup minimum waste problem
    prob = LpProblem("Minimum Waste Problem", LpMinimize)
    # create decision variables
    vial_1_var = LpVariable("Vial_1", lowBound=0, cat='Integer')
    vial_2_var = LpVariable("Vial_2", lowBound=0, cat='Integer')
    vial_3_var = LpVariable("Vial_3", lowBound=0, cat='Integer')
    units = lpSum([vial_1_size[drug] * vial_1_var +
                   vial_2_size[drug] * vial_2_var +
                   vial_3_size[drug] * vial_3_var])
    # objective function
    prob += units
    # constraints
    prob += units >= units_needed[drug]
    prob.solve()
    print(f"units = {units.value()}")
    for v in prob.variables():
        if v.varValue > 0:
            print(v.name, "=", v.varValue)
    results.append([drug, units.value(), int(vial_1_var.value() or 0), int(vial_2_var.value() or 0), int(vial_3_var.value() or 0)])
with open('vial_results.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['drug', 'units', 'vial_1', 'vial_2', 'vial_3'])
    csv_writer.writerows(results)
Running gives:
drug = Act
units = 760.0
Vial_1 = 3.0
Vial_2 = 1.0
drug = dug
units = 280.0
Vial_1 = 1.0
Vial_2 = 1.0
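As a small, purely illustrative follow-up using the results list built above, the leftover units per drug can be printed directly:
for drug, units, v1, v2, v3 in results:
    # waste = units ordered minus units actually needed
    print(f"{drug}: {units:.0f} units in {v1 + v2 + v3} vials, waste = {units - units_needed[drug]:.0f}")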

Jupyter "TypeError: invalid type comparison"

Alright, I just started a new job and I have been tasked with writing a simple notebook in Jupyter. I really want to impress my supervisor and have been working on this code for hours, but I can't get it to work; hopefully somebody here can help me.
Here is the code I have been working on:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
df = pd.read_csv(r'C:\Users\jk2588\Documents\EDA\EDA Practice\top1000_dataset.csv', converters={'GENDER': lambda x: int(x == 'Male')}, usecols = ['MEMBER_ID', 'GENDER', 'Age', 'Dement'])
df_gp_1 = df[['MEMBER_ID', 'Dement']].groupby('MEMBER_ID').agg(np.mean).reset_index()
df_gp_2 = df[['MEMBER_ID', 'GENDER', 'Age']].groupby('MEMBER_ID').agg(max).reset_index()
df_gp = pd.merge(df_gp_1, df_gp_2, on = ['MEMBER_ID'])
df.head()
Output:
MEMBER_ID Age Dement GENDER
0 000000002 01 36 NaN 0
1 000000002 01 36 NaN 0
2 000000002 01 36 NaN 0
3 000000002 01 36 NaN 0
4 000000002 01 36 NaN 0
df['Dement'] = df['Dement'].fillna(0)
df['Dement'] = df['Dement'].astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
MEMBER_ID 999 non-null object
Age 999 non-null int64
Dement 999 non-null int64
GENDER 999 non-null int64
dtypes: int64(3), object(1)
memory usage: 31.3+ KB
freq = ((df_gp.Age.value_counts(normalize = True).reset_index().sort_values(by = 'index').Age)*100).tolist()
number_gp = 7
def ax_settings(ax, var_name, x_min, x_max):
    ax.set_xlim(x_min, x_max)
    ax.set_yticks([])
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_edgecolor('#444444')
    ax.spines['bottom'].set_linewidth(2)
    ax.text(0.02, 0.05, var_name, fontsize=17, fontweight="bold", transform=ax.transAxes)
    return None
fig = plt.figure(figsize=(12,7))
gs = gridspec.GridSpec(nrows=number_gp,
                       ncols=2,
                       figure=fig,
                       width_ratios=[3, 1],
                       height_ratios=[1]*number_gp,
                       wspace=0.2, hspace=0.05
                       )
ax = [None]*(number_gp + 1)
features = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
for i in range(number_gp):
    ax[i] = fig.add_subplot(gs[i, 0])
    ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
    if i < (number_gp - 1): ax[i].set_xticks([])
ax[0].legend(['Male', 'Female'], facecolor='w')
ax[number_gp] = fig.add_subplot(gs[:, 1])
ax[number_gp].spines['right'].set_visible(False)
ax[number_gp].spines['top'].set_visible(False)
ax[number_gp].barh(features, freq, color='#004c99', height=0.4)
ax[number_gp].set_xlim(0,100)
ax[number_gp].invert_yaxis()
ax[number_gp].text(1.09, -0.04, '(%)', fontsize=10, transform = ax[number_gp].transAxes)
ax[number_gp].tick_params(axis='y', labelsize = 14)
plt.show()
I am then met with:
C:\Users\jk2588\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py:1167: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
result = method(y)
--------------------------------------------------------------------------
TypeError Traceback (most recent call last
<ipython-input-38-8665030edb1c> in <module>()
24 ax[i] = fig.add_subplot(gs[i, 0])
25 ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
---> 26 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
27 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
28 if i < (number_gp - 1): ax[i].set_xticks([])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1281
1282 with np.errstate(all='ignore'):
-> 1283 res = na_op(values, other)
1284 if is_scalar(res):
1285 raise TypeError('Could not compare {typ} type with Series'
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in na_op(x, y)
1167 result = method(y)
1168 if result is NotImplemented:
-> 1169 raise TypeError("invalid type comparison")
1170 else:
1171 result = op(x, y)
TypeError: invalid type comparison
Please help, I have been faced with an absurd number of errors this week.
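One likely culprit (an assumption, since the dataset isn't available): the read_csv converter turns GENDER into the integers 0/1, and Age is int64, yet the filters compare those columns to the strings 'M'/'F' and '0-17', '18-25', ..., and comparing an integer Series to a string is exactly what raises "invalid type comparison" in this pandas version. A sketch of type-consistent filters, using hypothetical age-bin boundaries that match the labels in features:
import pandas as pd

# Hypothetical bin edges matching the labels in `features`
age_bins = [0, 17, 25, 35, 45, 50, 55, 200]
df_gp['AgeGroup'] = pd.cut(df_gp['Age'], bins=age_bins, labels=features)

# Inside the plotting loop, compare like with like:
male   = df_gp[(df_gp.GENDER == 1) & (df_gp.AgeGroup == features[i])].Dement
female = df_gp[(df_gp.GENDER == 0) & (df_gp.AgeGroup == features[i])].Dement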

Spark random forest - could not convert float to int error

I have features which are numeric and a binary response. I am trying to build ensemble decision trees such as random forest and gradient-boosted trees. However, I get an error. I have reproduced the error with iris data.
The error is below and the whole error message is at the bottom.
TypeError: Could not convert 12.631578947368421 to int
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import pandas as pd
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
y = list(iris.target)
df = pd.read_csv("https://raw.githubusercontent.com/venky14/Machine-Learning-with-Iris-Dataset/master/Iris.csv")
df = df.drop(['Species'], axis = 1)
df['label'] = y
spark_df = spark.createDataFrame(df).drop('Id')
cols = spark_df.drop('label').columns
assembler = VectorAssembler(inputCols = cols, outputCol = 'features')
output_dat = assembler.transform(spark_df).select('label', 'features')
rf = RandomForestClassifier(labelCol = "label", featuresCol = "features")
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
    .addGrid(rf.numTrees, np.linspace(10, 60, 20)).build()
crossval_rf = CrossValidator(estimator = rf,
                             estimatorParamMaps = paramGrid_rf,
                             evaluator = BinaryClassificationEvaluator(),
                             numFolds = 5)
cvModel_rf = crossval_rf.fit(output_dat)
TypeError Traceback (most recent call last)
<ipython-input-24-44f8f759ed8e> in <module>
2 paramGrid_rf = ParamGridBuilder() \
3 .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
----> 4 .addGrid(rf.numTrees, np.linspace(10, 60, 20)) \
5 .build()
6
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in build(self)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in to_key_value_pairs(keys, values)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/param/__init__.py in toInt(value)
197 return int(value)
198 else:
--> 199 raise TypeError("Could not convert %s to int" % value)
200
201 #staticmethod
TypeError: Could not convert 12.631578947368421 to int
Both maxDepth and numTrees need to be integers; NumPy linspace produces floats:
import numpy as np
np.linspace(10, 60, 20)
Result:
array([ 10. , 12.63157895, 15.26315789, 17.89473684,
20.52631579, 23.15789474, 25.78947368, 28.42105263,
31.05263158, 33.68421053, 36.31578947, 38.94736842,
41.57894737, 44.21052632, 46.84210526, 49.47368421,
52.10526316, 54.73684211, 57.36842105, 60. ])
So, your code trips over the first non-integer value (here 12.63157895) and produces an error.
Use arange instead:
np.arange(10, 60, 20)
# array([10, 30, 50])
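Applied to the question's grid, one possible fix (the specific step sizes here are only an illustration, and the rf estimator and imports from the question are assumed) is to build the grids from plain Python ints:
import numpy as np

paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [int(d) for d in np.linspace(5, 30, 6)]) \
    .addGrid(rf.numTrees, [int(n) for n in np.arange(10, 60, 10)]) \
    .build()
# maxDepth grid: 5, 10, 15, 20, 25, 30; numTrees grid: 10, 20, 30, 40, 50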

predict 2 dimension output from 10 numbers by tensorflow

I want to predict one number from 10 numbers.
What I want to do is predict t from mat; each mat[i] corresponds to t[i].
Of course I have more than 5 rows in mat and t; I have just simplified the problem here.
I have written the code below.
import numpy as np
import tensorflow as tf
from pprint import pprint

# There is target data `t` and training data `mat[0]`, `mat[1]`, `mat[2]`, ...
t = [0,1,0,1,0] #answer 2 dimension
limit = 10  # number of degrees
mat = [[2,-2,3,-4,2,2,3,5,3,6], #10 degrees number of mat[0] leads t[0]
[1,3,-3,2,2,5,1,3,2,3], #10 degrees number of mat[1] leads t[1]
[-2,3,2,-2,2,-2,1,3,4,5], #10 degrees number of mat[2] leads t[2]
[-2,2,-1,-2,2,-2,7,3,9,2], #10 degrees number of mat[3] leads t[3]
[-2,-3,2,-2,2,-4,1,-4,4,5], #10 degrees number of mat[4] leads t[4]
]
x = tf.placeholder(tf.float32,[None,10])
w = tf.Variable(tf.zeros([10,5]))
y = tf.matmul(x,w)
t = tf.placeholder(tf.float32,[None,1])
loss = tf.reduce_sum(tf.square(y-t))
train_step = tf.train.AdamOptimizer().minimize(loss)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
train_t = np.array(mat)
train_t = train_t.reshape([limit,5])
train_x = np.zeros([limit,5])
# initialize
for row, num in enumerate(range(1, limit + 1)):
    for col, n in enumerate(range(0, 5)):
        train_x[row][col] = num**n
i = 0
for _ in range(100000):
    i += 1
    sess.run(train_step, feed_dict={x: train_x, t: train_t})
    if i % 10000 == 0:
        loss_val = sess.run(loss, feed_dict={x: train_x, t: train_t})
        print('step : %d, Loss: %f' % (i, loss_val))
w_val = sess.run(w)
pprint("w_val")
pprint(w_val)
However, this shows the following error:
Traceback (most recent call last):
File "wisdom2.py", line 60, in <module>
sess.run(train_step,feed_dict={x:train_x,t:train_t})
File "/Users/whitebear/tensorflow/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 789, in run
run_metadata_ptr)
File "/Users/whitebear/tensorflow/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 975, in _run
% (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (10, 5) for Tensor 'Placeholder:0', which has shape '(?, 10)'
The problem is that the shape of your placeholder and the shape of your input do not match. The placeholder x expects a value with N rows and 10 columns, but train_x has 10 rows and 5 columns. Likewise, t should have N rows and 1 column, but the passed value train_t has 10 rows and 5 columns. You should either change the shape of your placeholders or the shape of your inputs.
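To make that concrete, here is a minimal sketch of one consistent set of shapes, assuming each row of mat is a 10-feature example and each t[i] is its scalar target (the target placeholder is renamed t_ph so it does not overwrite the Python list t):
import numpy as np
import tensorflow as tf  # TF 1.x API, as in the question

train_x = np.array(mat, dtype=np.float32)               # shape (5, 10): 5 examples, 10 features
train_t = np.array(t, dtype=np.float32).reshape(-1, 1)  # shape (5, 1): one target per example

x = tf.placeholder(tf.float32, [None, 10])
t_ph = tf.placeholder(tf.float32, [None, 1])
w = tf.Variable(tf.zeros([10, 1]))                       # 10 inputs -> 1 output
y = tf.matmul(x, w)
loss = tf.reduce_sum(tf.square(y - t_ph))
train_step = tf.train.AdamOptimizer().minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(train_step, feed_dict={x: train_x, t_ph: train_t})  # shapes now match the placeholders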

New error during set_labels in pandas 0.19.2: ValueError: Unequal label lengths

After upgrading from Pandas 0.18.1 to 0.19.2, I am getting the following error when I try to add new levels and labels to my dataframe. Any idea what the problem is?
print index
MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
labels=[[0, 1], [0, 0]],
names=[u'segment..ASRinfo..supportedUtt', u'label'])
print levels
[['1', '2', 'Total'], ['nextLevel']]
print labels
[[0, 1, 2], [0, 0, 0]]
index = index.set_levels(levels)
print index
MultiIndex(levels=[[u'Supported', u'Unsupported', u'Total'], [u'nextLevel']],
labels=[[0, 1], [0, 0]],
names=[u'segment..ASRinfo..supportedUtt', u'label'])
index = index.set_labels(labels)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-11-f6fb11fbbb3a> in <module>()
288
289 # Initialize dfplot
--> 290 slice_data()
291
292 if len(resultList)==1:
<ipython-input-11-f6fb11fbbb3a> in slice_data(*args)
71 index = index.set_levels(levels)
72 print index
---> 73 index = index.set_labels(labels)
74 data_slice = data_slice.reindex(index)
75
/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in set_labels(self, labels, level, inplace, verify_integrity)
350 idx = self._shallow_copy()
351 idx._reset_identity()
--> 352 idx._set_labels(labels, level=level, verify_integrity=verify_integrity)
353 if not inplace:
354 return idx
/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _set_labels(self, labels, level, copy, validate, verify_integrity)
285
286 if verify_integrity:
--> 287 self._verify_integrity(labels=new_labels)
288
289 self._labels = new_labels
/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _verify_integrity(self, labels, levels)
145 if len(label) != label_length:
146 raise ValueError("Unequal label lengths: %s" %
--> 147 ([len(lab) for lab in labels]))
148 if len(label) and label.max() >= len(level):
149 raise ValueError("On level %d, label max (%d) >= length of"
ValueError: Unequal label lengths: [3, 3]
I'm wondering if it's a bug in the new pandas code. Perhaps self.labels[0] should be labels[0]?
def _verify_integrity(self, labels=None, levels=None):
    """
    Parameters
    ----------
    labels : optional list
        Labels to check for validity. Defaults to current labels.
    levels : optional list
        Levels to check for validity. Defaults to current levels.

    Raises
    ------
    ValueError
        * if length of levels and labels don't match or any label would
          exceed level bounds
    """
    # NOTE: Currently does not check, among other things, that cached
    # nlevels matches nor that sortorder matches actually sortorder.
    labels = labels or self.labels
    levels = levels or self.levels

    if len(levels) != len(labels):
        raise ValueError("Length of levels and labels must match. NOTE:"
                         " this index is in an inconsistent state.")
    label_length = len(self.labels[0])
    for i, (level, label) in enumerate(zip(levels, labels)):
        if len(label) != label_length:
            raise ValueError("Unequal label lengths: %s" %
                             ([len(lab) for lab in labels]))
        if len(label) and label.max() >= len(level):
            raise ValueError("On level %d, label max (%d) >= length of"
                             " level (%d). NOTE: this index is in an"
                             " inconsistent state" % (i, label.max(),
                                                      len(level)))
I tested my fix and it worked! I submitted a bug to Pandas:
https://github.com/pandas-dev/pandas/issues/15157
I'm not sure if it's a bug - I suppose Pandas could replace all the extra indexes with missing values doing it your way, but I think you should use reindex:
df.reindex(index2)
index = pd.MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
labels=[[0, 1], [0, 0]],
names=[u'segment..ASRinfo..supportedUtt', u'label'])
index2 = pd.MultiIndex(levels=[['1', '2', 'Total'], ['nextLevel']],
labels=[[0, 1, 2], [0, 0, 0]],
names=[u'segment..ASRinfo..supportedUtt', u'label'])
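For illustration, a minimal sketch of what the reindex does with these two indexes (the small 'count' frame here is hypothetical, and the labels= keyword matches the pandas 0.19-era API used above):
import pandas as pd

df = pd.DataFrame({'count': [10, 20]}, index=index)  # hypothetical two-row frame on the original index
df2 = df.reindex(index2)
# df2 now contains a third row ('Total', 'nextLevel') filled with NaN,
# which can then be populated, e.g.:
df2.loc[('Total', 'nextLevel'), 'count'] = df['count'].sum()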
I am new to Pandas, and I found the documentation on MultiIndexing difficult to adapt to solving my own problem. Basically, I want to add some extra rows. This is the solution I came up with. There is probably a much better way to do it. Feel free to share if you'd like.
groupbyColumns = ['label0', 'label1']
data_slice = dataframe.groupby(by=groupbyColumns).sum()
index = data_slice.index
levels = list()
for levelIter in range(len(data_slice.index.levels)):
    levels.append([x for x in data_slice.index.levels[levelIter]])
levels[0].append('Total')
if len(resultList) == 2:
    levels[-1].append('Difference')
    addIndexCountForDifferenceRow = 1
else:
    addIndexCountForDifferenceRow = 0
# Create new indexing sequence since we are adding Total (and Difference if doing comparison) rows
labels = list()
for labelIter in range(len(data_slice.index.labels)):
    labels.append(list())
if len(data_slice.index.labels) == 2:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    for iter0 in range(max(labels0) + 2):
        for iter1 in range(max(labels1) + 1 + addIndexCountForDifferenceRow):
            labels[0].append(iter0)
            labels[1].append(iter1)
if len(data_slice.index.labels) == 3:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    labels2 = [x for x in data_slice.index.labels[2]]
    for iter0 in range(max(labels0) + 2):
        for iter1 in range(max(labels1) + 1):
            for iter2 in range(max(labels2) + 1 + addIndexCountForDifferenceRow):
                labels[0].append(iter0)
                labels[1].append(iter1)
                labels[2].append(iter2)
index = index.set_levels(levels)
index = index.set_labels(labels)
data_slice = data_slice.reindex(index)