Fast binning of geographical data with negative values - scipy.stats
Trying to bin some geolocated data with scipy's stats.binned_statistic_2d, but it seems to fail as soon as the data contain negative values. Is there a way to do this accurately and fast?
import numpy as np
from scipy import stats
ilats = np.linspace(90,-90, 4000)
ilons = np.linspace(-180, 179.955, 8000)
values = np.array([17,-14, -7,-8,-11,-8,-7,-8,-10,-5,-3,
-12,-5, -6,21,30, 2, 4,-8, 6, 4, 7,
3,-6,-13, 21, 4, 5,11,-6, 8,-5,-6,
9,8,-8, -2,-16,-5,-5,-9,-4,-6,33,
-8,-5,-14,-8,-11,21,24,-7,-13,12,-6,
5,7,8,-3,-3,-4, 4, 9,-3, 9,-11,
-8,6,4, 8,-6,-6,-4,-3, 4, 5,11,
-3,-6,-4,-8,-4,12,-9,-8,15,-10,-5,
-4,12,5,-4, 4, 7,-13, 5,-4,-4,-5,
-8,-10,-9,-7.])
lats = np.array([ 6.7427, 42.7027, 42.6963, 10.3688, 37.5713, 37.5798,
-12.1563, 42.7127, 41.7457, 37.8122, 37.66 , 41.7456,
41.7457, 38.4462, 8.5418, -12.7309, -10.9395, -10.9464,
38.0641, -10.9507, -12.7316, -10.9313, -12.7235, 37.6469,
38.1234, 20.3964, -12.0847, -12.0844, 10.3794, 38.1302,
10.3627, 38.1582, 38.1463, 22.6466, 20.4246, 38.1401,
-36.6505, 38.2352, 37.8795, 40.2281, 37.8125, 42.323 ,
37.8775, 9.3717, 38.732 , 38.7202, 38.2688, 38.9148,
38.9414, -4.8618, -4.8525, 39.0108, 38.8187, -6.5067,
38.009 , -6.5174, -6.5101, -6.51 , 37.7243, 37.7512,
37.7215, -6.4902, -6.5113, 37.5409, 1.9481, 37.6398,
-6.5073, 37.8037, -11.133 , 9.0896, 38.177 , 9.089 ,
37.8708, 38.3848, -3.553 , 9.4345, -3.5343, -3.5769,
37.6847, 37.6045, 37.8857, 38.32 , 8.1673, 37.8822,
37.9113, 8.6278, 37.5652, 37.8236, 37.8593, 8.6219,
-3.5614, 37.924 , 37.7845, 37.8436, 37.8666, 37.6804,
37.639 , 40.7691, 40.7744, 37.8029, 42.9793, 8.207 ,
39.302 ])
lons = np.array([ 60.8964, -96.1017, -96.1049, 71.595 , -97.0008, -97.0126,
57.4887, -96.109 , -95.1058, -97.1088, -96.6413, -95.1054,
-95.1062, -95.2395, 58.3938, -73.7145, -70.626 , -70.5864,
-95.5678, -70.5914, -73.7525, -70.6048, -73.753 , -96.7662,
-95.504 , 100.3965, -70.7921, -70.7905, 71.5499, -95.4816,
71.5457, -95.326 , -95.3355, 96.8339, 100.2684, -95.8697,
39.1031, -95.4456, -96.3814, -94.5726, -96.3782, -95.4554,
-96.3797, -66.7449, -95.1513, -95.1465, -95.0972, -95.2498,
-95.2054, 84.2004, 84.21 , -94.5695, -94.9174, 114.0945,
-95.942 , 114.0592, 114.0956, 114.0873, -96.4689, -96.4599,
-96.463 , 114.0741, 114.0975, -96.582 , 117.2901, -96.572 ,
114.0561, -96.5539, -74.9417, 71.3391, -95.4253, 71.2452,
-96.5511, -95.065 , 107.5832, 71.3906, 107.6005, 107.4975,
-96.9722, -96.9307, -96.2627, -95.1745, 72.5249, -96.2632,
-96.3324, 57.9562, -96.9309, -96.5123, -96.589 , 57.9627,
107.6405, -96.2711, -96.5737, -96.2344, -96.2099, -96.5062,
-96.5248, -94.8421, -94.8522, -96.5873, -97.1523, 72.4707,
-95.0489])
ret = stats.binned_statistic_2d(lons, lats, values, 'count', bins=[ilons, ilats])
I am trying to examine the ungridded data in values by gridding them onto a coarse grid (ilats, ilons), plotting the counts first and the means later. But the call above produces:
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [52], in <cell line: 1>()
----> 1 ret = stats.binned_statistic_2d(lons, lats, ka, 'count', bins=[ilons, ilats])
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:352, in binned_statistic_2d(x, y, values, statistic, bins, range, expand_binnumbers)
349 xedges = yedges = np.asarray(bins, float)
350 bins = [xedges, yedges]
--> 352 medians, edges, binnumbers = binned_statistic_dd(
353 [x, y], values, statistic, bins, range,
354 expand_binnumbers=expand_binnumbers)
356 return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:571, in binned_statistic_dd(sample, values, statistic, bins, range, expand_binnumbers, binned_statistic_result)
569 if binned_statistic_result is None:
570 nbin, edges, dedges = _bin_edges(sample, bins, range)
--> 571 binnumbers = _bin_numbers(sample, nbin, edges, dedges)
572 else:
573 edges = binned_statistic_result.bin_edges
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:752, in _bin_numbers(sample, nbin, edges, dedges)
750 if dedges_min == 0:
751 raise ValueError('The smallest edge difference is numerically 0.')
--> 752     decimal = int(-np.log10(dedges_min)) + 6
753 # Find which points are on the rightmost edge.
754 on_edge = np.where((sample[:, i] >= edges[i][-1]) &
755 (np.around(sample[:, i], decimal) ==
756 np.around(edges[i][-1], decimal)))[0]
ValueError: cannot convert float NaN to integer
It looks like there is a log operation and I don't see a way around it.
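One reading of that traceback, offered as an assumption rather than a confirmed diagnosis: binned_statistic_2d expects each set of bin edges to be monotonically increasing, but ilats runs from 90 down to -90, so the internal edge differences are negative, np.log10 of a negative number is NaN, and the int() conversion blows up. The negative entries in values themselves should not matter for 'count' or 'mean'. A minimal sketch of a workaround under that assumption:

from scipy import stats
import numpy as np

ilats_asc = ilats[::-1]   # ascending edges: -90 ... 90
ret = stats.binned_statistic_2d(lons, lats, values, 'count',
                                bins=[ilons, ilats_asc])
counts = ret.statistic          # shape (len(ilons) - 1, len(ilats) - 1)
counts = counts[:, ::-1]        # flip the latitude axis back to 90 ... -90 if that orientation is wanted

means = stats.binned_statistic_2d(lons, lats, values, 'mean',
                                  bins=[ilons, ilats_asc]).statistic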
Related
Optimization Python
I am trying to get the optimal solution.

Drug data:

D_name  Vial_size1  Vial_size2  Vial_size3  cost  units_needed
Act     120         400         0           $5    738
dug     80          200         400         $40   262

Data in Excel:

Vials        price  size
Vial size 1  5      120
Vial size 2  5      400

from pulp import *
import pandas as pd

df = pd.read_excel(r'C:\Users\*****\Desktop\Vial.xls')
print(df)

prob = LpProblem("Dose_Vial", LpMinimize)

# Create a list of the Vial_Size
Vial_Size = list(df['Vials'])

# Create a dictionary of units for all Vial_Size
size = dict(zip(Vial_Size, df['size']))

# Create a dictionary of price for all Vial_Size
Price = dict(zip(Vial_Size, df['Price']))

# print dictionaries
print(Vial_Size)
print(size)
print(Price)

vial_vars = LpVariable.dicts("Vials", size, lowBound=0, cat='Integer')

# start building the LP problem by adding the main objective function
prob += lpSum([Price[i] * vial_vars[i] * size[i] for i in size])

# adding constraints
prob += lpSum([size[f] * vial_vars[f] for f in size]) >= 738

# The status of the solution is printed to the screen
prob.solve()
print("Status:", LpStatus[prob.status])

# In case the problem is ill-formulated or there is not sufficient information,
# the solution may be infeasible or unbounded
for v in prob.variables():
    if v.varValue > 0:
        print(v.name, "=", format(round(v.varValue)))

obj = round((value(prob.objective)))
print("The total cost of optimized vials: ${}".format(round(obj)))

Output:

Vials_Vial_Size_1 = 3
Vials_Vial_Size_2 = 1
The total cost of optimized vials: $3800

How do I set this up for 2 or more drugs and get the best optimal solution?
Here is an approach to solve the first part of the question, finding vial combinations that minimize the waste (I'm not sure what role the price plays?):

from pulp import *
import pandas as pd
import csv

drugs_dict = {"D_name": ['Act', 'dug'],
              "Vial_size1": [120, 80],
              "Vial_size2": [400, 200],
              "Vial_size3": [0, 400],
              "cost": [5, 40],
              "units_needed": [738, 262]}
df = pd.DataFrame(drugs_dict)

drugs = list(df['D_name'])
vial_1_size = dict(zip(drugs, drugs_dict["Vial_size1"]))
vial_2_size = dict(zip(drugs, drugs_dict["Vial_size2"]))
vial_3_size = dict(zip(drugs, drugs_dict["Vial_size3"]))
units_needed = dict(zip(drugs, drugs_dict["units_needed"]))

results = []
for drug in drugs:
    print(f"drug = {drug}")

    # set up minimum waste problem
    prob = LpProblem("Minimum Waste Problem", LpMinimize)

    # create decision variables
    vial_1_var = LpVariable("Vial_1", lowBound=0, cat='Integer')
    vial_2_var = LpVariable("Vial_2", lowBound=0, cat='Integer')
    vial_3_var = LpVariable("Vial_3", lowBound=0, cat='Integer')

    units = lpSum([vial_1_size[drug] * vial_1_var +
                   vial_2_size[drug] * vial_2_var +
                   vial_3_size[drug] * vial_3_var])

    # objective function
    prob += units

    # constraints
    prob += units >= units_needed[drug]

    prob.solve()

    print(f"units = {units.value()}")
    for v in prob.variables():
        if v.varValue > 0:
            print(v.name, "=", v.varValue)

    results.append([drug, units.value(),
                    int(vial_1_var.value() or 0),
                    int(vial_2_var.value() or 0),
                    int(vial_3_var.value() or 0)])

with open('vial_results.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['drug', 'units', 'vial_1', 'vial_2', 'vial_3'])
    csv_writer.writerows(results)

Running gives:

drug = Act
units = 760.0
Vial_1 = 3.0
Vial_2 = 1.0
drug = dug
units = 280.0
Vial_1 = 1.0
Vial_2 = 1.0
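If the price is meant to be a flat per-vial cost (an assumption; the question leaves its role open), the same per-drug loop can minimize money spent instead of leftover units by swapping the objective. A sketch reusing the dictionaries defined above:

from pulp import LpProblem, LpMinimize, LpVariable, value

cost = dict(zip(drugs, drugs_dict["cost"]))   # assumed: price per vial, independent of vial size

for drug in drugs:
    prob = LpProblem("Minimum_Cost_Problem", LpMinimize)
    v1 = LpVariable("Vial_1", lowBound=0, cat='Integer')
    v2 = LpVariable("Vial_2", lowBound=0, cat='Integer')
    v3 = LpVariable("Vial_3", lowBound=0, cat='Integer')
    units = vial_1_size[drug] * v1 + vial_2_size[drug] * v2 + vial_3_size[drug] * v3
    prob += cost[drug] * (v1 + v2 + v3)     # objective: total cost
    prob += units >= units_needed[drug]     # still has to cover the required units
    prob.solve()
    print(drug, value(prob.objective), v1.value(), v2.value(), v3.value())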
Jupyter "TypeError: invalid type comparison"
Alright, just started a new job and I have been tasked with writing a simple notebook in Jupyter. I really want to impress my supervisor and have been working on this code for hours and can't get it to work; hopefully somebody here can help me. Here is the code I have been working on:

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

df = pd.read_csv(r'C:\Users\jk2588\Documents\EDA\EDA Practice\top1000_dataset.csv',
                 converters={'GENDER': lambda x: int(x == 'Male')},
                 usecols = ['MEMBER_ID', 'GENDER', 'Age', 'Dement'])
df_gp_1 = df[['MEMBER_ID', 'Dement']].groupby('MEMBER_ID').agg(np.mean).reset_index()
df_gp_2 = df[['MEMBER_ID', 'GENDER', 'Age']].groupby('MEMBER_ID').agg(max).reset_index()
df_gp = pd.merge(df_gp_1, df_gp_2, on = ['MEMBER_ID'])
df.head()

Output:

   MEMBER_ID     Age  Dement  GENDER
0  000000002 01   36     NaN       0
1  000000002 01   36     NaN       0
2  000000002 01   36     NaN       0
3  000000002 01   36     NaN       0
4  000000002 01   36     NaN       0

df['Dement'] = df['Dement'].fillna(0)
df['Dement'] = df['Dement'].astype('int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
MEMBER_ID    999 non-null object
Age          999 non-null int64
Dement       999 non-null int64
GENDER       999 non-null int64
dtypes: int64(3), object(1)
memory usage: 31.3+ KB

freq = ((df_gp.Age.value_counts(normalize = True).reset_index().sort_values(by = 'index').Age)*100).tolist()
number_gp = 7

def ax_settings(ax, var_name, x_min, x_max):
    ax.set_xlim(x_min, x_max)
    ax.set_yticks([])
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_edgecolor('#444444')
    ax.spines['bottom'].set_linewidth(2)
    ax.text(0.02, 0.05, var_name, fontsize=17, fontweight="bold", transform = ax.transAxes)
    return None

fig = plt.figure(figsize=(12,7))
gs = gridspec.GridSpec(nrows=number_gp, ncols=2, figure=fig,
                       width_ratios= [3, 1], height_ratios= [1]*number_gp,
                       wspace=0.2, hspace=0.05)
ax = [None]*(number_gp + 1)
features = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']

for i in range(number_gp):
    ax[i] = fig.add_subplot(gs[i, 0])
    ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement,
                ax=ax[i], shade=True, color="blue", bw=300, legend=False)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement,
                ax=ax[i], shade=True, color="red", bw=300, legend=False)
    if i < (number_gp - 1):
        ax[i].set_xticks([])

ax[0].legend(['Male', 'Female'], facecolor='w')
ax[number_gp] = fig.add_subplot(gs[:, 1])
ax[number_gp].spines['right'].set_visible(False)
ax[number_gp].spines['top'].set_visible(False)
ax[number_gp].barh(features, freq, color='#004c99', height=0.4)
ax[number_gp].set_xlim(0,100)
ax[number_gp].invert_yaxis()
ax[number_gp].text(1.09, -0.04, '(%)', fontsize=10, transform = ax[number_gp].transAxes)
ax[number_gp].tick_params(axis='y', labelsize = 14)
plt.show()

I am then met with:

C:\Users\jk2588\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py:1167: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  result = method(y)
--------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-38-8665030edb1c> in <module>()
     24     ax[i] = fig.add_subplot(gs[i, 0])
     25     ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
---> 26     sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
     27     sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
     28     if i < (number_gp - 1): ax[i].set_xticks([])

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
   1281
   1282         with np.errstate(all='ignore'):
-> 1283             res = na_op(values, other)
   1284         if is_scalar(res):
   1285             raise TypeError('Could not compare {typ} type with Series'

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in na_op(x, y)
   1167                 result = method(y)
   1168             if result is NotImplemented:
-> 1169                 raise TypeError("invalid type comparison")
   1170             else:
   1171                 result = op(x, y)

TypeError: invalid type comparison

Please help, I have been faced with an absurd amount of errors this week.
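One likely cause, offered as an inference rather than a confirmed diagnosis: the CSV converter turns GENDER into 0/1 and Age stays an integer, yet the kdeplot filters compare them to the strings 'M'/'F' and '0-17', '18-25', etc., and comparing an int64 column to a string is exactly what raises "invalid type comparison" in this pandas version. A minimal sketch of one way around it, using a hypothetical age_group column built with pd.cut and the numeric gender codes:

import pandas as pd

# hypothetical binning of the integer Age column into the labels used for the plots
age_bins = [0, 17, 25, 35, 45, 50, 55, 200]
features = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
df_gp['age_group'] = pd.cut(df_gp['Age'], bins=age_bins, labels=features)

# GENDER is 0/1 after the converter, so compare against integers, not 'M'/'F'
male_dement = df_gp[(df_gp.GENDER == 1) & (df_gp.age_group == features[0])].Dement
female_dement = df_gp[(df_gp.GENDER == 0) & (df_gp.age_group == features[0])].Dement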
Spark random forest - could not convert float to int error
I have features which are numeric and a binary response. I am trying to build ensemble decision trees such as random forest and gradient-boosted trees. However, I get an error. I have reproduced the error with the iris data. The error is below and the whole error message is at the bottom.

TypeError: Could not convert 12.631578947368421 to int

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import numpy as np
import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
y = list(iris.target)
df = pd.read_csv("https://raw.githubusercontent.com/venky14/Machine-Learning-with-Iris-Dataset/master/Iris.csv")
df = df.drop(['Species'], axis = 1)
df['label'] = y

spark_df = spark.createDataFrame(df).drop('Id')
cols = spark_df.drop('label').columns
assembler = VectorAssembler(inputCols = cols, outputCol = 'features')
output_dat = assembler.transform(spark_df).select('label', 'features')

rf = RandomForestClassifier(labelCol = "label", featuresCol = "features")
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
    .addGrid(rf.numTrees, np.linspace(10, 60, 20)) \
    .build()

crossval_rf = CrossValidator(estimator = rf,
                             estimatorParamMaps = paramGrid_rf,
                             evaluator = BinaryClassificationEvaluator(),
                             numFolds = 5)
cvModel_rf = crossval_rf.fit(output_dat)

TypeError                                 Traceback (most recent call last)
<ipython-input-24-44f8f759ed8e> in <module>
      2 paramGrid_rf = ParamGridBuilder() \
      3     .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
----> 4     .addGrid(rf.numTrees, np.linspace(10, 60, 20)) \
      5     .build()
      6

~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in build(self)
    120             return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
    121
--> 122         return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
    123
    124

~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
    120             return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
    121
--> 122         return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
    123
    124

~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in to_key_value_pairs(keys, values)
    118
    119         def to_key_value_pairs(keys, values):
--> 120             return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
    121
    122         return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]

~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
    118
    119         def to_key_value_pairs(keys, values):
--> 120             return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
    121
    122         return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]

~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/param/__init__.py in toInt(value)
    197             return int(value)
    198         else:
--> 199             raise TypeError("Could not convert %s to int" % value)
    200
    201     @staticmethod

TypeError: Could not convert 12.631578947368421 to int
Both maxDepth and numTrees need to be integers; NumPy linspace produces floats:

import numpy as np
np.linspace(10, 60, 20)

Result:

array([ 10.        ,  12.63157895,  15.26315789,  17.89473684,
        20.52631579,  23.15789474,  25.78947368,  28.42105263,
        31.05263158,  33.68421053,  36.31578947,  38.94736842,
        41.57894737,  44.21052632,  46.84210526,  49.47368421,
        52.10526316,  54.73684211,  57.36842105,  60.        ])

So your code trips over the first non-integer value (here 12.63157895) and produces the error. Use arange instead:

np.arange(10, 60, 20)
# array([10, 30, 50])
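If the intent was to keep roughly 20 grid points rather than 3 (an assumption about the intent), an alternative sketch is to round the linspace values to integers before handing them to addGrid, reusing rf and ParamGridBuilder from the question:

import numpy as np

num_trees = np.unique(np.linspace(10, 60, 20).astype(int))   # ~20 evenly spaced integer values
max_depth = np.arange(5, 31, 5)                              # 5, 10, 15, 20, 25, 30

paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, max_depth) \
    .addGrid(rf.numTrees, num_trees) \
    .build()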
Predict 2-dimensional output from 10 numbers with TensorFlow
I want to predict one number from 10 numbers. What I want to do is predict t from mat: each mat[i] corresponds to t[i]. Of course I have more than 5 rows in mat and t; this just simplifies the problem for now. I have written the code below.

import numpy as np
import tensorflow as tf
from pprint import pprint

# There is target data t and train data mat[0], mat[1], mat[2], ...
t = [0,1,0,1,0]   # answer, 2 dimension
limit = 10        # number of degrees
mat = [[ 2,-2, 3,-4, 2, 2, 3, 5, 3, 6],   # the 10 numbers of mat[0] lead to t[0]
       [ 1, 3,-3, 2, 2, 5, 1, 3, 2, 3],   # the 10 numbers of mat[1] lead to t[1]
       [-2, 3, 2,-2, 2,-2, 1, 3, 4, 5],   # the 10 numbers of mat[2] lead to t[2]
       [-2, 2,-1,-2, 2,-2, 7, 3, 9, 2],   # the 10 numbers of mat[3] lead to t[3]
       [-2,-3, 2,-2, 2,-4, 1,-4, 4, 5],   # the 10 numbers of mat[4] lead to t[4]
      ]

x = tf.placeholder(tf.float32, [None, 10])
w = tf.Variable(tf.zeros([10, 5]))
y = tf.matmul(x, w)
t = tf.placeholder(tf.float32, [None, 1])
loss = tf.reduce_sum(tf.square(y - t))
train_step = tf.train.AdamOptimizer().minimize(loss)

sess = tf.Session()
sess.run(tf.initialize_all_variables())

train_t = np.array(mat)
train_t = train_t.reshape([limit, 5])
train_x = np.zeros([limit, 5])

# initialize
for row, num in enumerate(range(1, limit + 1)):
    for col, n in enumerate(range(0, 5)):
        train_x[row][col] = num**n

i = 0
for _ in range(100000):
    i += 1
    sess.run(train_step, feed_dict={x: train_x, t: train_t})
    if i % 10000 == 0:
        loss_val = sess.run(loss, feed_dict={x: train_x, t: train_t})
        print('step : %d,Loss: %f' % (i, loss_val))
        w_val = sess.run(w)
        pprint("w_val")
        pprint(w_val)

However this shows an error like this:

Traceback (most recent call last):
  File "wisdom2.py", line 60, in <module>
    sess.run(train_step,feed_dict={x:train_x,t:train_t})
  File "/Users/whitebear/tensorflow/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 789, in run
    run_metadata_ptr)
  File "/Users/whitebear/tensorflow/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 975, in _run
    % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (10, 5) for Tensor 'Placeholder:0', which has shape '(?, 10)'
The problem is that the shape of your placeholder and the shape of your input do not match. The placeholder x expects a value with N rows and 10 columns, but train_x has 10 rows and 5 columns. Likewise, t should have N rows and 1 column, but the passed value train_t has 10 rows and 5 columns. You should either change the shape of your placeholders or the shape of your inputs.
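A minimal sketch of the second option (fixing the shapes of the inputs rather than the placeholders), assuming the goal is one target value per row of mat. It reuses mat and the original list t from the question, and renames the target placeholder t_ph so it does not clobber that list:

import numpy as np
import tensorflow as tf

train_x = np.array(mat, dtype=np.float32)                 # shape (5, 10), matches placeholder [None, 10]
train_t = np.array(t, dtype=np.float32).reshape(-1, 1)    # shape (5, 1), matches placeholder [None, 1]

x = tf.placeholder(tf.float32, [None, 10])
t_ph = tf.placeholder(tf.float32, [None, 1])
w = tf.Variable(tf.zeros([10, 1]))                        # one output per row, not 5
y = tf.matmul(x, w)

loss = tf.reduce_sum(tf.square(y - t_ph))
train_step = tf.train.AdamOptimizer().minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(train_step, feed_dict={x: train_x, t_ph: train_t})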
New error during set_labels in pandas 0.19.2: ValueError: Unequal label lengths
After upgrading from Pandas 0.18.1 to 0.19.2, I am getting the following error when I try to add new levels and labels to my dataframe. Any idea what the problem is?

print index
MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])

print levels
[['1', '2', 'Total'], ['nextLevel']]

print labels
[[0, 1, 2], [0, 0, 0]]

index = index.set_levels(levels)

print index
MultiIndex(levels=[[u'Supported', u'Unsupported', u'Total'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])

index = index.set_labels(labels)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-f6fb11fbbb3a> in <module>()
    288
    289 # Initialize dfplot
--> 290 slice_data()
    291
    292 if len(resultList)==1:

<ipython-input-11-f6fb11fbbb3a> in slice_data(*args)
     71     index = index.set_levels(levels)
     72     print index
---> 73     index = index.set_labels(labels)
     74     data_slice = data_slice.reindex(index)
     75

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in set_labels(self, labels, level, inplace, verify_integrity)
    350         idx = self._shallow_copy()
    351         idx._reset_identity()
--> 352         idx._set_labels(labels, level=level, verify_integrity=verify_integrity)
    353         if not inplace:
    354             return idx

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _set_labels(self, labels, level, copy, validate, verify_integrity)
    285
    286         if verify_integrity:
--> 287             self._verify_integrity(labels=new_labels)
    288
    289         self._labels = new_labels

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _verify_integrity(self, labels, levels)
    145             if len(label) != label_length:
    146                 raise ValueError("Unequal label lengths: %s" %
--> 147                                  ([len(lab) for lab in labels]))
    148             if len(label) and label.max() >= len(level):
    149                 raise ValueError("On level %d, label max (%d) >= length of"

ValueError: Unequal label lengths: [3, 3]

I'm wondering if it's a bug in the new pandas code. Perhaps self.labels[0] should be labels[0]?

def _verify_integrity(self, labels=None, levels=None):
    """
    Parameters
    ----------
    labels : optional list
        Labels to check for validity. Defaults to current labels.
    levels : optional list
        Levels to check for validity. Defaults to current levels.

    Raises
    ------
    ValueError
        * if length of levels and labels don't match or any label would
          exceed level bounds
    """
    # NOTE: Currently does not check, among other things, that cached
    # nlevels matches nor that sortorder matches actually sortorder.
    labels = labels or self.labels
    levels = levels or self.levels

    if len(levels) != len(labels):
        raise ValueError("Length of levels and labels must match. NOTE:"
                         " this index is in an inconsistent state.")
    label_length = len(self.labels[0])
    for i, (level, label) in enumerate(zip(levels, labels)):
        if len(label) != label_length:
            raise ValueError("Unequal label lengths: %s" %
                             ([len(lab) for lab in labels]))
        if len(label) and label.max() >= len(level):
            raise ValueError("On level %d, label max (%d) >= length of"
                             " level (%d). NOTE: this index is in an"
                             " inconsistent state" % (i, label.max(), len(level)))
I tested my fix and it worked! I submitted a bug to Pandas: https://github.com/pandas-dev/pandas/issues/15157
I'm not sure if it's a bug - I suppose Pandas could replace all the extra indexes with missing values doing it your way - but I think you should use reindex:

index = pd.MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
                      labels=[[0, 1], [0, 0]],
                      names=[u'segment..ASRinfo..supportedUtt', u'label'])
index2 = pd.MultiIndex(levels=[['1', '2', 'Total'], ['nextLevel']],
                       labels=[[0, 1, 2], [0, 0, 0]],
                       names=[u'segment..ASRinfo..supportedUtt', u'label'])

df.reindex(index2)
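As a quick illustration of what reindex does with those two indexes (toy numbers, not taken from the question):

import pandas as pd

df = pd.DataFrame({'count': [10, 20]}, index=index)   # rows for '1' and '2'
print(df.reindex(index2))                             # the new ('Total', 'nextLevel') row shows up as NaN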
I am new to Pandas, and I found the documentation on MultiIndexing difficult to adapt to solving my own problem. Basically, I want to add some extra rows. This is the solution I came up with. There is probably a much better way to do it. Feel free to share if you'd like.

groupbyColumns = ['label0', 'label1']
data_slice = dataframe.groupby(by=groupbyColumns).sum()
index = data_slice.index

levels = list()
for levelIter in range(len(data_slice.index.levels)):
    levels.append([x for x in data_slice.index.levels[levelIter]])
levels[0].append('Total')
if len(resultList)==2:
    levels[-1].append('Difference')
    addIndexCountForDifferenceRow = 1
else:
    addIndexCountForDifferenceRow = 0

# Create new indexing sequence since we are adding Total (and Difference if doing comparison) rows
labels = list()
for labelIter in range(len(data_slice.index.labels)):
    labels.append(list())

if len(data_slice.index.labels)==2:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    for iter0 in range(max(labels0)+2):
        for iter1 in range(max(labels1)+1+addIndexCountForDifferenceRow):
            labels[0].append(iter0)
            labels[1].append(iter1)

if len(data_slice.index.labels)==3:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    labels2 = [x for x in data_slice.index.labels[2]]
    for iter0 in range(max(labels0)+2):
        for iter1 in range(max(labels1)+1):
            for iter2 in range(max(labels2)+1+addIndexCountForDifferenceRow):
                labels[0].append(iter0)
                labels[1].append(iter1)
                labels[2].append(iter2)

index = index.set_levels(levels)
index = index.set_labels(labels)
data_slice = data_slice.reindex(index)