New error during set_labels in pandas 0.19.2: ValueError: Unequal label lengths

After upgrading from Pandas 0.18.1 to 0.19.2, I am getting the following error when I try to add new levels and labels to my dataframe. Any idea what the problem is?
print index
MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])
print levels
[['1', '2', 'Total'], ['nextLevel']]
print labels
[[0, 1, 2], [0, 0, 0]]
index = index.set_levels(levels)
print index
MultiIndex(levels=[[u'Supported', u'Unsupported', u'Total'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])
index = index.set_labels(labels)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-f6fb11fbbb3a> in <module>()
    288
    289 # Initialize dfplot
--> 290 slice_data()
    291
    292 if len(resultList)==1:

<ipython-input-11-f6fb11fbbb3a> in slice_data(*args)
     71     index = index.set_levels(levels)
     72     print index
---> 73     index = index.set_labels(labels)
     74     data_slice = data_slice.reindex(index)
     75

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in set_labels(self, labels, level, inplace, verify_integrity)
    350         idx = self._shallow_copy()
    351         idx._reset_identity()
--> 352         idx._set_labels(labels, level=level, verify_integrity=verify_integrity)
    353         if not inplace:
    354             return idx

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _set_labels(self, labels, level, copy, validate, verify_integrity)
    285
    286         if verify_integrity:
--> 287             self._verify_integrity(labels=new_labels)
    288
    289         self._labels = new_labels

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _verify_integrity(self, labels, levels)
    145             if len(label) != label_length:
    146                 raise ValueError("Unequal label lengths: %s" %
--> 147                                  ([len(lab) for lab in labels]))
    148             if len(label) and label.max() >= len(level):
    149                 raise ValueError("On level %d, label max (%d) >= length of"

ValueError: Unequal label lengths: [3, 3]
I'm wondering if it's a bug in the new pandas code. Perhaps self.labels[0] should be labels[0]?
def _verify_integrity(self, labels=None, levels=None):
    """
    Parameters
    ----------
    labels : optional list
        Labels to check for validity. Defaults to current labels.
    levels : optional list
        Levels to check for validity. Defaults to current levels.

    Raises
    ------
    ValueError
        * if length of levels and labels don't match or any label would
          exceed level bounds
    """
    # NOTE: Currently does not check, among other things, that cached
    # nlevels matches nor that sortorder matches actually sortorder.
    labels = labels or self.labels
    levels = levels or self.levels

    if len(levels) != len(labels):
        raise ValueError("Length of levels and labels must match. NOTE:"
                         " this index is in an inconsistent state.")
    label_length = len(self.labels[0])
    for i, (level, label) in enumerate(zip(levels, labels)):
        if len(label) != label_length:
            raise ValueError("Unequal label lengths: %s" %
                             ([len(lab) for lab in labels]))
        if len(label) and label.max() >= len(level):
            raise ValueError("On level %d, label max (%d) >= length of"
                             " level (%d). NOTE: this index is in an"
                             " inconsistent state" % (i, label.max(),
                                                      len(level)))
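In other words, the one-line change I'm proposing (a sketch against the 0.19.2 source above, not a vetted upstream patch) is:

label_length = len(labels[0])   # was: len(self.labels[0]), which measures the old labels instead of the ones being set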

I tested my fix and it worked! I submitted a bug to Pandas:
https://github.com/pandas-dev/pandas/issues/15157

I'm not sure if it's a bug. I suppose pandas could replace all the extra indexes with missing values the way you're doing it, but I think you should use reindex instead:
index = pd.MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
                      labels=[[0, 1], [0, 0]],
                      names=[u'segment..ASRinfo..supportedUtt', u'label'])
index2 = pd.MultiIndex(levels=[['1', '2', 'Total'], ['nextLevel']],
                       labels=[[0, 1, 2], [0, 0, 0]],
                       names=[u'segment..ASRinfo..supportedUtt', u'label'])
df.reindex(index2)
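To make the effect concrete, here is a minimal sketch (df here is a hypothetical two-row frame built on the smaller index, not data from the question):

import pandas as pd

df = pd.DataFrame({'value': [10, 20]}, index=index)
print(df.reindex(index2))
# The new ('Total', 'nextLevel') row appears filled with NaN, ready to be populated.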

I am new to Pandas, and I found the documentation on MultiIndexing difficult to adapt to solving my own problem. Basically, I want to add some extra rows. This is the solution I came up with. There is probably a much better way to do it. Feel free to share if you'd like.
groupbyColumns = ['label0', 'label1']
data_slice = dataframe.groupby(by=groupbyColumns).sum()
index = data_slice.index
levels = list()
for levelIter in range(len(data_slice.index.levels)):
    levels.append([x for x in data_slice.index.levels[levelIter]])
levels[0].append('Total')
if len(resultList)==2:
    levels[-1].append('Difference')
    addIndexCountForDifferenceRow = 1
else:
    addIndexCountForDifferenceRow = 0
# Create new indexing sequence since we are adding Total (and Difference if doing comparison) rows
labels = list()
for labelIter in range(len(data_slice.index.labels)):
    labels.append(list())
if len(data_slice.index.labels)==2:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    for iter0 in range(max(labels0)+2):
        for iter1 in range(max(labels1)+1+addIndexCountForDifferenceRow):
            labels[0].append(iter0)
            labels[1].append(iter1)
if len(data_slice.index.labels)==3:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    labels2 = [x for x in data_slice.index.labels[2]]
    for iter0 in range(max(labels0)+2):
        for iter1 in range(max(labels1)+1):
            for iter2 in range(max(labels2)+1+addIndexCountForDifferenceRow):
                labels[0].append(iter0)
                labels[1].append(iter1)
                labels[2].append(iter2)
index = index.set_levels(levels)
index = index.set_labels(labels)
data_slice = data_slice.reindex(index)
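For the plain two-level case without the Difference handling, a shorter sketch of the same idea (my own alternative, not what I used above) is to rebuild the index as a Cartesian product of the extended levels and reindex once:

# Sketch: enumerate all combinations of the extended levels directly
# instead of building the label lists by hand (two-level case only).
new_index = pd.MultiIndex.from_product(levels, names=data_slice.index.names)
data_slice = data_slice.reindex(new_index)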

Related

Normalizing windows in tensorflow dataset

I am trying to build a windowed dataset from a univariate time series.
The idea is that if the series looks like [1, 2, 3, 4, 5, 6] and the window length is 2, then
I'd take windows of length 3 to account for the 2 X features plus the Y target output, so
[[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]. Then I'd shuffle them to avoid ordering bias, and split the input features from the target output for each window: [[[1, 2], [3]], [[2, 3], [4]], [[3, 4], [5]], [[4, 5], [6]]]
def windowed_dataset(series):
    # Initially the data is (N,); expand dims to (N, 1)
    series = tf.expand_dims(series, axis=-1)
    # TensorFlow Dataset from the array
    ds = tf.data.Dataset.from_tensor_slices(series)
    # Create the windows that will serve as input features and label (hence +1)
    ds = ds.window(window_len + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_len + 1))
    # Randomize order
    ds = ds.shuffle(shuffle_buffer)
    # Separate the inputs and the target output (label)
    ds = ds.map(lambda w: (w[:-1], w[-1]))
    return ds.batch(batch_size).prefetch(1)
However, I'd like to add some normalization. For example, if my window is w = [1, 2, 3], then I'd like to normalize according to [p/w[0] - 1 for p in w].
I thought I could achieve this with ds.map and
def normalize_window(w):
    return [((i / w[0]) - 1) for i in w]

ds = ds.map(normalize_window)
because map is supposed to apply the function to each window in the dataset, but this didn't work. All the examples in the tf.data docs use map with lambda functions, but I presume it works with regular functions too.
Does anyone know how it should be done?
EDIT
The traceback I get is
<ipython-input-39-929295e1b775> in <module>()
----> 1 dataset = model_forecast_datasets(btc_model, np_data[:6])

11 frames

/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
    263       except Exception as e:  # pylint:disable=broad-except
    264         if hasattr(e, 'ag_error_metadata'):
--> 265           raise e.ag_error_metadata.to_exception(e)
    266         else:
    267           raise

OperatorNotAllowedInGraphError: in user code:

    <ipython-input-38-b3d0f7e17689>:12 normalize_window  *
        return [(i/w[0] -1) for i in w]
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:561 __iter__
        self._disallow_iteration()
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:557 _disallow_iteration
        self._disallow_in_graph_mode("iterating over `tf.Tensor`")
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:537 _disallow_in_graph_mode
        " this function with @tf.function.".format(task))

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.
You would need a function that vectorizes the calculation, something like
def normalize(data):
    mean = tf.math.reduce_mean(data)
    std = tf.math.reduce_std(data)
    data = tf.subtract(data, mean)
    data = tf.divide(data, std)
    return data

ds = ds.map(normalize)
Edit: for your specific normalization (p / w[0] - 1) this may work:
def normalize(data):
    data1 = tf.divide(data, data[0])
    data1 = tf.subtract(data1, tf.constant(1.0, dtype=data1.dtype))
    return data1
(this would have to go after the window batching, ds = ds.flat_map(...), so that each element passed to map is a whole window)
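Putting it together, here is a minimal sketch of the full pipeline with the normalization folded in; the values of window_len, shuffle_buffer and batch_size are placeholders standing in for whatever the question actually uses:

import tensorflow as tf

window_len, shuffle_buffer, batch_size = 2, 1000, 32

def normalize_window(w):
    # w is a whole window of shape (window_len + 1, 1): compute p / w[0] - 1 elementwise
    return w / w[0] - 1.0

def windowed_dataset(series):
    series = tf.expand_dims(tf.cast(series, tf.float32), axis=-1)
    ds = tf.data.Dataset.from_tensor_slices(series)
    ds = ds.window(window_len + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_len + 1))
    ds = ds.map(normalize_window)           # normalize each whole window
    ds = ds.shuffle(shuffle_buffer)
    ds = ds.map(lambda w: (w[:-1], w[-1]))  # split inputs / target
    return ds.batch(batch_size).prefetch(1)

ds = windowed_dataset(tf.constant([1, 2, 3, 4, 5, 6]))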

Tensorflow: When using slim.dataset.Dataset, is there a way to map label ID values to other values?

dataset = slim.dataset.Dataset(...)
provider = slim.dataset_data_provider.DatasetDataProvider(dataset, ...)
image, labels = provider.get(['image', 'label'])
Let's say, for an example in dataset A, the labels could be [1, 2, 1, 3]. However, for some reason (e.g., due to dataset B), I would like to map the label IDs to other values. The mapping could be like below.
# {old_label: target_label}
mapping = {0: 0, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 1}
For now, I am considering two ways:
-- tf.data.Dataset seems to have a map(map_func) function that every example passes through, which could be the solution. However, I am more familiar with slim.dataset.Dataset. Is there a similar trick for slim.dataset.Dataset?
-- I was wondering if I can simply apply some mapping function to a label tensor, such as:
new_labels = tf.map_fn(lambda x: x + 1, labels, dtype=tf.int32)
# labels = [1 2 1 3] --> new_labels = [2 3 2 4]. This works.
new_labels = tf.map_fn(lambda x: mapping[x], labels, dtype=tf.int32)
# This is the one I need, but it does not work!
Could anyone please advise?
I think you can try tf.contrib.lookup:
keys = list(mapping.keys())
values = [mapping[k] for k in keys]
table = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(keys, values, key_dtype=tf.int64, value_dtype=tf.int64), -1
)
new_labels = table.lookup(labels)

sess = tf.Session()
sess.run(table.init)
print(sess.run(new_labels))
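Since the label IDs here are small non-negative integers, an alternative sketch (my own, not part of the original answer) is a dense lookup with tf.gather, which also works inside the graph:

import tensorflow as tf

mapping = {0: 0, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 1}
# Dense lookup table: position = old label ID, value = new label ID.
lookup = tf.constant([mapping[i] for i in range(max(mapping) + 1)], dtype=tf.int32)

labels = tf.constant([1, 2, 1, 3], dtype=tf.int32)
new_labels = tf.gather(lookup, labels)

with tf.Session() as sess:
    print(sess.run(new_labels))  # [2 2 2 2]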

ValueError: setting an array element with a sequence (LogisticRegression with Array based feature)

Thanks in advance for any guidance. I'm attempting to do classification via Logistic Regression using scikit-learn, where X consists of an Intercept column plus one field, heartrate, that is an array of heart rate data. Based on researching others who've faced this error, I've made sure the heartrate arrays are all the same shape/size.
The ValueError is raised in sklearn/utils/validation.py, line 382, in check_array, on the line where a copy of the dataframe is made via array = np.array(array, dtype=dtype, order=order, copy=copy). I suspect that my arrays aren't contiguous in memory and that's what's causing the problem, but I'm not sure...
Here are some code snippets to help sleuth out the problem:
def get_training_set(self):
    training_set = []
    after_date = datetime.utcnow() - timedelta(weeks=8)
    before_date = datetime.utcnow() - timedelta(hours=12)
    activities = self.strava_client.get_activities(after=after_date, before=before_date)
    for act in activities:
        if act.has_heartrate:
            streams = self.strava_client.get_activity_streams(activity_id=act.id, types=['heartrate'])
            heartrate = np.array(list(filter(lambda x: x is not None, streams['heartrate'].data)))
            fixed_heartrate = np.pad(heartrate, (0, 15000 - len(heartrate)), 'constant')
            item = {'activity_type': self.classes.index(act.type), 'heartrate': fixed_heartrate}
            training_set.append(item)
    return pd.DataFrame(training_set)

def train(self):
    df = self.get_training_set()
    df['Intercept'] = np.ones((len(df),))
    y = df[['activity_type']]
    X = df[['Intercept', 'heartrate']]
    y = np.ravel(y)
    model = LogisticRegression()
    self.debug('y={}'.format(y))
    model = model.fit(X, y)
The exception occurs in fit...
copied from comment for improved formatting:
/python3.5/site-packages/sklearn/linear_model/logistic.py", line 1173, in fit
    order="C")
File "/python3.5/site-packages/sklearn/utils/validation.py", line 521, in check_X_y
    ensure_min_features, warn_on_dtype, estimator)
File "/lib/python3.5/site-packages/sklearn/utils/validation.py", line 382, in check_array
    array = np.array(array, dtype=dtype, order=order, copy=copy)
ValueError: setting an array element with a sequence
and the other comment:
X and y look like this:
X.shape=(29, 2)
y.shape=(29,)
X=[[1 array([74, 74, 77, ..., 0, 0, 0])]
[1 array([66, 67, 69, ..., 0, 0, 0])]
...
[1 array([92, 92, 91, ..., 0, 0, 0])]
[1 array([79, 79, 79, ..., 0, 0, 0])]]
y=[ 0 11 11 0 1 0 11 0 11 1 0 11 0 0 11 0 0 0 0 0 11 0 11 0 0 0 11 0 0]
Do things work better if you change train() to look like this?
def train(self):
    df = self.get_training_set()
    df['Intercept'] = 1  # (a)
    y = df['activity_type'].values  # (b)
    X = [np.concatenate((np.array([col1]), col2)) for col1, col2 in df[['Intercept', 'heartrate']].values]
    model = LogisticRegression()
    model.fit(X, y)  # (c)
(a) Assigning a scalar broadcasts it to a column of the correct length
(b) Use .values to get a numpy array instead of another DataFrame
(c) fit is done in place, so there is no need to reassign model
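The underlying issue is that each cell of the heartrate column holds a whole array, so check_array cannot build a flat 2D numeric matrix from the frame. A minimal sketch of the same idea on synthetic data (the shapes and random values are placeholders, not the real Strava data):

import numpy as np
from sklearn.linear_model import LogisticRegression

n_samples, n_points = 29, 15000
heartrates = [np.random.randint(60, 180, size=n_points) for _ in range(n_samples)]
y = np.random.randint(0, 3, size=n_samples)

# Stack the per-activity arrays into one 2D array and prepend the intercept column,
# so every feature is a plain number rather than a nested array.
X = np.column_stack([np.ones(n_samples), np.vstack(heartrates)])

model = LogisticRegression()
model.fit(X, y)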

Trimming column names is generating ValueError

I have a table which I run through a function to trim its column names down to 128 characters (I know that's really long; there isn't anything I can do about that) so I can use to_sql to create a database table from it.
def truncate_column_names(df, length):
    rename = {}
    for col in df.columns:
        if len(col) > length:
            new_col = col[:length-3] + "..."
            rename[col] = new_col
    result = df.rename(columns=rename)
    return result
This function works and I get a table out just fine, but the problem comes when I try to save the file: I get the error
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
The housekeeping method I run before saving to a file includes dropping duplicates, and that is where this error is thrown. I tested this by saving the original dataframe, loading it, running the truncate function, and then trying drop_duplicates on the result, and I get the same error.
The headers for the file before I try truncating look like this:
http://pastebin.com/WXmvwHDg
I trimmed the file down to 1 record and still have the problem.
This was a result of the truncating causing some columns to have non-unique names.
To confirm this was an issue I did a short test:
In [113]: df = pd.DataFrame(columns=["ab", "ac", "ad"])
In [114]: df
Out[114]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [115]: df.drop_duplicates()
Out[115]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [116]: df.columns
Out[116]: Index([u'ab', u'ac', u'ad'], dtype='object')
In [117]: df.columns = df.columns.str[:1]
In [118]: df
Out[118]:
Empty DataFrame
Columns: [a, a, a]
Index: []
In [119]: df.drop_duplicates()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-119-daf275b6788b> in <module>()
----> 1 df.drop_duplicates()
C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
     86             else:
     87                 kwargs[new_arg_name] = new_arg_value
---> 88             return func(*args, **kwargs)
     89         return wrapper
     90     return _deprecate_kwarg

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in drop_duplicates(self, subset, take_last, inplace)
   2826         deduplicated : DataFrame
   2827         """
-> 2828         duplicated = self.duplicated(subset, take_last=take_last)
   2829
   2830         if inplace:

C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
     86             else:
     87                 kwargs[new_arg_name] = new_arg_value
---> 88             return func(*args, **kwargs)
     89         return wrapper
     90     return _deprecate_kwarg

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in duplicated(self, subset, take_last)
   2871
   2872         vals = (self[col].values for col in subset)
-> 2873         labels, shape = map(list, zip(*map(f, vals)))
   2874
   2875         ids = get_group_index(labels, shape, sort=False, xnull=False)

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in f(vals)
   2860
   2861         def f(vals):
-> 2862             labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
   2863             return labels.astype('i8', copy=False), len(shape)
   2864

C:\Miniconda\lib\site-packages\pandas\core\algorithms.pyc in factorize(values, sort, order, na_sentinel, size_hint)
    133     table = hash_klass(size_hint or len(vals))
    134     uniques = vec_klass()
--> 135     labels = table.get_labels(vals, uniques, 0, na_sentinel)
    136
    137     labels = com._ensure_platform_int(labels)

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_labels (pandas\hashtable.c:13946)()

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
and got the same result. Using df.columns.unique() I found I had ~200 duplicate column names after the truncation.
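One way around it, sketched below, is to make the truncated names unique before de-duplicating or saving; the counter-suffix scheme here is my own choice, not something from the original code:

import pandas as pd

def truncate_column_names_unique(df, length):
    # Truncate column names to at most `length` characters, appending a
    # counter suffix when two truncated names would otherwise collide.
    seen = {}
    new_cols = []
    for col in df.columns:
        new_col = col if len(col) <= length else col[:length-3] + "..."
        n = seen.get(new_col, 0)
        seen[new_col] = n + 1
        if n:
            suffix = "_{}".format(n)
            new_col = new_col[:length - len(suffix)] + suffix
        new_cols.append(new_col)
    out = df.copy()
    out.columns = new_cols
    return out

df = pd.DataFrame(columns=["alpha_one", "alpha_two", "alpha_three"])
df2 = truncate_column_names_unique(df, 7)
print(df2.columns)      # three distinct truncated names
df2.drop_duplicates()   # no longer raises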

Creating image from point list with Numpy, how to speed up?

I have the following code, which seems to be a performance bottleneck:
for x, y, intensity in myarr:
    target_map[x, y] = target_map[x, y] + intensity
There are multiple rows for the same (x, y) coordinate, each with its own intensity.
Datatypes:
> print myarr.shape, myarr.dtype
(219929, 3) uint32
> print target_map.shape, target_map.dtype
(150, 200) uint32
Is there any way to optimize this loop, other than writing it in C?
This seems to be a related question; however, I couldn't get the accepted answer working for me: How to convert python list of points to numpy image array?
I get the following error message:
Traceback (most recent call last):
  File "<pyshell#38>", line 1, in <module>
    image[coordinates] = 1
IndexError: too many indices for array
If you convert your 2D coordinates into target_map to flat indices using np.ravel_multi_index, you can use np.unique and np.bincount to speed things up quite a bit:
def vec_intensity(my_arr, target_map):
    flat_coords = np.ravel_multi_index((my_arr[:, 0], my_arr[:, 1]),
                                       dims=target_map.shape)
    unique_, idx = np.unique(flat_coords, return_inverse=True)
    sum_ = np.bincount(idx, weights=my_arr[:, 2])
    target_map.ravel()[unique_] += sum_
    return target_map

def intensity(my_arr, target_map):
    for x, y, intensity in my_arr:
        target_map[x, y] += intensity
    return target_map

# Sample data set
rows, cols = 150, 200
items = 219929
myarr = np.empty((items, 3), dtype=np.uint32)
myarr[:, 0] = np.random.randint(rows, size=(items,))
myarr[:, 1] = np.random.randint(cols, size=(items,))
myarr[:, 2] = np.random.randint(100, size=(items,))
And now:
In [6]: %timeit target_map_1 = np.zeros((rows, cols), dtype=np.uint32); target_map_1 = vec_intensity(myarr, target_map_1)
10 loops, best of 3: 53.1 ms per loop
In [7]: %timeit target_map_2 = np.zeros((rows, cols), dtype=np.uint32); target_map_2 = intensity(myarr, target_map_2)
1 loops, best of 3: 934 ms per loop
In [8]: np.all(target_map_1 == target_map_2)
Out[8]: True
That's almost a 20x speed increase.
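For reference, the same accumulation can also be written compactly with np.add.at, an unbuffered in-place add that handles repeated coordinates correctly; this is my own addition, not part of the original answer, and I have not benchmarked it against vec_intensity:

def add_at_intensity(my_arr, target_map):
    # np.add.at accumulates every occurrence of a repeated (x, y) pair,
    # unlike plain fancy-index assignment, which keeps only the last one.
    np.add.at(target_map, (my_arr[:, 0], my_arr[:, 1]), my_arr[:, 2])
    return target_map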