Alternate row colour for dataframe - pandas

Trying to build on this: Python: Color pandas dataframe based on MultiIndex
I've extended the code:
import pandas as pd
i = pd.MultiIndex.from_tuples([(0, 'zero'), (0, 'one'), (1, 'zero'), (1, 'one'), (1, 'two'), (1, 'three'), (1, 'four'), (2, 'zero'), (2, 'one'), (2, 'two'), (2, 'three'), (2, 'four')], names=['level_0', 'level_1'])
df = pd.DataFrame(range(0, len(i)), index=i, columns=['foo'])
colors = {0: (0.6, 0.8, 0.8, 1), 1: (1, 0.9, 0.4, 1), 2: (0.6, 0.8, 0.8, 1)}
#convert rgba to integers
c1 = {k: (int(r * 255),int(g * 255),int(b * 255), a) for k, (r,g,b,a) in colors.items()}
c2 = {k: (int(r * 255),int(g * 255),int(b * 255), 0.25) for k, (r,g,b,a) in colors.items()}
#get values of first level of MultiIndex
idx = df.index.get_level_values(0)
#counter per first level for even and odd row coloring
zipped = zip(df.groupby(idx).cumcount(), enumerate(idx))
css = [{'selector': f'.row{i}', 'props': [('background-color', f'rgba{c1[j]}')]}
       if v % 2 == 0
       else {'selector': f'.row{i}', 'props': [('background-color', f'rgba{c2[j]}')]}
       for v, (i, j) in zipped]
df.style.set_table_styles(css)
And got this:
It seems tedious to do this manually. So how do I go about generalising it so that it covers all rows, and the pattern applies even if I apply it to other such 2-level multi-index dataframes?

Here is one way to do it with cycle from Python standard library's itertools module:
import pandas as pd
# Setup
i = pd.MultiIndex.from_tuples(
    [
        (0, "zero"),
        (0, "one"),
        (1, "zero"),
        (1, "one"),
        (1, "two"),
        (1, "three"),
        (1, "four"),
        (2, "zero"),
        (2, "one"),
        (2, "two"),
        (2, "three"),
        (2, "four"),
        (3, "one"),
    ],
    names=["level_0", "level_1"],
)
df = pd.DataFrame(range(0, len(i)), index=i, columns=["foo"])
# Define two pairs of colors (dark and light green/yellow)
from itertools import cycle
colors = [(0.6, 0.8, 0.8), (1, 0.9, 0.4)] # green, yellow
color_cycle = cycle(
    [
        {
            k: (int(c[0] * 255), int(c[1] * 255), int(c[2] * 255), a)
            for k, a in enumerate([1, 0.25])
        }
        for c in colors
    ]
)
# Define color for each row
bg_colors = []
for i in df.index.get_level_values(0).unique():
    color = next(color_cycle)
    row_color = cycle(
        [
            {
                "props": [("background-color", f"rgba{color[0]}")],
            },
            {
                "props": [("background-color", f"rgba{color[1]}")],
            },
        ]
    )
    for _ in range(df.loc[(i,), :].shape[0]):
        bg_colors.append(next(row_color))
# Style dataframe
css = [{"selector": f".row{i}"} | color for i, color in enumerate(bg_colors)]
df.style.set_table_styles(css)
Output from last cell in Jupyter notebook:
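For comparison, here is an alternative sketch (my own addition, not part of the original answer, assuming pandas' Styler.apply with axis=None) that produces essentially the same colouring by returning CSS strings per cell instead of table-level selectors; it reuses the df defined above:
def alternate_row_colors(data):
    # Two base colours cycled across level-0 groups; the alpha alternates
    # between 1 and 0.25 for even/odd rows within each group.
    base = ["rgba(153, 204, 204, {a})", "rgba(255, 229, 102, {a})"]
    level0 = data.index.get_level_values(0)
    offsets = data.groupby(level0).cumcount()
    styles = pd.DataFrame("", index=data.index, columns=data.columns)
    for pos, (grp, off) in enumerate(zip(level0, offsets)):
        color = base[grp % 2].format(a=1 if off % 2 == 0 else 0.25)
        styles.iloc[pos, :] = f"background-color: {color}"
    return styles

df.style.apply(alternate_row_colors, axis=None)
This keeps the colouring logic in one plain function and avoids depending on the .rowN CSS classes that Styler generates.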

Related

Combinations & Numpy

I need to rate each combination in order to get the best one.
I completed the first step, but it is not optimized at all.
When RQ, NBPIS or NBSER gets big, my code takes far too long to run.
Do you have an idea how to get the same result much faster?
Thank you very much
import numpy as np
from itertools import combinations, combinations_with_replacement
#USER SETTINGS
RQ=['A','B','C','D','E','F','G','H']
NBPIS=3
NBSER=3
#CODE
Combi1=np.array(list(combinations_with_replacement(RQ,NBPIS)))
Combi2=combinations_with_replacement(Combi1,NBSER)
Combi3=np.array([])
Compt=0
First=0
for X in Combi2:
    Long=0
    Compt=Compt+1
    Y=np.array(X)
    for Z in RQ:
        Long=Long+1
        if Z not in Y:
            break
        elif Long==len(RQ):
            if First==0:
                Combi3=Y
                Combi3 = np.expand_dims(Combi3, axis = 0)
                First=1
            else:
                Combi3=np.append(Combi3, [Y], axis = 0)
#RESULTS
print(Combi3)
print(Combi3.shape)
print(Compt)
Assuming your code produces the desired result, the first step in optimizing it is refactoring. This might help others to jump in and help as well.
Let's start making a function of it.
#USER SETTINGS
RQ=['A','B','C','D','E','F','G','H']
NBPIS=3
NBSER=3
def your_code():
    Combi1=np.array(list(combinations_with_replacement(RQ,NBPIS)))
    Combi2=combinations_with_replacement(Combi1,NBSER)
    Combi3=np.array([])
    Compt=0
    First=0
    for X in Combi2:
        Long=0
        Compt=Compt+1
        Y=np.array(X)
        for Z in RQ:
            Long=Long+1
            if Z not in Y:
                break
            elif Long==len(RQ):
                if First==0:
                    Combi3=Y
                    Combi3 = np.expand_dims(Combi3, axis = 0)
                    First=1
                else:
                    Combi3=np.append(Combi3, [Y], axis = 0)
    shape = Combi3.shape
    size = Compt
    return Combi3, shape, size
Refactoring
Notice that Compt is equal to len(Combi2), so turning Combi2 into a NumPy array will help simplify the code. This also allows the variable Y to be replaced by X. Also, there is no need for Combi1 to be a NumPy array, since it is only consumed by combinations_with_replacement.
def your_code_refactored():
    Combi1 = combinations_with_replacement(RQ,NBPIS)
    Combi2 = np.array(list(combinations_with_replacement(Combi1,NBSER)))
    Combi3=np.array([])
    First=0
    for X in Combi2:
        Long=0
        for Z in RQ:
            Long=Long+1
            if Z not in X:
                break
            elif Long==len(RQ):
                if First==0:
                    Combi3=X
                    Combi3 = np.expand_dims(Combi3, axis = 0)
                    First=1
                else:
                    Combi3=np.append(Combi3, [X], axis = 0)
    shape = Combi3.shape
    size = len(Combi2)
    return Combi3, shape, size
The next thing is to refactor how Combi3 is created and populated. The variable First is only used to expand Combi3's dimensions on the first iteration, so this logic can be simplified as,
def your_code_refactored():
    Combi1 = combinations_with_replacement(RQ,NBPIS)
    Combi2 = np.array(list(combinations_with_replacement(Combi1,NBSER)))
    Combi3 = np.empty((0, NBSER, NBPIS))
    for X in Combi2:
        Long=0
        for Z in RQ:
            Long=Long+1
            if Z not in X:
                break
            elif Long==len(RQ):
                Combi3 = np.append(Combi3, [X], axis = 0)
    shape = Combi3.shape
    size = len(Combi2)
    return Combi3, shape, size
It seems Combi3 is populated only with combinations that contain at least one of each element from RQ. This is accomplished by testing whether each element of RQ is in X, which is basically checking whether RQ is a subset of X. So it can be simplified further,
def your_code_refactored():
    Combi1 = combinations_with_replacement(RQ,NBPIS)
    Combi2 = np.array(list(combinations_with_replacement(Combi1,NBSER)))
    Combi3 = np.empty((0, NBSER, NBPIS))
    unique_RQ = set(RQ)
    for X in Combi2:
        if unique_RQ.issubset(X.flatten()):
            Combi3 = np.append(Combi3, [X], axis = 0)
    shape = Combi3.shape
    size = len(Combi2)
    return Combi3, shape, size
This looks much simpler. Time to make it faster, maybe :)
Optimizing
One way this code can be optimized is to replace np.append with list.append. NumPy's documentation notes that np.append reallocates a larger and larger array each time it is called. The code might be sped up with list.append, since lists over-allocate memory. So the code can be rewritten with a list comprehension.
def your_code_refactored_and_optimized():
    Combi1 = combinations_with_replacement(RQ,NBPIS)
    Combi2 = np.array(list(combinations_with_replacement(Combi1,NBSER)))
    unique_RQ = set(RQ)
    Combi3 = np.array([X for X in Combi2 if unique_RQ.issubset(X.flatten())])
    shape = Combi3.shape
    size = len(Combi2)
    return Combi3, shape, size
Testing
Now we can check whether it runs faster.
import timeit
n = 5
print(timeit.timeit('your_code()', globals=globals(), number=n))
print(timeit.timeit('your_code_refactored_and_optimized()', globals=globals(), number=n))
This isn't much of a gain in speed, but it's something :)
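Going a bit further (a sketch of my own, not benchmarked), the filtering can also be done while the combinations generator is consumed, which avoids materializing the full Combi2 array in memory; whether it is actually faster depends on the input sizes:
def your_code_generator_filtered():
    # Same logic as above, but the outer combinations are consumed lazily
    # instead of being converted to a (possibly huge) NumPy array up front.
    unique_RQ = set(RQ)
    Combi1 = list(combinations_with_replacement(RQ, NBPIS))
    kept = []
    size = 0
    for X in combinations_with_replacement(Combi1, NBSER):
        size += 1
        if unique_RQ.issubset(letter for group in X for letter in group):
            kept.append(X)
    Combi3 = np.array(kept)
    return Combi3, Combi3.shape, size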
I have an idea to reduce execution time by removing unnecessary combinations. To simplify, consider the following example with:
RQ=['A','B','C']
NBPIS=3
NBSER=3
Currently with:
Combi1 = combinations_with_replacement(RQ,NBPIS)
print(list(Combi1))
[('A', 'A', 'A'), ('A', 'A', 'B'), ('A', 'A', 'C'), ('A', 'B', 'B'),
('A', 'B', 'C'), ('A', 'C', 'C'), ('B', 'B', 'B'), ('B', 'B', 'C'),
('B', 'C', 'C'), ('C', 'C', 'C')]
But with:
Combi1 = list(list(combinations(RQ,W)) for W in range(1,NBPIS+1))
print(Combi1)
[[('A',), ('B',), ('C',)], [('A', 'B'), ('A', 'C'), ('B', 'C')],
[('A', 'B', 'C')]]
Problem with:
Combi1 = list(list(combinations(RQ,W)) for W in range(1,NBPIS+1))
Error message:
Combi3 = np.array([X for X in Combi2 if
unique_RQ.issubset(X.flatten())])
TypeError: unhashable type: 'list'
But with:
Combi1 = (combinations(RQ,W) for W in range(1,NBPIS+1))
print(Combi3)
[]
Questions:
For Combi1, instead of:
[[('A',), ('B',), ('C',)], [('A', 'B'), ('A', 'C'), ('B', 'C')],
[('A', 'B', 'C')]]
how can I get this?
[('A'), ('B'), ('C'), ('A', 'B'), ('A', 'C'), ('B', 'C'), ('A', 'B',
'C')]
For Combi3, is it possible to get an array with different sizes? Instead of:
[[['A' 'A' 'A'] ['A' 'A' 'A'] ['A' 'B' 'C']]...
obtain this?
[[['A'] ['A'] ['A' 'B' 'C']]...
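Regarding the first follow-up question, one possible sketch (my own, not part of the original answer) is to flatten the per-length combinations with itertools.chain.from_iterable. Note that single elements still come back as 1-tuples like ('A',), and a ragged Combi3 would need an object-dtype array or plain Python lists rather than a regular NumPy array:
from itertools import chain, combinations

RQ = ['A', 'B', 'C']
NBPIS = 3

# One flat list with the combinations of every length from 1 to NBPIS.
Combi1 = list(chain.from_iterable(combinations(RQ, W) for W in range(1, NBPIS + 1)))
print(Combi1)
# [('A',), ('B',), ('C',), ('A', 'B'), ('A', 'C'), ('B', 'C'), ('A', 'B', 'C')]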

tfidfVectorizer on only one column in training set

I have a problem concerning the tfidfVectorizer.
My problem is that I have 3 columns: one is the text that needs to be vectorized, and the other two are already numbers, so I only need to vectorize one of them.
I have read that you need to vectorize your data after you have split it into training and test set, so I have split my data set like so:
from sklearn.model_selection import train_test_split

X = df[['cleaned_tweet_text', 'polarity', 'subjectivity']] # The Tweets
y = df['cyberbullying_type'] # The Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
It is the cleaned_tweet_text column that needs to be vectorized.
I have tried this (see below), but I am not sure it is the right way.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features = 1000)
X_train_tfidf = tfidf.fit_transform(X_train.cleaned_tweet_text)
X_test_tfidf = tfidf.transform(X_test.cleaned_tweet_text)
It does not give me an error, and if I print out X_train_tfidf I get this:
(0, 217) 0.41700972853730645
(0, 118) 0.16283369998713235
(0, 758) 0.16948694862672925
(0, 404) 0.20143376247898365
(0, 626) 0.4426572817169202
(0, 356) 0.20217167680038242
(0, 871) 0.4634256150008882
(0, 65) 0.3606189681792524
(0, 565) 0.38556256201243433
(1, 719) 0.29478675756557454
(1, 919) 0.30596230567496185
(1, 698) 0.36538974359723864
(1, 485) 0.816429056367109
(1, 118) 0.13936199719971182
(2, 342) 0.17134974750083107
(2, 256) 0.18449190025596335
(2, 110) 0.3604602574432005
(2, 290) 0.39210201833562014
(2, 648) 0.3538174461369334
(2, 161) 0.2742199778844052
(2, 251) 0.3864257748655211
(2, 128) 0.26063790594719993
(2, 599) 0.18251158997125277
(2, 123) 0.39339155686431243
(2, 360) 0.21729849596293152
Does that mean it works? Can I now put it into a classifier?
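Yes, that output is simply the sparse-matrix form of the TF-IDF weights ((row index, feature index) followed by the value), so the vectorizer worked. A rough sketch (an assumption on my part, not from the original post) of how the vectorized column could be recombined with the two numeric columns before fitting a classifier:
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression

# Stack the sparse TF-IDF matrix with the two numeric columns, column-wise.
X_train_full = hstack([X_train_tfidf, X_train[['polarity', 'subjectivity']].to_numpy()])
X_test_full = hstack([X_test_tfidf, X_test[['polarity', 'subjectivity']].to_numpy()])

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_full, y_train)
print(clf.score(X_test_full, y_test))
Any other scikit-learn classifier that accepts sparse input could be substituted for LogisticRegression here.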

How to merge pandas DF on imperfect match?

I'm trying to merge/join x and y dataframes based on an exact match of the company columns and a partial match of some degree on the name columns.
Other than looking at the values returned by SequenceMatcher(None, x_name, y_name).ratio(), which were always above .8 in my case, I haven't tried much that warrants mentioning.
x = pd.DataFrame([{'id': 1, 'name': 'Robert Jackson', 'company': 'Test inc.', 'tenure': 6},
{'id': 2, 'name': 'William Johnson', 'company': 'Test inc.', 'tenure': 6}]).set_index('id')
y = pd.DataFrame([{'id': 4, 'name': 'Bob Jackson', 'company': 'Test inc.', 'job': 'desk'},
{'id': 5, 'name': 'Willy Johnson', 'company': 'Test inc.', 'job': 'desk'}]).set_index('id')
goal = pd.DataFrame([{'x_id': 1, 'y_id': 4, 'x_name': 'Robert Jackson', 'y_name': 'Bob Jackson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'},
{'x_id': 2, 'y_id': 5, 'x_name': 'William Johnson', 'y_name': 'Willy Johnson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'}])
Is something like this plausible? I'd appreciate any feedback, thank you.
Great question! I'm following to see other answers, as I've been doing a lot of similar work lately. One (inefficient) method I've used is fuzzywuzzy with a score threshold.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    s = df_2[key2].tolist()
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    return df_1
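A hypothetical call with the x and y frames from the question might look like this (the default threshold of 90 can be too strict for pairs like 'Robert Jackson' / 'Bob Jackson', so a lower value is passed); note that fuzzy_merge mutates df_1, so a copy is passed, and that it only matches on one column here:
x_matched = fuzzy_merge(x.copy(), y, 'name', 'name', threshold=80)
goal = x_matched.merge(y, left_on='matches', right_on='name', suffixes=('_x', '_y'))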
The solution that I used was:
from difflib import SequenceMatcher
x['merge_name'] = x['name']
x['merge_comp'] = x['company']
for a, b in x[['name', 'company']].values:
    for ixb, (c,d) in enumerate(y[['name', 'company']].values):
        if SequenceMatcher(None,a,c).ratio() >= .8:
            y.loc[ixb,'merge_name'] = a
        if SequenceMatcher(None,b,d).ratio() == 1:
            y.loc[ixb,'merge_comp'] = b
goal = pd.merge(x,y, on=['merge_name', 'merge_comp'])
This function worked while passing an arbitrary number of columns:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=False, post_drop=True):
    if reset_index:
        df1.reset_index(inplace=True)
        df2.reset_index(inplace=True)
    flag = 0
    merge_columns = []
    r = len(columns)
    for f in range(r):
        df1[prefix + columns[flag]] = df1[columns[flag]]
        merge_columns.append(prefix + columns[flag])
        flag += 1
    flag = 0
    for f in range(r):
        for col_1 in df1[columns[flag]].values:
            for index, col_2 in enumerate(df2[columns[flag]].values):
                if SequenceMatcher(None, str(col_1), str(col_2)).ratio() >= ratios[flag]:
                    df2.loc[index, merge_columns[flag]] = col_1
        flag += 1
    df = pd.merge(df1, df2, on=merge_columns)
    if post_drop:
        df1.drop(columns=merge_columns, inplace=True)
        df2.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1], reset_index=True)
This function worked for passing exactly 2 columns/ratios:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=True, post_drop=True):
    df1_c = df1.copy()
    df2_c = df2.copy()
    if reset_index:
        df1_c.reset_index(inplace=True)
        df2_c.reset_index(inplace=True)
    df1_c[prefix + columns[0]] = df1_c[columns[0]]
    df1_c[prefix + columns[1]] = df1_c[columns[1]]
    merge_columns = [prefix + columns[0], prefix + columns[1]]
    for col_1, col_2 in df1_c[[columns[0], columns[1]]].values:
        for index, (col_3, col_4) in enumerate(df2_c[[columns[0], columns[1]]].values):
            if SequenceMatcher(None, str(col_1), str(col_3)).ratio() >= ratios[0]:
                df2_c.loc[index, merge_columns[0]] = col_1
            if SequenceMatcher(None, str(col_2), str(col_4)).ratio() >= ratios[1]:
                df2_c.loc[index, merge_columns[1]] = col_2
    df = pd.merge(df1_c, df2_c, on=merge_columns)
    if post_drop:
        df.drop(columns=merge_columns, inplace=True)
    return df
sm_merge(x,y,columns=['name', 'company'], ratios=[.8,1])

Tensorflow tf.data.Dataset API, dataset unzip function?

In tensorflow 1.12 there is the Dataset.zip function: documented here.
However, I was wondering if there is a dataset unzip function which will return back the original two datasets.
# NOTE: The following examples use `{ ... }` to represent the
# contents of a dataset.
a = { 1, 2, 3 }
b = { 4, 5, 6 }
c = { (7, 8), (9, 10), (11, 12) }
d = { 13, 14 }
# The nested structure of the `datasets` argument determines the
# structure of elements in the resulting dataset.
Dataset.zip((a, b)) == { (1, 4), (2, 5), (3, 6) }
Dataset.zip((b, a)) == { (4, 1), (5, 2), (6, 3) }
# The `datasets` argument may contain an arbitrary number of
# datasets.
Dataset.zip((a, b, c)) == { (1, 4, (7, 8)),
                            (2, 5, (9, 10)),
                            (3, 6, (11, 12)) }
# The number of elements in the resulting dataset is the same as
# the size of the smallest dataset in `datasets`.
Dataset.zip((a, d)) == { (1, 13), (2, 14) }
I would like to have the following
dataset = Dataset.zip((a, d)) == { (1, 13), (2, 14) }
a, d = dataset.unzip()
My workaround was to just use map, not sure if there might be interest in a syntax sugar function for unzip later though.
a = dataset.map(lambda a, b: a)
b = dataset.map(lambda a, b: b)
TensorFlow's get_single_element() is finally available, and it can be used to unzip datasets (as asked in the question above).
This avoids the need to generate and use an iterator via .map() or iter() (which could be costly for big datasets).
get_single_element() returns a tensor (or a tuple or dict of tensors) encapsulating all the members of the dataset. We need to pass all the members of the dataset batched into a single element.
This can be used to get features as a tensor-array, or features and labels as a tuple or dictionary (of tensor-arrays) depending upon how the original dataset was created.
import tensorflow as tf
a = [ 1, 2, 3 ]
b = [ 4, 5, 6 ]
c = [ (7, 8), (9, 10), (11, 12) ]
d = [ 13, 14 ]
# Creating datasets from lists
ads = tf.data.Dataset.from_tensor_slices(a)
bds = tf.data.Dataset.from_tensor_slices(b)
cds = tf.data.Dataset.from_tensor_slices(c)
dds = tf.data.Dataset.from_tensor_slices(d)
list(tf.data.Dataset.zip((ads, bds)).as_numpy_iterator()) == [ (1, 4), (2, 5), (3, 6) ] # True
list(tf.data.Dataset.zip((bds, ads)).as_numpy_iterator()) == [ (4, 1), (5, 2), (6, 3) ] # True
# Let's zip and unzip ads and dds
x = tf.data.Dataset.zip((ads, dds))
xa, xd = tf.data.Dataset.get_single_element(x.batch(len(x)))
xa = list(xa.numpy())
xd = list(xd.numpy())
print(xa, xd) # [1,2] [13, 14] # notice how xa is now different from a because ads was curtailed when zip was done above.
d == xd # True
Building on Ouwen Huang's answer, this function seems to work for arbitrary datasets:
def split_datasets(dataset):
    tensors = {}
    names = list(dataset.element_spec.keys())
    for name in names:
        tensors[name] = dataset.map(lambda x: x[name])
    return tensors
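For example (a hypothetical dataset, not from the original answer), split_datasets() expects elements that are dicts, such as a dataset built from a dict of tensors:
# Hypothetical usage: each element of ds is a dict, so element_spec has .keys().
ds = tf.data.Dataset.from_tensor_slices({"x": [1, 2, 3], "y": [4, 5, 6]})
parts = split_datasets(ds)
print(list(parts["x"].as_numpy_iterator()))  # [1, 2, 3]
print(list(parts["y"].as_numpy_iterator()))  # [4, 5, 6]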
I have written a more general unzip function for tf.data.Dataset pipelines, which also handles the recursive case where a pipeline has multiple levels of zipping.
import tensorflow as tf
def tfdata_unzip(
    tfdata: tf.data.Dataset,
    *,
    recursive: bool = False,
    eager_numpy: bool = False,
    num_parallel_calls: int = tf.data.AUTOTUNE,
):
    """
    Unzip a zipped tf.data pipeline.

    Args:
        tfdata: the :py:class:`tf.data.Dataset` to unzip.
        recursive: Set to ``True`` to recursively unzip
            multiple layers of zipped pipelines. Defaults to ``False``.
        eager_numpy: Set this to ``True`` to return Python lists
            of primitive types or :py:class:`numpy.array` objects.
            Defaults to ``False``.
        num_parallel_calls: The level of parallelism to use each time
            we ``map()`` over a :py:class:`tf.data.Dataset`.

    Returns:
        A Python list of either :py:class:`tf.data.Dataset` objects or NumPy arrays.
    """
    if isinstance(tfdata.element_spec, tf.TensorSpec):
        if eager_numpy:
            return list(tfdata.as_numpy_iterator())
        return tfdata

    def tfdata_map(i: int) -> tf.data.Dataset:
        return tfdata.map(
            lambda *cols: cols[i],
            deterministic=True,
            num_parallel_calls=num_parallel_calls,
        )

    if isinstance(tfdata.element_spec, tuple):
        num_columns = len(tfdata.element_spec)
        if recursive:
            return [
                tfdata_unzip(
                    tfdata_map(i),
                    recursive=recursive,
                    eager_numpy=eager_numpy,
                    num_parallel_calls=num_parallel_calls,
                )
                for i in range(num_columns)
            ]
        else:
            return [tfdata_map(i) for i in range(num_columns)]

    raise ValueError(
        "Unknown tf.data.Dataset element_spec: " + str(tfdata.element_spec)
    )
Here is how tfdata_unzip() works, given these example datasets:
>>> import numpy as np
>>> baby = tf.data.Dataset.from_tensor_slices([
        np.array([1,2]),
        np.array([3,4]),
        np.array([5,6]),
    ])
>>> baby.element_spec
TensorSpec(shape=(2,), dtype=tf.int64, name=None)
>>> parent = tf.data.Dataset.zip((baby, baby))
>>> parent.element_spec
(TensorSpec(shape=(2,), dtype=tf.int64, name=None),
TensorSpec(shape=(2,), dtype=tf.int64, name=None))
>>> grandparent = tf.data.Dataset.zip((parent, parent))
>>> grandparent.element_spec
((TensorSpec(shape=(2,), dtype=tf.int64, name=None),
TensorSpec(shape=(2,), dtype=tf.int64, name=None)),
(TensorSpec(shape=(2,), dtype=tf.int64, name=None),
TensorSpec(shape=(2,), dtype=tf.int64, name=None)))
This is what tfdata_unzip() returns on the above baby, parent, and grandparent datasets:
>>> tfdata_unzip(baby)
<TensorSliceDataset shapes: (2,), types: tf.int64>
>>> tfdata_unzip(parent)
[<ParallelMapDataset shapes: (2,), types: tf.int64>,
<ParallelMapDataset shapes: (2,), types: tf.int64>]
>>> tfdata_unzip(grandparent)
[<ParallelMapDataset shapes: ((2,), (2,)), types: (tf.int64, tf.int64)>,
<ParallelMapDataset shapes: ((2,), (2,)), types: (tf.int64, tf.int64)>]
>>> tfdata_unzip(grandparent, recursive=True)
[[<ParallelMapDataset shapes: (2,), types: tf.int64>,
<ParallelMapDataset shapes: (2,), types: tf.int64>],
[<ParallelMapDataset shapes: (2,), types: tf.int64>,
<ParallelMapDataset shapes: (2,), types: tf.int64>]]
>>> tfdata_unzip(grandparent, recursive=True, eager_numpy=True)
[[[array([1, 2]), array([3, 4]), array([5, 6])],
[array([1, 2]), array([3, 4]), array([5, 6])]],
[[array([1, 2]), array([3, 4]), array([5, 6])],
[array([1, 2]), array([3, 4]), array([5, 6])]]]

Cumulative Mode in numpy

I'd like to efficiently calculate a cumulative mode along an axis in numpy.
e.g.
>>> arr = np.random.RandomState(3).randint(3, size = (2, 5))
>>> arr
array([[2, 0, 1, 0, 0],
[0, 1, 1, 2, 1]])
>>> assert np.array_equal(cummode(arr, axis = 1), [[2,2,2,0,0],[0,0,1,1,1]])
Is there an efficient way to do this? I guess it can handle ties by returning the first number to achieve the given count.
Here's a pure Python function that works on a list, or any iterable:
from collections import defaultdict
def cummode(alist):
    dd = defaultdict(int)
    mode = [(None, 0)]
    for i in alist:
        dd[i] += 1
        if dd[i] > mode[-1][1]:
            newmode = (i, dd[i])
        else:
            newmode = mode[-1]
        mode.append(newmode)
    mode.pop(0)
    return mode, dd
mode,dd = cummode([0,1,3,6,1,2,3,3,2,1,10,0])
print(dd)
print(mode)
which for the test case, produces
defaultdict(<type 'int'>, {0: 2, 1: 3, 2: 2, 3: 3, 6: 1, 10: 1})
[(0, 1), (0, 1), (0, 1), (0, 1), (1, 2), (1, 2), (1, 2), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
A defaultdict is a clean, fast way of accumulating values when you don't know all the keys beforehand. For small lists and arrays it probably beats a NumPy-based version, even with weave, simply because it does not incur the overhead of creating arrays. But with large ones it will probably lag. Plus, I haven't generalized it to handle multiple values (rows).
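For reference, a small sketch (my addition, not part of the original answer) of applying cummode row-wise to the 2-D example from the question, keeping only the running mode values and dropping the counts:
import numpy as np

arr = np.array([[2, 0, 1, 0, 0],
                [0, 1, 1, 2, 1]])

# cummode() returns (list of (value, count) pairs, the final counts dict);
# keep just the running mode value for each position.
result = np.array([[value for value, count in cummode(row)[0]] for row in arr])
print(result)
# [[2 2 2 0 0]
#  [0 0 1 1 1]]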
Ok, it's not fully general in that it doesn't work along any axis, but for now I've made a version that works using scipy-weave.
import numpy as np
from scipy import weave

def cummode(x, axis = 1):
    assert x.ndim == 2 and axis == 1, 'Only implemented for a special case!'
    all_values, element_ids = np.unique(x, return_inverse=True)
    n_unique = len(all_values)
    element_ids = element_ids.reshape(x.shape)
    result = np.zeros(x.shape, dtype = int)
    counts = np.zeros(n_unique, dtype = int)
    code = """
    int n_samples = Nelement_ids[0];
    int n_events = Nelement_ids[1];
    for (int i=0; i<n_samples; i++){
        int maxcount = 0;
        int maxel = -1;
        for (int k=0; k<n_unique; k++)
            counts[k] = 0;
        for (int j=0; j<n_events; j++){
            int ix = i*n_events+j;
            int k = element_ids[ix];
            counts[k]+=1;
            if (counts[k] > maxcount){
                maxcount = counts[k];
                maxel = k;
            }
            result[ix]=maxel;
        }
    }
    """
    weave.inline(code, ['element_ids', 'result', 'n_unique', 'counts'], compiler = 'gcc')
    mode_values = all_values[result]
    return mode_values
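Note that scipy.weave only supports Python 2 and has since been removed from SciPy, so for completeness here is a rough pure-NumPy/Python port of the same kernel (my own sketch, not part of the original answer):
import numpy as np

def cummode_np(x, axis=1):
    # Pure-Python/NumPy port of the weave kernel above; same special case only.
    assert x.ndim == 2 and axis == 1, 'Only implemented for a special case!'
    all_values, element_ids = np.unique(x, return_inverse=True)
    element_ids = element_ids.reshape(x.shape)
    result = np.zeros(x.shape, dtype=int)
    for i, row in enumerate(element_ids):
        counts = np.zeros(len(all_values), dtype=int)
        maxcount, maxel = 0, -1
        for j, k in enumerate(row):
            counts[k] += 1
            if counts[k] > maxcount:
                maxcount, maxel = counts[k], k
            result[i, j] = maxel
    return all_values[result]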