Pandas, groupby include number of rows grouped in each row - pandas

Have any way to use
df = pd.read_excel(r'a.xlsx')
df2 = df.groupby(by=["col"], as_index=False).mean()
Include new column with number of rows grouped in each row?

in absence of sample data, I'm assuming you have multiple numeric columns
can use apply() to then calculate all means and append len() to this series
df = pd.DataFrame(
{
"col": np.random.choice(list("ABCD"), 200),
"val": np.random.uniform(1, 5, 200),
"val2": np.random.uniform(5, 10, 200),
}
)
df2 = df.groupby(by=["col"], as_index=False).apply(
lambda d: d.select_dtypes("number").mean().append(pd.Series({"len": len(d)}))
)
df2
col
val
val2
len
0
A
3.13064
7.63837
42
1
B
3.1057
7.50656
44
2
C
3.0111
7.82628
54
3
D
3.20709
7.32217
60
comment code
def w_avg(df, values, weights, exp):
d = df[values]
w = df[weights] ** exp
return (d * w).sum() / w.sum()
dfg1 = pd.DataFrame(
{
"Jogador": np.random.choice(list("ABCD"), 200),
"Evento": np.random.choice(list("XYZ"),200),
"Rating Calculado BW": np.random.uniform(1, 5, 200),
"Lances": np.random.uniform(5, 10, 200),
}
)
dfg = dfg1.groupby(by=["Jogador", "Evento"]).apply(
lambda dfg1: dfg1.select_dtypes("number")
.agg(lambda d: w_avg(dfg1, "Rating Calculado BW", "Lances", 1))
.append(pd.Series({"len": len(dfg1)}))
)
dfg

Related

How to update a pandas column

Given the following dataframe
col1 col2
1 ('A->B', 'B->C')
2 ('A->D', 'D->C', 'C->F')
3 ('A->K', 'K->M', 'M->P')
...
I want to convert this to the following format
col1 col2
1 'A-B-C'
2 'A-D-C-F'
3 'A-K-M-P'
...
Each sequence shows an arc within a path. Hence, the sequence is like (a,b), (b,c), (c,d) ...
def merge_values(val):
val = [x.split('->') for x in val]
out = []
for char in val:
out.append(char[0])
out.append(val[-1][1])
return '-'.join(out)
df['col2'] = df['col2'].apply(merge_values)
print(df)
Output:
col1 col2
0 1 A-B-C
1 2 A-D-C-F
2 3 A-K-M-P
Given
df = pd.DataFrame({
'col1': [1, 2, 3],
'col2': [
('A->B', 'B->C'),
('A->D', 'D->C', 'C->F'),
('A->K', 'K->M', 'M->P'),
],
})
You can do:
def combine(t, old_sep='->', new_sep='-'):
if not t: return ''
if type(t) == str: t = [t]
tokens = [x.partition(old_sep)[0] for x in t]
tokens += t[-1].partition(old_sep)[-1]
return new_sep.join(tokens)
df['col2'] = df['col2'].apply(combine)

How to index the unique value count in numpy? [duplicate]

Consider the following lists short_list and long_list
short_list = list('aaabaaacaaadaaac')
np.random.seed([3,1415])
long_list = pd.DataFrame(
np.random.choice(list(ascii_letters),
(10000, 2))
).sum(1).tolist()
How do I calculate the cumulative count by unique value?
I want to use numpy and do it in linear time. I want this to compare timings with my other methods. It may be easiest to illustrate with my first proposed solution
def pir1(l):
s = pd.Series(l)
return s.groupby(s).cumcount().tolist()
print(np.array(short_list))
print(pir1(short_list))
['a' 'a' 'a' 'b' 'a' 'a' 'a' 'c' 'a' 'a' 'a' 'd' 'a' 'a' 'a' 'c']
[0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 1]
I've tortured myself trying to use np.unique because it returns a counts array, an inverse array, and an index array. I was sure I could these to get at a solution. The best I got is in pir4 below which scales in quadratic time. Also note that I don't care if counts start at 1 or zero as we can simply add or subtract 1.
Below are some of my attempts (none of which answer my question)
%%cython
from collections import defaultdict
def get_generator(l):
counter = defaultdict(lambda: -1)
for i in l:
counter[i] += 1
yield counter[i]
def pir2(l):
return [i for i in get_generator(l)]
def pir3(l):
return [i for i in get_generator(l)]
def pir4(l):
unq, inv = np.unique(l, 0, 1, 0)
a = np.arange(len(unq))
matches = a[:, None] == inv
return (matches * matches.cumsum(1)).sum(0).tolist()
setup
short_list = np.array(list('aaabaaacaaadaaac'))
functions
dfill takes an array and returns the positions where the array changes and repeats that index position until the next change.
# dfill
#
# Example with short_list
#
# 0 0 0 3 4 4 4 7 8 8 8 11 12 12 12 15
# [ a a a b a a a c a a a d a a a c]
#
# Example with short_list after sorting
#
# 0 0 0 0 0 0 0 0 0 0 0 0 12 13 13 15
# [ a a a a a a a a a a a a b c c d]
argunsort returns the permutation necessary to undo a sort given the argsort array. The existence of this method became know to me via this post.. With this, I can get the argsort array and sort my array with it. Then I can undo the sort without the overhead of sorting again.
cumcount will take an array sort it, find the dfill array. An np.arange less dfill will give me cumulative count. Then I un-sort
# cumcount
#
# Example with short_list
#
# short_list:
# [ a a a b a a a c a a a d a a a c]
#
# short_list.argsort():
# [ 0 1 2 4 5 6 8 9 10 12 13 14 3 7 15 11]
#
# Example with short_list after sorting
#
# short_list[short_list.argsort()]:
# [ a a a a a a a a a a a a b c c d]
#
# dfill(short_list[short_list.argsort()]):
# [ 0 0 0 0 0 0 0 0 0 0 0 0 12 13 13 15]
#
# np.range(short_list.size):
# [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
#
# np.range(short_list.size) -
# dfill(short_list[short_list.argsort()]):
# [ 0 1 2 3 4 5 6 7 8 9 10 11 0 0 1 0]
#
# unsorted:
# [ 0 1 2 0 3 4 5 0 6 7 8 0 9 10 11 1]
foo function recommended by #hpaulj using defaultdict
div function recommended by #Divakar (old, I'm sure he'd update it)
code
def dfill(a):
n = a.size
b = np.concatenate([[0], np.where(a[:-1] != a[1:])[0] + 1, [n]])
return np.arange(n)[b[:-1]].repeat(np.diff(b))
def argunsort(s):
n = s.size
u = np.empty(n, dtype=np.int64)
u[s] = np.arange(n)
return u
def cumcount(a):
n = a.size
s = a.argsort(kind='mergesort')
i = argunsort(s)
b = a[s]
return (np.arange(n) - dfill(b))[i]
def foo(l):
n = len(l)
r = np.empty(n, dtype=np.int64)
counter = defaultdict(int)
for i in range(n):
counter[l[i]] += 1
r[i] = counter[l[i]]
return r - 1
def div(l):
a = np.unique(l, return_counts=1)[1]
idx = a.cumsum()
id_arr = np.ones(idx[-1],dtype=int)
id_arr[0] = 0
id_arr[idx[:-1]] = -a[:-1]+1
rng = id_arr.cumsum()
return rng[argunsort(np.argsort(l))]
demonstration
cumcount(short_list)
array([ 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 1])
time testing
code
functions = pd.Index(['cumcount', 'foo', 'foo2', 'div'], name='function')
lengths = pd.RangeIndex(100, 1100, 100, 'array length')
results = pd.DataFrame(index=lengths, columns=functions)
from string import ascii_letters
for i in lengths:
a = np.random.choice(list(ascii_letters), i)
for j in functions:
results.set_value(
i, j,
timeit(
'{}(a)'.format(j),
'from __main__ import a, {}'.format(j),
number=1000
)
)
results.plot()
Here's a vectorized approach using custom grouped range creating function and np.unique for getting the counts -
def grp_range(a):
idx = a.cumsum()
id_arr = np.ones(idx[-1],dtype=int)
id_arr[0] = 0
id_arr[idx[:-1]] = -a[:-1]+1
return id_arr.cumsum()
count = np.unique(A,return_counts=1)[1]
out = grp_range(count)[np.argsort(A).argsort()]
Sample run -
In [117]: A = list('aaabaaacaaadaaac')
In [118]: count = np.unique(A,return_counts=1)[1]
...: out = grp_range(count)[np.argsort(A).argsort()]
...:
In [119]: out
Out[119]: array([ 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 1])
For getting the count, few other alternatives could be proposed with focus on performance -
np.bincount(np.unique(A,return_inverse=1)[1])
np.bincount(np.fromstring('aaabaaacaaadaaac',dtype=np.uint8)-97)
Additionally, with A containing single-letter characters, we could get the count simply with -
np.bincount(np.array(A).view('uint8')-97)
Besides defaultdict there are a couple of other counters. Testing a slightly simpler case:
In [298]: from collections import defaultdict
In [299]: from collections import defaultdict, Counter
In [300]: def foo(l):
...: counter = defaultdict(int)
...: for i in l:
...: counter[i] += 1
...: return counter
...:
In [301]: short_list = list('aaabaaacaaadaaac')
In [302]: foo(short_list)
Out[302]: defaultdict(int, {'a': 12, 'b': 1, 'c': 2, 'd': 1})
In [303]: Counter(short_list)
Out[303]: Counter({'a': 12, 'b': 1, 'c': 2, 'd': 1})
In [304]: arr=[ord(i)-ord('a') for i in short_list]
In [305]: np.bincount(arr)
Out[305]: array([12, 1, 2, 1], dtype=int32)
I constructed arr because bincount only works with ints.
In [306]: timeit np.bincount(arr)
The slowest run took 82.46 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 5.63 µs per loop
In [307]: timeit Counter(arr)
100000 loops, best of 3: 13.6 µs per loop
In [308]: timeit foo(arr)
100000 loops, best of 3: 6.49 µs per loop
I'm guessing it would hard to improve on pir2 based on default_dict.
Searching and counting like this are not a strong area for numpy.

How to modify dataframe based on column values

I want to add relationships to column 'relations' based on rel_list. Specifically, for each tuple, i.e. ('a', 'b'), I want to replace the relationships column value '' with 'b' in the first row, but no duplicate, meaning that for the 2nd row, don't replace '' with 'a', since they are considered as duplicated. The following code doesn't work fully correct:
import pandas as pd
data = {
"names": ['a', 'b', 'c', 'd'],
"ages": [50, 40, 45, 20],
"relations": ['', '', '', '']
}
rel_list = [('a', 'b'), ('a', 'c'), ('c', 'd')]
df = pd.DataFrame(data)
for rel_tuple in rel_list:
head = rel_tuple[0]
tail = rel_tuple[1]
df.loc[df.names == head, 'relations'] = tail
print(df)
The current result of df is:
names ages relations
0 a 50 c
1 b 40
2 c 45 d
3 d 20
However, the correct one is:
names ages relations
0 a 50 b
0 a 50 c
1 b 40
2 c 45 d
3 d 20
There are new rows that need to be added. The 2nd row in this case, like above. How to do that?
You can craft a dataframe and merge:
(df.drop('relations', axis=1)
.merge(pd.DataFrame(rel_list, columns=['names', 'relations']),
on='names',
how='outer'
)
# .fillna('') # uncomment to replace NaN with empty string
)
Output:
names ages relations
0 a 50 b
1 a 50 c
2 b 40 NaN
3 c 45 d
4 d 20 NaN
Instead of updating df you can create a new one and add relations row by row:
import pandas as pd
data = {
"names": ['a', 'b', 'c', 'd'],
"ages": [50, 40, 45, 20],
"relations": ['', '', '', '']
}
rel_list = [('a', 'b'), ('a', 'c'), ('c', 'd')]
df = pd.DataFrame(data)
new_df = pd.DataFrame(data)
new_df.loc[:, 'relations'] = ''
for head, tail in rel_list:
new_row = df[df.names == head]
new_row.loc[:,'relations'] = tail
new_df = new_df.append(new_row)
print(new_df)
Output:
names ages relations
0 a 50
1 b 40
2 c 45
3 d 20
0 a 50 b
0 a 50 c
2 c 45 d
Then, if needed, in the end you can delete all rows without value in 'relations':
new_df = new_df[new_df['relations']!='']

using agg to flatten a series of lists in pandas

I have a number of multi-index columns each with a list of tuples that I want to flatten (the list, not the tuples) but I'm struggling with it. Here's what I have:
df = pd.DataFrame([[[(1,'a')],[(6,'b')],np.nan,np.nan],[[(5,'d'),(10,'e')],np.nan,np.nan,[(8,'c')]]])
df.columns = pd.MultiIndex.from_tuples([('a', 0), ('a', 1), ('b', 0), ('b', 1)])
>>> df
a b
0 1 0 1
0 [(1, a)] [(6, b)] NaN NaN
1 [(5, d), (10, e)] NaN NaN [(8, c)]
Desired result:
>>> df
a b
0 [(1, a), (6, b)] [NaN, NaN]
1 [(5, d), (10, e), NaN] [NaN, (8, c)]
How do I do this? From this related question, I tried the following:
>>> df.stack(level=1).groupby(level=[0]).agg(lambda x: np.array(list(x)).flatten())
a b
0 a b
1 a b
>>> df.stack(level=1).groupby(level=[0]).agg(lambda x: np.concatenate(list(x)))
...
Exception: Must produce aggregated value
Here's a way to do:
# taken from https://stackoverflow.com/questions/12472338/flattening-a-list-recursively
def flatten(S):
if S == []:
return S
if isinstance(S[0], list):
return flatten(S[0]) + flatten(S[1:])
return S[:1] + flatten(S[1:])
# reshape the data for get the desired structure
df2 = (df
.unstack()
.reset_index()
.drop('level_1', 1)
.groupby(['level_0', 'level_2'])[0]
.apply(list).apply(flatten).unstack().T)
df2.index.name = None
df2.columns.name = None
print(df2)
a b
0 [(1, a), (6, b)] [na, na]
1 [(5, d), (10, e), na] [na, (8, c)]
Found a one-liner:
Using the flatten custom function given by #YOLO
>>> df.stack(level=1).groupby(level=0).agg(list).applymap(flatten)
a b
0 [(1, a), (6, b)] [nan, nan]
1 [(5, d), (10, e), nan] [nan, (8, c)]
where
def flatten(S):
if S == []:
return S
if isinstance(S[0], list):
return flatten(S[0]) + flatten(S[1:])
return S[:1] + flatten(S[1:])

Selecting values with Pandas multiindex using lists of tuples

I have a DataFrame with a MultiIndex with 3 levels:
id foo bar col1
0 1 a -0.225873
2 a -0.275865
2 b -1.324766
3 1 a -0.607122
2 a -1.465992
2 b -1.582276
3 b -0.718533
7 1 a -1.904252
2 a 0.588496
2 b -1.057599
3 a 0.388754
3 b -0.940285
Preserving the id index level, I want to sum along the foo and bar levels, but with different values for each id.
For example, for id = 0 I want to sum over foo = [1] and bar = [["a", "b"]], for id = 3 I want to sum over foo = [2] and bar = [["a", "b"]], and for id = 7 I want to sum over foo = [[1,2]] and bar = [["a"]]. Giving the result:
id col1
0 -0.225873
3 -3.048268
7 -1.315756
I have been trying something along these lines:
df.loc(axis = 0)[[(0, 1, ["a","b"]), (3, 2, ["a","b"]), (7, [1,2], "a")].sum()
Not sure if this is even possible. Any elegant solution (possibly removing the MultiIndex?) would be much appreciated!
The list of tuples is not the problem. The fact that each tuple does not correspond to a single index is the problem (Since a list isn't a valid key). If you want to index a Dataframe like this, you need to expand the lists inside each tuple to their own entries.
Define your options like the following list of dictionaries, then transform using a list comprehension and index using all individual entries.
d = [
{
'id': 0,
'foo': [1],
'bar': ['a', 'b']
},
{
'id': 3,
'foo': [2],
'bar': ['a', 'b']
},
{
'id': 7,
'foo': [1, 2],
'bar': ['a']
},
]
all_idx = [
(el['id'], i, j)
for el in d
for i in el['foo']
for j in el['bar']
]
# [(0, 1, 'a'), (0, 1, 'b'), (3, 2, 'a'), (3, 2, 'b'), (7, 1, 'a'), (7, 2, 'a')]
df.loc[all_idx].groupby(level=0).sum()
col1
id
0 -0.225873
3 -3.048268
7 -1.315756
A more succinct solution using slicers:
sections = [(0, 1, slice(None)), (3, 2, slice(None)), (7, slice(1,2), "a")]
pd.concat(df.loc[s] for s in sections).groupby("id").sum()
col1
id
0 -0.225873
3 -3.048268
7 -1.315756
Two things to note:
This may be less memory-efficient than the accepted answer since pd.concat creates a new DataFrame.
The slice(None)'s are mandatory, otherwise the index columns of the df.loc[s]'s mismatch when calling pd.concat.