Multi-column label-encoding: Print mappings - pandas

The following code can be used to transform strings into categorical labels:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame([['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'H'],
                   ['A', 'E', 'H', 'F', 'G', 'I', 'K', '', '', ''],
                   ['A', 'C', 'I', 'F', 'H', 'G', '', '', '', '']],
                  columns=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10'])
pd.DataFrame(columns=df.columns,
             data=LabelEncoder().fit_transform(df.values.flatten()).reshape(df.shape))
   A1  A2  A3  A4  A5  A6  A7  A8  A9  A10
0   1   2   3   4   5   6   7   9  10    8
1   1   5   8   6   7   9  10   0   0    0
2   1   3   9   6   8   7   0   0   0    0
Question:
How can I query the mappings (it appears they are sorted alphabetically)?
I.e. a list like:
A: 1
B: 2
C: 3
...
I: 9
K: 10
Thank you!

Yes, it's possible if you define the LabelEncoder separately and query its classes_ attribute afterwards.
import numpy as np

le = LabelEncoder()
data = le.fit_transform(df.values.flatten())
dict(zip(le.classes_[1:], np.arange(1, len(le.classes_))))
{'A': 1,
'B': 2,
'C': 3,
'D': 4,
'E': 5,
'F': 6,
'G': 7,
'H': 8,
'I': 9,
'K': 10}
The classes_ attribute stores the classes in the order in which they were encoded.
le.classes_
array(['', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K'], dtype=object)
So you may safely assume the first element of classes_ is encoded as 0, the second as 1, and so on.
To reverse encodings, use le.inverse_transform.
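For completeness, a minimal sketch of the reverse direction, reusing the le and data defined above (the names decoded and restored are just illustrative):
decoded = le.inverse_transform(data)                 # flat array of the original labels
restored = pd.DataFrame(decoded.reshape(df.shape), columns=df.columns)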

You can also use the transform method of LabelEncoder:
le = LabelEncoder()
le.fit(df.values.flatten())
dict(zip(df.values.flatten(), le.transform(df.values.flatten())))
Out[137]:
{'': 0,
'A': 1,
'B': 2,
'C': 3,
'D': 4,
'E': 5,
'F': 6,
'G': 7,
'H': 8,
'I': 9,
'K': 10}
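Side note: since a dict keeps only one entry per key, the duplicate pairs above are redundant; the same mapping can be built directly from the unique classes:
mapping = dict(zip(le.classes_, le.transform(le.classes_)))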

Related

Combine 2 different sized arrays element-wise based on index pairing array

Say we have two arrays of unique values:
a = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])            # any values are possible
b = np.array([0, 11, 12, 13, 14, 15, 16, 17, 18, 19])   # sorted values are for demonstration
where a[0] corresponds to b[0], a[1] to b[1], a[2] to b[2], and so on.
Then, due to some circumstances, we randomly lost some of the data and picked up noise elements in both a and b. Now the 'useful data' in a and b is 'eroded' like this:
a = np.array([0, 1, 313, 2, 3, 4, 5, 934, 6, 8, 9, 730, 241, 521])
b = np.array([112, 514, 11, 13, 16, 955, 17, 18, 112])
The noise elements have negligible probability of coinciding with any of the 'useful data'. So, by searching for them, we can find the surviving values and define the 'index pairing array':
cor_tab = np.array([[1,2], [4,3], [8,4], [9,7]])
which, when applied, yields the remaining pairs of 'useful data':
np.column_stack((a[cor_tab[:,0]], b[cor_tab[:,1]]))
array([[1, 11],
       [3, 13],
       [6, 16],
       [8, 18]])
The question: given the 'eroded' a and b, how can they be combined into a numpy array such that:
values indexed in cor_tab are paired in the same row,
lost values are treated as -1,
noise as 'don't care', and
the array looks like this:
[[ -1 112],
[ 0 514],
[ 1 11],
[313 -1],
[ 2 -1],
[ 3 13],
[ 4 -1],
[ 5 -1],
[934 -1],
[ 6 16],
[ -1 955],
[ -1 17],
[ 8 18],
[ 9 -1],
[730 -1],
[241 -1],
[521 112]]
where the 'useful data' pairs are at row indices 2, 5, 9, 12?
Initially I solved this in a dubious way:
import numpy as np

def combine(aa, bb, t):
    c0 = np.empty((0), int)
    c1 = np.empty((0), int)
    # add -1 & 'noise' at the left side:
    if t[0][0] > t[0][1]:
        c0 = np.append(c0, aa[: t[0][0]])
        c1 = np.append(c1, [np.append([-1] * (t[0][0] - t[0][1]), bb[: t[0][1]])])
    else:
        c0 = np.append(c0, [np.append([-1] * (t[0][1] - t[0][0]), aa[: t[0][0]])])
        c1 = np.append(c1, bb[: t[0][1]])
    ind_compenstr = t[0][0] - t[0][1]  # 'index compensator'
    for i, ii in enumerate(t):
        x = ii[0] - ii[1] - ind_compenstr
        # add -1 & 'noise' in the middle:
        if x > 0:
            c0 = np.append(c0, [aa[ii[0]-x:ii[0]]])
            c1 = np.append(c1, [[-1] * x])
        elif x == 0:
            c0 = np.append(c0, [aa[ii[0]-x:ii[0]]])
            c1 = np.append(c1, [bb[ii[1]-x:ii[1]]])
        else:
            x = abs(x)
            c0 = np.append(c0, [[-1] * x])
            c1 = np.append(c1, [bb[ii[1]-x:ii[1]]])
        # add useful elements:
        c0 = np.append(c0, aa[ii[0]])
        c1 = np.append(c1, bb[ii[1]])
        ind_compenstr += x
    # add -1 & 'noise' at the right side:
    l0 = len(aa) - t[-1][0]
    l1 = len(bb) - t[-1][1]
    if l0 > l1:
        c0 = np.append(c0, aa[t[-1][0] + 1:])
        c1 = np.append(c1, [np.append(bb[t[-1][1] + 1:], [-1] * (l0 - l1))])
    else:
        c0 = np.append(c0, [np.append(aa[t[-1][0] + 1:], [-1] * (l1 - l0))])
        c1 = np.append(c1, bb[t[-1][1] + 1:])
    return np.array([c0, c1])
But below I suggest another solution.
It is difficult to understand exactly what the question wants, but IIUC: at first, we need the column count of the expected array, which is the size of the union of the two arrays' values (np.union1d); then we create an array of that size filled with -1 (np.full). Using np.searchsorted, we get the indices of one array's values within the other, and values not contained in the other array are identified with np.in1d in invert mode. So we can achieve the goal by indexing as follows (note: the inline comments below appear to be based on smaller sorted example arrays, roughly a = [0 1 2 3 4 5 6 8 9] and b = [1 3 6 7 8], rather than the 'eroded' arrays from the question, since np.searchsorted requires sorted input):
union_ = np.union1d(a, b)
# [0 1 2 3 4 5 6 7 8 9]
res = np.full((2, union_.size), -1)
# [[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
# [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]]
arange_row_ids = np.arange(union_.size)
# [0 1 2 3 4 5 6 7 8 9]
col_inds = np.searchsorted(a, b)[np.in1d(b, a, invert=True)]
# np.searchsorted(a, b) ---> [1 3 6 7 7]
# np.in1d(b, a, invert=True) ---> [False False False True False]
# [7]
res[0, np.delete(arange_row_ids, col_inds + np.arange(col_inds.size))] = a
# np.delete(arange_row_ids, col_inds + np.arange(col_inds.size)) ---> [0 1 2 3 4 5 6 8 9]
# [[ 0 1 2 3 4 5 6 -1 8 9]
# [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]]
col_inds = np.searchsorted(b, a)[np.in1d(a, b, invert=True)]
# np.searchsorted(b, a) ---> [0 0 1 1 2 2 2 4 5]
# np.in1d(a, b, invert=True) ---> [ True False True False True True False False True]
# [0 1 2 2 5]
res[1, np.delete(arange_row_ids, col_inds + np.arange(col_inds.size))] = b
# np.delete(arange_row_ids, col_inds + np.arange(col_inds.size)) ---> [1 3 6 7 8]
# [[ 0 1 2 3 4 5 6 -1 8 9]
# [-1 1 -1 3 -1 -1 6 7 8 -1]]
The question is not clear enough to tell whether this is the expected answer, but I think it is a helpful starting point for further modification as needed.
Here's a partially vectorized solution:
import numpy as np

# this function is from Divakar's answer at
# https://stackoverflow.com/questions/38619143/convert-python-sequence-to-numpy-array-filling-missing-values
def boolean_indexing(v):
    lens = np.array([len(item) for item in v])
    # note the [::-1]: shorter rows are padded on the LEFT, so the paired
    # elements of each chunk stay aligned at its right edge
    mask = lens[:, None] > np.arange(lens.max())[::-1]
    out = np.full(mask.shape, -1, dtype=int)
    out[mask] = np.concatenate(v)
    return out
# 2 arrays with eroded useful data and the index pairing array:
a = np.array([0, 1, 313, 2, 3, 4, 5, 934, 6, 8, 9, 730, 241, 521])
b = np.array([112, 514, 11, 13, 16, 955, 17, 18, 112])
cor_tab = np.array([[1,2], [4,3], [8,4], [9,7]])
# split each array at the corresponding indices from `cor_tab`:
aa = np.split(a, cor_tab[:,0] + 1)
bb = np.split(b, cor_tab[:,1] + 1)
# initiate 2 flat empty arrays:
aaa = np.empty((0), int)
bbb = np.empty((0), int)
# loop over the split arrays, left-padding each pair of chunks to equal length:
for i, j in zip(aa, bb):
    c = boolean_indexing([i, j])
    aaa = np.append(aaa, c[0])
    bbb = np.append(bbb, c[1])
ccc = np.array([aaa, bbb]).T
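As a check, ccc reproduces the expected array from the question; selecting the 'useful data' rows at indices 2, 5, 9 and 12 gives the pairs back:
print(ccc[[2, 5, 9, 12]])
# [[ 1 11]
#  [ 3 13]
#  [ 6 16]
#  [ 8 18]]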
The same approach works for other types of data. Let's take two arrays of letters:
a = np.array(['y', 'w', 'a', 'e', 'i', 'o', 'u', 'y', 'w', 'a', 'e', 'i', 'o', 'u'])
b = np.array(['t', 'h', 'b', 't', 'c', 'n', 's', 'j', 'p', 'z', 'n', 'h', 't', 's', 'm', 'p'])
and the index pairing array:
cor_tab = np.array([[2,0], [3,2], [4,3], [5,5], [6,6], [9,10], [11,12], [13,13]])
np.column_stack((a[cor_tab[:,0]], b[cor_tab[:,1]]))
array([['a', 't'],   # useful data
       ['e', 'b'],
       ['i', 't'],
       ['o', 'n'],
       ['u', 's'],
       ['a', 'n'],
       ['i', 't'],
       ['u', 's']], dtype='<U1')
The only change required is dtype='<U1' in boolean_indexing(); the -1 fill value is then truncated to '-'. The result is:
[['y' '-'],
['w' '-'],
['a' 't'],
['-' 'h'],
['e' 'b'],
['i' 't'],
['-' 'c'],
['o' 'n'],
['u' 's'],
['-' 'j'],
['y' 'p'],
['w' 'z'],
['a' 'n'],
['e' 'h'],
['i' 't'],
['o' '-'],
['u' 's'],
['-' 'm'],
['-' 'p']]
It works for floats as well if you change the dtype in boolean_indexing() to float.
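For reference, a sketch of the string variant (the name boolean_indexing_str is mine; it uses an explicit '-' fill instead of relying on -1 being truncated under dtype '<U1'):
def boolean_indexing_str(v):
    lens = np.array([len(item) for item in v])
    mask = lens[:, None] > np.arange(lens.max())[::-1]   # [::-1] pads on the left
    out = np.full(mask.shape, '-', dtype='<U1')          # explicit placeholder
    out[mask] = np.concatenate(v)
    return out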

Trying to set multiple values using DataFrame.loc in pandas raises "Must have equal len keys and value when setting with an ndarray"

When I run the following code:
import pandas as pd

df = pd.DataFrame({'team': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
                   'a': [18, 22, 19, 14, 14, 11, 20, 28, 22],
                   'b': [5, 7, 7, 9, 12, 9, 9, 4, 8],
                   'c': [11, 8, 10, 6, 6, 5, 9, 12, 9]})
df.loc[(df.b > 8), ["one", "two"]] = df.c, df.a
df.loc[(df.b <= 8), ["one", "two"]] = df.b * 5, df.c * 10
print(df)
I got ValueError: Must have equal len keys and value when setting with an ndarray
What is wrong?
If I do:
df.loc[(df.b > 8), ["one", "two"]] = df.c
df.loc[(df.b <= 8), ["one", "two"]] = df.b
it works.
You can't do it that way due to index alignment: when assigning a DataFrame through .loc, its columns must match the target columns. You would need to use:
df.loc[df.b > 8, ["one", "two"]] = df[['c', 'a']].set_axis(['one', 'two'], axis=1)
df.loc[df.b <= 8, ["one", "two"]] = df[['b', 'c']].mul([5, 10]).set_axis(['one', 'two'], axis=1)
Alternative with numpy.where:
import numpy as np

df[['one', 'two']] = np.where(np.tile(df['b'].gt(8), (2, 1)).T,
                              df[['c', 'a']], df[['b', 'c']].mul([5, 10]))
output:
  team   a   b   c  one  two
0    A  18   5  11   25  110
1    B  22   7   8   35   80
2    C  19   7  10   35  100
3    D  14   9   6    6   14
4    E  14  12   6    6   14
5    F  11   9   5    5   11
6    G  20   9   9    9   20
7    H  28   4  12   20  120
8    I  22   8   9   40   90
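If you prefer broadcasting over np.tile, a (n, 1) boolean column broadcasts against the two value columns directly (a small sketch, same output as above):
mask = df['b'].gt(8).to_numpy()[:, None]   # shape (n, 1), broadcasts over both columns
df[['one', 'two']] = np.where(mask, df[['c', 'a']], df[['b', 'c']].mul([5, 10]))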

Second-level aggregation in pandas

I have a simple example:
DF = pd.DataFrame(
    {"F1": ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'],
     "F2": [1, 2, 1, 2, 2, 3, 1, 2, 3, 2],
     "F3": ['xx', 'yy', 'zz', 'zz', 'zz', 'xx', 'yy', 'zz', 'zz', 'zz']})
DF
How can I improve the code so that the F3-unique column shows, next to each unique value of F3 in the group, the number of appearances of that value in the group, e.g. xx-1 yy-1 zz-1 for group A?
Use .groupby() + .sum() + value_counts() + .agg():
df2 = DF.groupby('F1')['F2'].sum()
df3 = (DF.groupby(['F1', 'F3'])['F3']
         .value_counts()
         .reset_index([2], name='count')
         .apply(lambda x: x['F3'] + '-' + str(x['count']), axis=1))
df4 = df3.groupby(level=0).agg(' '.join)
df4.name = 'F3'
df_out = pd.concat([df2, df4], axis=1).reset_index()
Result:
print(df_out)
  F1  F2              F3
0  A   4  xx-1 yy-1 zz-1
1  B   7       xx-1 zz-2
2  C   8       yy-1 zz-3
Groupby aggregate with a dict of per-column aggregations plus Python's collections.Counter could work well here:
from collections import Counter

df2 = DF.groupby('F1', as_index=False).aggregate({
    'F2': 'sum',
    'F3': lambda g: ' '.join([f'{k}-{v}' for k, v in Counter(g).items()])
})
df2:
  F1  F2              F3
0  A   4  xx-1 yy-1 zz-1
1  B   7       zz-2 xx-1
2  C   8       yy-1 zz-3
Aggregating to a Counter turns each group's values into a dictionary mapping every unique value to its count:
df2 = DF.groupby('F1', as_index=False).aggregate({
    'F2': 'sum',
    'F3': Counter
})
  F1  F2                           F3
0  A   4  {'xx': 1, 'yy': 1, 'zz': 1}
1  B   7           {'zz': 2, 'xx': 1}
2  C   8           {'yy': 1, 'zz': 3}
The surrounding comprehension then reformats each dictionary for display. Sample with one row:
' '.join([f'{k}-{v}' for k, v in Counter({'xx': 1, 'yy': 1, 'zz': 1}).items()])
'xx-1 yy-1 zz-1'
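As a variant, the dict passed to aggregate can also be written with groupby's named-aggregation syntax; a sketch equivalent to the code above:
df2 = DF.groupby('F1', as_index=False).agg(
    F2=('F2', 'sum'),
    F3=('F3', lambda g: ' '.join(f'{k}-{v}' for k, v in Counter(g).items())),
)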

Pandas dataframe to json with key

I have a dataframe with columns ['a', 'b', 'c'] and would like to export it as a dictionary as follows:
{'value of a': {'b': 3, 'c': 7},
 'value2 of a': {'b': 7, 'c': 9}}
I believe you need set_index with DataFrame.to_dict:
df = pd.DataFrame({'a': list('ABC'),
                   'b': [4, 5, 4],
                   'c': [7, 8, 9]})
print (df)
   a  b  c
0  A  4  7
1  B  5  8
2  C  4  9
d = df.set_index('a').to_dict('index')
print (d)
{'A': {'b': 4, 'c': 7}, 'B': {'b': 5, 'c': 8}, 'C': {'b': 4, 'c': 9}}
And for json use DataFrame.to_json:
j = df.set_index('a').to_json(orient='index')
print (j)
{"A":{"b":4,"c":7},"B":{"b":5,"c":8},"C":{"b":4,"c":9}}

Pandas groupby(dictionary) not returning intended result

I'm trying to group the following data:
>>> a=[{'A': 1, 'B': 2, 'C': 3, 'D':4, 'E':5, 'F':6},{'A': 2, 'B': 3, 'C': 4, 'D':5, 'E':6, 'F':7},{'A': 3, 'B': 4, 'C': 5, 'D':6, 'E':7, 'F':8}]
>>> df = pd.DataFrame(a)
>>> df
   A  B  C  D  E  F
0  1  2  3  4  5  6
1  2  3  4  5  6  7
2  3  4  5  6  7  8
with the following dictionary (renamed groupDict here, to match the answer below and avoid shadowing the built-in dict):
groupDict = {'A': 1, 'B': 1, 'C': 1, 'D': 2, 'E': 2, 'F': 2}
such that
df.groupby(groupDict).groups
will output
{1: ['A', 'B', 'C'], 2: ['D', 'E', 'F']}
You need to add the axis argument to groupby:
>>> grouped = df.groupby(groupDict, axis=1)
>>> grouped.groups
{1: ['A', 'B', 'C'], 2: ['D', 'E', 'F']}
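Note that the axis argument of groupby is deprecated in recent pandas versions. A sketch of an equivalent that groups the transposed frame instead:
>>> grouped = df.T.groupby(groupDict)
>>> grouped.groups   # same grouping: 1 -> ['A', 'B', 'C'], 2 -> ['D', 'E', 'F']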