average on dataframe segments - pandas

In the following picture, I have DataFrame that renders zero after each cycle of operation (the cycle has random length). I want to calculate the average (or perform other operations) for each patch. For example, the average of [0.762, 0.766] alone, and [0.66, 1.37, 2.11, 2.29] alone and so forth till the end of the DataFrame.

So I worked with this data :
random_value
0 0
1 0
2 1
3 2
4 3
5 0
6 4
7 4
8 0
9 1
There is probably a way better solution, but here is what I came with :
def avg_function(df):
avg_list = []
value_list = list(df["random_value"])
temp_list = []
for i in range(len(value_list)):
if value_list[i] == 0:
if temp_list:
avg_list.append(sum(temp_list) / len(temp_list))
temp_list = []
else:
temp_list.append(value_list[i])
if temp_list: # for the last values
avg_list.append(sum(temp_list) / len(temp_list))
return avg_list
test_list = avg_function(df=df)
test_list
[Out] : [2.0, 4.0, 1.0]
Edit: since requested in the comments, here is a way to add the means to the dataframe. I dont know if there is a way to do that with pandas (and there might be!), but I came up with this :
def add_mean(df, mean_list):
temp_mean_list = []
list_index = 0 # will be the index for the value of mean_list
df["random_value_shifted"] = df["random_value"].shift(1).fillna(0)
random_value = list(df["random_value"])
random_value_shifted = list(df["random_value_shifted"])
for i in range(df.shape[0]):
if random_value[i] == 0 and random_value_shifted[i] == 0:
temp_mean_list.append(0)
elif random_value[i] == 0 and random_value_shifted[i] != 0:
temp_mean_list.append(0)
list_index += 1
else:
temp_mean_list.append(mean_list[list_index])
df = df.drop(["random_value_shifted"], axis=1)
df["mean"] = temp_mean_list
return df
df = add_mean(df=df, mean_list=mean_list
Which gave me :
df
[Out] :
random_value mean
0 0 0
1 0 0
2 1 2
3 2 2
4 3 2
5 0 0
6 4 4
7 4 4
8 0 0
9 1 1

Related

Pandas xs where level in list of options

If I have a pd.DataFrame that looks like:
new_df = []
for i in range(10):
df_example = pd.DataFrame(np.random.normal(size=[10,1]))
cols = [round(np.random.uniform(low=0,high=10)),round(np.random.uniform(low=0,high=10)),
round(np.random.uniform(low=0,high=10)),round(np.random.uniform(low=0,high=10))]
keys = ['A','B','C','D']
new_ix = pd.MultiIndex.from_tuples([cols],names=keys)
df_example.columns = new_ix
new_df.append(df_example)
new_df = pd.concat(new_df,axis=1)
Which could yield something like:
Now, if I want where C=4 and A=1 I can do:
df.xs(axis=1,level=['A','C'],key=[1,4])
How do I express if I want:
C in [4,2] and A in [5,2]
C in [4,2] or A in [5,2]
To the best of my knowledge, you can't use anything but tuples for key parameter in xs, so such queries are not possible.
The next best thing is to define helper functions for that purpose, such as the following:
def xs_or(df: pd.DataFrame, params: dict[str, list[int]]) -> pd.DataFrame:
"""Helper function.
Args:
df: input dataframe.
params: columns/values to query.
Returns:
Filtered dataframe.
"""
df = pd.concat(
[
df.xs(axis=1, level=[level], key=(key,))
for level, keys in params.items()
for key in keys
],
axis=1,
)
for level in params.keys():
try:
df = df.droplevel([level], axis=1)
except KeyError:
pass
return df
def xs_and(df: pd.DataFrame, params: dict[str, list[int]]) -> pd.DataFrame:
"""Helper function.
Args:
df: input dataframe.
params: columns/values to query.
Returns:
Filtered dataframe.
"""
for level, keys in params.items():
df = xs_or(df, {level: keys})
return df
And so, with the following dataframe named df:
A 4 7 3 1 7 9 4 0 3 9
B 6 7 4 6 7 5 8 0 8 0
C 2 10 5 2 9 9 4 3 4 5
D 0 1 7 3 8 3 6 7 9 10
0 -0.199458 1.155345 1.298027 0.575606 0.785291 -1.126484 0.019082 1.765094 0.034631 -0.243635
1 1.173873 0.523277 -0.709546 1.378983 0.266661 1.626118 1.647584 -0.228162 -1.708271 0.111583
2 0.321156 0.049470 -0.611111 -1.238887 1.092369 0.019503 -0.473618 1.804474 -0.850320 -0.217921
3 0.339307 -0.758909 0.072159 1.636119 -0.541920 -0.160791 -1.131100 1.081766 -0.530082 -0.546489
4 -1.523110 -0.662232 -0.434115 1.698073 0.568690 0.836359 -0.833581 0.230585 0.166119 1.085600
5 0.020645 -1.379587 -0.608083 -1.455928 1.855402 1.714663 -0.739409 1.270043 1.650138 -0.718430
6 1.280583 -1.317288 0.899278 -0.032213 -0.347234 2.543415 0.272228 -0.664116 -1.404851 -0.517939
7 -1.201619 0.724669 -0.705984 0.533725 0.820124 0.651339 0.363214 0.727381 -0.282170 0.651201
8 1.829209 0.049628 0.655277 -0.237327 -0.007662 1.849530 0.095479 0.295623 -0.856162 -0.350407
9 -0.690613 1.419008 -0.791556 0.180751 -0.648182 0.240589 -0.247574 -1.947492 -1.010009 1.549234
You can filter like this:
# C in [10, 2] or A in [1, 0]
print(xs_or(df, {"C": [10, 2], "A": [1, 0]}))
# Output
B 7 6 2 3
D 1 0 3 3 7
0 1.155345 -0.199458 0.575606 0.575606 1.765094
1 0.523277 1.173873 1.378983 1.378983 -0.228162
2 0.049470 0.321156 -1.238887 -1.238887 1.804474
3 -0.758909 0.339307 1.636119 1.636119 1.081766
4 -0.662232 -1.523110 1.698073 1.698073 0.230585
5 -1.379587 0.020645 -1.455928 -1.455928 1.270043
6 -1.317288 1.280583 -0.032213 -0.032213 -0.664116
7 0.724669 -1.201619 0.533725 0.533725 0.727381
8 0.049628 1.829209 -0.237327 -0.237327 0.295623
9 1.419008 -0.690613 0.180751 0.180751 -1.947492
# C in [10, 2] and A in [1, 7]
print(xs_and(df, {"C": [10, 2], "A": [1, 7]}))
# Output
B 6 7
D 3 1
0 0.575606 1.155345
1 1.378983 0.523277
2 -1.238887 0.049470
3 1.636119 -0.758909
4 1.698073 -0.662232
5 -1.455928 -1.379587
6 -0.032213 -1.317288
7 0.533725 0.724669
8 -0.237327 0.049628
9 0.180751 1.419008

How to index the unique value count in numpy? [duplicate]

Consider the following lists short_list and long_list
short_list = list('aaabaaacaaadaaac')
np.random.seed([3,1415])
long_list = pd.DataFrame(
np.random.choice(list(ascii_letters),
(10000, 2))
).sum(1).tolist()
How do I calculate the cumulative count by unique value?
I want to use numpy and do it in linear time. I want this to compare timings with my other methods. It may be easiest to illustrate with my first proposed solution
def pir1(l):
s = pd.Series(l)
return s.groupby(s).cumcount().tolist()
print(np.array(short_list))
print(pir1(short_list))
['a' 'a' 'a' 'b' 'a' 'a' 'a' 'c' 'a' 'a' 'a' 'd' 'a' 'a' 'a' 'c']
[0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 1]
I've tortured myself trying to use np.unique because it returns a counts array, an inverse array, and an index array. I was sure I could these to get at a solution. The best I got is in pir4 below which scales in quadratic time. Also note that I don't care if counts start at 1 or zero as we can simply add or subtract 1.
Below are some of my attempts (none of which answer my question)
%%cython
from collections import defaultdict
def get_generator(l):
counter = defaultdict(lambda: -1)
for i in l:
counter[i] += 1
yield counter[i]
def pir2(l):
return [i for i in get_generator(l)]
def pir3(l):
return [i for i in get_generator(l)]
def pir4(l):
unq, inv = np.unique(l, 0, 1, 0)
a = np.arange(len(unq))
matches = a[:, None] == inv
return (matches * matches.cumsum(1)).sum(0).tolist()
setup
short_list = np.array(list('aaabaaacaaadaaac'))
functions
dfill takes an array and returns the positions where the array changes and repeats that index position until the next change.
# dfill
#
# Example with short_list
#
# 0 0 0 3 4 4 4 7 8 8 8 11 12 12 12 15
# [ a a a b a a a c a a a d a a a c]
#
# Example with short_list after sorting
#
# 0 0 0 0 0 0 0 0 0 0 0 0 12 13 13 15
# [ a a a a a a a a a a a a b c c d]
argunsort returns the permutation necessary to undo a sort given the argsort array. The existence of this method became know to me via this post.. With this, I can get the argsort array and sort my array with it. Then I can undo the sort without the overhead of sorting again.
cumcount will take an array sort it, find the dfill array. An np.arange less dfill will give me cumulative count. Then I un-sort
# cumcount
#
# Example with short_list
#
# short_list:
# [ a a a b a a a c a a a d a a a c]
#
# short_list.argsort():
# [ 0 1 2 4 5 6 8 9 10 12 13 14 3 7 15 11]
#
# Example with short_list after sorting
#
# short_list[short_list.argsort()]:
# [ a a a a a a a a a a a a b c c d]
#
# dfill(short_list[short_list.argsort()]):
# [ 0 0 0 0 0 0 0 0 0 0 0 0 12 13 13 15]
#
# np.range(short_list.size):
# [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
#
# np.range(short_list.size) -
# dfill(short_list[short_list.argsort()]):
# [ 0 1 2 3 4 5 6 7 8 9 10 11 0 0 1 0]
#
# unsorted:
# [ 0 1 2 0 3 4 5 0 6 7 8 0 9 10 11 1]
foo function recommended by #hpaulj using defaultdict
div function recommended by #Divakar (old, I'm sure he'd update it)
code
def dfill(a):
n = a.size
b = np.concatenate([[0], np.where(a[:-1] != a[1:])[0] + 1, [n]])
return np.arange(n)[b[:-1]].repeat(np.diff(b))
def argunsort(s):
n = s.size
u = np.empty(n, dtype=np.int64)
u[s] = np.arange(n)
return u
def cumcount(a):
n = a.size
s = a.argsort(kind='mergesort')
i = argunsort(s)
b = a[s]
return (np.arange(n) - dfill(b))[i]
def foo(l):
n = len(l)
r = np.empty(n, dtype=np.int64)
counter = defaultdict(int)
for i in range(n):
counter[l[i]] += 1
r[i] = counter[l[i]]
return r - 1
def div(l):
a = np.unique(l, return_counts=1)[1]
idx = a.cumsum()
id_arr = np.ones(idx[-1],dtype=int)
id_arr[0] = 0
id_arr[idx[:-1]] = -a[:-1]+1
rng = id_arr.cumsum()
return rng[argunsort(np.argsort(l))]
demonstration
cumcount(short_list)
array([ 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 1])
time testing
code
functions = pd.Index(['cumcount', 'foo', 'foo2', 'div'], name='function')
lengths = pd.RangeIndex(100, 1100, 100, 'array length')
results = pd.DataFrame(index=lengths, columns=functions)
from string import ascii_letters
for i in lengths:
a = np.random.choice(list(ascii_letters), i)
for j in functions:
results.set_value(
i, j,
timeit(
'{}(a)'.format(j),
'from __main__ import a, {}'.format(j),
number=1000
)
)
results.plot()
Here's a vectorized approach using custom grouped range creating function and np.unique for getting the counts -
def grp_range(a):
idx = a.cumsum()
id_arr = np.ones(idx[-1],dtype=int)
id_arr[0] = 0
id_arr[idx[:-1]] = -a[:-1]+1
return id_arr.cumsum()
count = np.unique(A,return_counts=1)[1]
out = grp_range(count)[np.argsort(A).argsort()]
Sample run -
In [117]: A = list('aaabaaacaaadaaac')
In [118]: count = np.unique(A,return_counts=1)[1]
...: out = grp_range(count)[np.argsort(A).argsort()]
...:
In [119]: out
Out[119]: array([ 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 1])
For getting the count, few other alternatives could be proposed with focus on performance -
np.bincount(np.unique(A,return_inverse=1)[1])
np.bincount(np.fromstring('aaabaaacaaadaaac',dtype=np.uint8)-97)
Additionally, with A containing single-letter characters, we could get the count simply with -
np.bincount(np.array(A).view('uint8')-97)
Besides defaultdict there are a couple of other counters. Testing a slightly simpler case:
In [298]: from collections import defaultdict
In [299]: from collections import defaultdict, Counter
In [300]: def foo(l):
...: counter = defaultdict(int)
...: for i in l:
...: counter[i] += 1
...: return counter
...:
In [301]: short_list = list('aaabaaacaaadaaac')
In [302]: foo(short_list)
Out[302]: defaultdict(int, {'a': 12, 'b': 1, 'c': 2, 'd': 1})
In [303]: Counter(short_list)
Out[303]: Counter({'a': 12, 'b': 1, 'c': 2, 'd': 1})
In [304]: arr=[ord(i)-ord('a') for i in short_list]
In [305]: np.bincount(arr)
Out[305]: array([12, 1, 2, 1], dtype=int32)
I constructed arr because bincount only works with ints.
In [306]: timeit np.bincount(arr)
The slowest run took 82.46 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 5.63 µs per loop
In [307]: timeit Counter(arr)
100000 loops, best of 3: 13.6 µs per loop
In [308]: timeit foo(arr)
100000 loops, best of 3: 6.49 µs per loop
I'm guessing it would hard to improve on pir2 based on default_dict.
Searching and counting like this are not a strong area for numpy.

Build a decision Column by ANDing multiple columns in pandas

I have a pandas data frame which is shown below:
>>> x = [[1,2,3,4,5],[1,2,4,4,3],[2,4,5,6,7]]
>>> columns = ['a','b','c','d','e']
>>> df = pd.DataFrame(data = x, columns = columns)
>>> df
a b c d e
0 1 2 3 4 5
1 1 2 4 4 3
2 2 4 5 6 7
I have an array of objects (conditions) as shown below:
[
{
'header' : 'a',
'condition' : '==',
'values' : [1]
},
{
'header' : 'b',
'condition' : '==',
'values' : [2]
},
...
]
and an assignHeader which is:
assignHeader = decision
now I want to do an operation which builds up all the conditions from the conditions array by looping through it, for example something like this:
pConditions = []
for eachCondition in conditions:
header = eachCondition['header']
values = eachCondition['values']
if eachCondition['condition'] == "==":
pConditions.append(df[header].isin(values))
else:
pConditions.append(~df[header].isin(values))
df[assignHeader ] = and(pConditions)
I was thinking of using all operator in pandas but am unable to crack the right syntax to do so. The list I shared can go big and dynamic and so I want to use this nested approach and check for the equality. Does anyone know a way to do so?
Final Output:
conditons = [df['a']==1,df['b']==2]
>>> df['decision'] = (df['a']==1) & (df['b']==2)
>>> df
a b c d e decision
0 1 2 3 4 5 True
1 1 2 4 4 3 True
2 2 4 5 6 7 False
Here conditions array will be variable. And I want to have a function which takes df, 'newheadernameandconditions` as input and returns the output as shown below:
>>> df
a b c d e decision
0 1 2 3 4 5 True
1 1 2 4 4 3 True
2 2 4 5 6 7 False
where newheadername = 'decision'
I was able to solve the problem using the code shown below. I am not sure if this is kind of fast way of getting things done, but would love to know your inputs in case you have any specific thing to point out.
def andMerging(conditions, mergeHeader, df):
if len(conditions) != 0:
df[mergeHeader] = pd.concat(conditions, axis = 1).all(axis = 1)
return df
where conditions are an array of pd.Series with boolean values.
And conditions are formatted as shown below:
def prepareForConditionMerging(conditionsArray, df):
conditions = []
for prop in conditionsArray:
condition = prop['condition']
values = prop['values']
header = prop['header']
if type(values) == str:
values = [values]
if condition=="==":
conditions.append(df[header].isin(values))
else:
conditions.append(~df[header].isin(values))
# Here we can add more conditions such as greater than less than etc.
return conditions

Multiply String in Dataframe?

My desired output is the following:
count tally
1 2 //
2 3 ///
3 5 /////
4 3 ///
5 2 //
My code:
my_list = [1,1,2,2,2,3,3,3,3,3,4,4,4,5,5]
my_series = pd.Series(my_list)
values_counted = pd.Series(my_series.value_counts(),name='count')
# other calculated columns left out for SO simplicity
df = pd.concat([values_counted], axis=1).sort_index()
df['tally'] = values_counted * '/'
With the code above I get the following error:
masked_arith_op
result[mask] = op(xrav[mask], y)
numpy.core._exceptions.UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U21'), dtype('<U21')) -> dtype('<U21')
In searching for solutions I found one on SO that said to try:
values_counted * float('/')
But that did not work.
In 'normal' Python outside of Dataframes the following code works:
10 * '/'
and returns
///////////
How can I achieve the same functionality in a Dataframe?
Use lambda function for repeat values, your solution is simplify:
my_list = [1,1,2,2,2,3,3,3,3,3,4,4,4,5,5]
df1 = pd.Series(my_list).value_counts().to_frame('count').sort_index()
df1['tally'] = df1['count'].apply(lambda x: x * '/')
print (df1)
count tally
1 2 //
2 3 ///
3 5 /////
4 3 ///
5 2 //
You can group the series by itself and then aggregate:
new_df = my_series.groupby(my_series).agg(**{"count": "size",
"tally": lambda s: "/" * s.size})
to get
>>> new_df
count tally
1 2 //
2 3 ///
3 5 /////
4 3 ///
5 2 //

Pandas custom file format

I have a huge Pandas DataFrame that I need to write away to a format that RankLib can understand. Example with a target, a query ID and 3 features is this:
5 qid:4 1:12 2:0.6 3:13
1 qid:4 1:8 2:0.4 3:11
I have written my own function that iterates over the rows and writes them away like this:
data_file = open(filename, 'w')
for index, row in data.iterrows():
line = str(row['score'])
line += ' qid:'+str(row['srch_id'])
counter = 0
for feature in feature_columns:
counter += 1
line += ' '+str(counter)+':'+str(row[feature])
data_file.write(line+'\n')
data_file.close()
Since I have about 200 features and 5m rows this is obviously very slow. Is there a better approach using the I/O of Pandas itself?
you can do it this way:
Data:
In [155]: df
Out[155]:
f1 f2 f3 score srch_id
0 12 0.6 13 5 4
1 8 0.4 11 1 4
2 11 0.7 14 2 10
In [156]: df.dtypes
Out[156]:
f1 int64
f2 float64
f3 int64
score object
srch_id int64
dtype: object
Solution:
feature_columns = ['f1','f2','f3']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}
def f(x):
if x.name in feature_columns:
return cols2id[x.name] + ':' + x.astype(str)
elif x.name == 'srch_id':
return 'quid:' + x.astype(str)
else:
return x
(df.apply(lambda x: f(x))[['score','srch_id'] + feature_columns]
.to_csv('d:/temp/out.csv', sep=' ', index=False, header=None)
)
out.csv:
5 quid:4 1:12 2:0.6 3:13
1 quid:4 1:8 2:0.4 3:11
2 quid:10 1:11 2:0.7 3:14
cols2id helper dict:
In [158]: cols2id
Out[158]: {'f1': '1', 'f2': '2', 'f3': '3'}