pandas groupby with function as key - pandas

I would like to calculate the mean over a time span of 3 years.
My data look like this:
import pandas as pd
import numpy as np
N=120
data = {'p1': np.random.randint(50, 100, N),
        'p2': np.random.randint(0, 100, N),
        'p3': np.random.randint(10, 70, N)}
df = (pd.DataFrame(data, index=pd.bdate_range(start='20100101', periods=N, freq='BM'))
        .stack()
        .reset_index()
        .rename(columns={'level_0': 'date', 'level_1': 'type', 0: 'price'})
        .sort_values('date')
     )
I tried :
(df.sort_values('date')
   .groupby(['type',
             ''.join([(df.date.dt.year-3), '-', (df.date.dt.year)])  # 3 years time span
            ])
   ['price']
   .apply(lambda x: x.mean())
)
but get an error message :
TypeError: sequence item 0: expected str instance, Series found
I would like to calculate the mean (and other statistics) on price, grouped by type and by time periods such as 2010-2013, 2011-2014, 2012-2015, and so on.
The label is important; otherwise I could simply use:
(df.sort_values('date')
   .groupby(['type', df.date.dt.year//3])  # 3 years time span
   ['price']
   .apply(lambda x: x.mean())
)
Any idea?

I think I found the answer to my own question (someone else might be interested):
(df.sort_values('date')
   .groupby(['type',
             (df.date.dt.year-3).astype(str).str.cat((df.date.dt.year).astype(str), sep='-')
            ])
   ['price']
   .apply(lambda x: x.mean())
)
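As a side note, the original TypeError comes from ''.join(...) expecting strings while the list contains pandas Series. If non-overlapping 3-year bins are acceptable, a readable label can also be built from the year // 3 idea above. The following is a minimal sketch of my own (the label format is an assumption), reusing the df from the question:

# Sketch: non-overlapping 3-year bins with readable labels such as '2010-2012',
# built from the `year // 3` grouping shown above.
year = df['date'].dt.year
bin_start = (year // 3) * 3                               # 2010, 2013, 2016, ...
period = bin_start.astype(str).str.cat((bin_start + 2).astype(str), sep='-')

stats = (df.sort_values('date')
           .groupby(['type', period.rename('period')])['price']
           .agg(['mean', 'std', 'count']))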

Related

Pd['Column1'] > Pd['Column2'] Key Error: 0

I am trying to write a function that loops through a pandas dataframe, checks whether column1 > column2, and if so appends 1 to a list, which is then returned.
I import finance data from Yahoo Finance, calculate the moving average and two standard deviations, and assign them to the Upper and Lower columns.
import pandas as pd
import pandas_datareader as web
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
import numpy as np
end = datetime(2021, 1, 16)
start = datetime(2020 , 1, 16)
symbols = ['ETH-USD']
stock_df = web.get_data_yahoo(symbols, start, end)
period = 20
# Simple Moving Average
stock_df['SMA'] = stock_df['Close'].rolling(window=period).mean()
# Standard deviation
stock_df['STD'] = stock_df['Close'].rolling(window=period).std()
# Upper Bollinger Band
stock_df['Upper'] = stock_df['SMA'] + (stock_df['STD'] * 2)
# Lower Bollinger Band
stock_df['Lower'] = stock_df['SMA'] - (stock_df['STD'] * 2)
# List of columns
column_list = ['Close', 'SMA', 'Upper', 'Lower']
stock_df[column_list].plot(figsize=(12.2,6.4)) #Plot the data
plt.title('ETH-USD')
plt.ylabel('USD Price ($)')
plt.show();
#Create a new data frame, Period for calculation, removes NAN's
bolldf = stock_df[period-1:]
#Show the new data frame
bolldf
The function loops through the rows, compares the columns, and appends to the buy/sell signal lists when the condition is met:
def signal(df):
    buy_signal = []
    sell_signal = []
    for i in range(len(df['Close'])):
        if df['Close'][i] > df['Upper'][i]:
            buy_signal.append(1)
    return buy_signal
buy_signal = signal(bolldf)
buy_signal
Info about the error (abridged traceback):
---> 12 buy_signal = greaterthan(stock_df)
----> 8 if bolldf['Close'][i] > bolldf['Open'][i]:
KeyError: 0
When I attempt this function on columns such as df['Upper'] > df['Lower'], or df['SMA'] < df['Lower'], it works as expected; it only fails when using the columns from the original downloaded data.
Any help would be amazing. Thank you.
Since the columns are a MultiIndex, they must also be accessed in MultiIndex format. You can inspect them with bolldf.columns, and the function works after modifying it as follows.
bolldf.columns
MultiIndex([('Adj Close', 'ETH-USD'),
( 'Close', 'ETH-USD'),
( 'High', 'ETH-USD'),
( 'Low', 'ETH-USD'),
( 'Open', 'ETH-USD'),
( 'Volume', 'ETH-USD'),
( 'SMA', ''),
( 'STD', ''),
( 'Upper', ''),
( 'Lower', '')],
names=['Attributes', 'Symbols'])
def signal(df):
    buy_signal = []
    sell_signal = []
    for i in range(len(df[('Close','ETH-USD')])):
        if df[('Close','ETH-USD')][i] > df[('Upper','')][i]:
            buy_signal.append(1)
    return buy_signal
buy_signal = signal(bolldf)
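If you prefer to keep single-level column access, another option (a minimal sketch I'm adding, not part of the answer above) is to drop the 'Symbols' level entirely, since only one ticker was downloaded; using .iloc for positional access also avoids relying on integer lookups against the date index:

# Sketch: flatten the MultiIndex columns so df['Close'] works again.
# Assumes a single downloaded symbol, so the 'Symbols' level carries no information.
bolldf_flat = bolldf.copy()
bolldf_flat.columns = bolldf_flat.columns.get_level_values(0)

def signal_flat(df):
    buy_signal = []
    for i in range(len(df)):
        # .iloc is purely positional, so the DatetimeIndex is not an issue
        if df['Close'].iloc[i] > df['Upper'].iloc[i]:
            buy_signal.append(1)
    return buy_signal

buy_signal = signal_flat(bolldf_flat)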

GroupBy Function Not Applying

I am trying to groupby for the following specializations but I am not getting the expected result (or any for that matter). The data stays ungrouped even after this step. Any idea what's wrong in my code?
cols_specials = ['Enterprise ID','Specialization','Specialization Branches','Specialization Type']
specials = pd.read_csv(agg_specials, engine='python')
specials = specials.merge(roster, left_on='Enterprise ID', right_on='Enterprise ID', how='left')
specials = specials[cols_specials]
specials = specials.groupby(['Enterprise ID'])['Specialization'].transform(lambda x: '; '.join(str(x)))
specials.to_csv(end_report_specials, index=False, encoding='utf-8-sig')
Please try using agg:
import pandas as pd

df = pd.DataFrame(
    [
        ['john', 'eng', 'build'],
        ['john', 'math', 'build'],
        ['kevin', 'math', 'asp'],
        ['nick', 'sci', 'spi']
    ],
    columns=['id', 'spec', 'type']
)
df.groupby(['id'])[['spec']].agg(lambda x: ';'.join(x))
results in:
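(The output was shown as an image in the original answer; for reference, the snippet above should produce roughly the following:)

           spec
id
john   eng;math
kevin      math
nick        sci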
If you need to preserve the original number of rows, use transform; it returns a single column aligned with the original index:
df['spec_grouped'] = df.groupby(['id'])[['spec']].transform(lambda x: ';'.join(x))
df
results in:
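(Again for reference, the resulting frame should look roughly like this, with spec_grouped repeating the joined value on every original row:)

      id  spec   type spec_grouped
0   john   eng  build     eng;math
1   john  math  build     eng;math
2  kevin  math    asp         math
3   nick   sci    spi          sci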

Same calculation for all combinations in DataFrame

I have a dataframe like the one below:
import pandas as pd
df = pd.DataFrame({'CITY': ['A','B','C','A','C','B'],
                   'MAKE_NAME': ['SO','OK','CO','LU','CO','OK'],
                   'USER': ['JK','JK','MK','JK','JK','JK'],
                   'RESULT_CODE': ['Y','Y','N','N','Y','Y'],
                   'VALID': [1,1,1,1,1,0],
                   'COUNT': [1,1,1,1,1,1]})
I want to calculate VALID/COUNT for every combination of columns taken two, three, and four at a time, and I want the result as a dataframe.
For example, the result for pairs would look like the table below, and similarly for triples.
Thanks to all.
You can find the solution below.
import pandas as pd

df = pd.DataFrame({'CITY': ['A','B','C','A','C','B'],
                   'MAKE_NAME': ['SO','OK','CO','LU','CO','OK'],
                   'USER': ['JK','JK','MK','JK','JK','JK'],
                   'RESULT_CODE': ['Y','Y','N','N','Y','Y'],
                   'VALID': [1,1,1,1,1,0],
                   'COUNT': [1,1,1,1,1,1]})

for i in df.columns:
    for j in df.columns:
        for k in df.columns:
            for l in df.columns:
                try:
                    a1 = df.groupby([i, j, k, l], as_index=False, sort=True, group_keys=True)[['VALID', 'COUNT']].count()
                    a1['RATE'] = a1.VALID / a1.COUNT
                    print(a1)
                except Exception:
                    pass
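The quadruple-nested loop revisits the same column several times and relies on the try/except to skip invalid groupings. As a lighter alternative (a sketch of my own, not part of the original answer), itertools.combinations visits each distinct pair, triple, and quadruple of the categorical columns exactly once; here I also sum VALID and COUNT rather than count them, on the assumption that the ratio of the summed flags is what is wanted:

from itertools import combinations

# Sketch: one aggregated frame per distinct combination of the grouping columns.
group_cols = ['CITY', 'MAKE_NAME', 'USER', 'RESULT_CODE']
results = []
for r in (2, 3, 4):
    for combo in combinations(group_cols, r):
        agg = df.groupby(list(combo), as_index=False)[['VALID', 'COUNT']].sum()
        agg['RATE'] = agg['VALID'] / agg['COUNT']
        agg['GROUPED_BY'] = ' + '.join(combo)
        results.append(agg)

all_results = pd.concat(results, ignore_index=True)
print(all_results)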

Transform list of pyspark rows into pandas data frame through a dictionary

I am trying to convert a sorted list of PySpark rows to a pandas data frame using a dictionary comprehension, but it only works when I explicitly state each key and value of the desired dictionary.
row_list = sorted(data, key=lambda row: row['date'])
future_df = {'key': int(key),
             'date': map(lambda row: row["date"], row_list),
             'col1': map(lambda row: row["col1"], row_list),
             'col2': map(lambda row: row["col2"], row_list)}
And then converting it to Pandas with:
pd.DataFrame(future_df)
This operation lives inside the class ForecastByKey, which is invoked by:
rdd = df.select('*') \
    .rdd \
    .map(lambda row: ((row['key']), row)) \
    .groupByKey() \
    .map(lambda args: spark_ops.run(args[0], args[1]))
Up to this point, everything works fine; meaning explicitly indicating the columns inside the dictionary future_df.
The problem arises when trying to convert the whole set of columns (700+) with something like:
future_df = {'key': int(key),
             'date': map(lambda row: row["date"], row_list)}

for col_ in columns:
    future_df[col_] = map(lambda row: row[col_], row_list)

pd.DataFrame(future_df)
where columns contains the name of each column passed to the ForecastByKey class.
The result of this operation is a data frame with empty or close-to-zero columns.
I am using Python 3.6.10 and PySpark 2.4.5
How is this iteration to be done in order to get a data frame with the right information?
After some research, I realized this can be solved with:
row_list = sorted(data, key=lambda row: row['date'])

def f(x):
    return map(lambda row: row[x], row_list)

pre_df = {col_: col_ for col_ in self.sdf_cols}
future_df = toolz.valmap(f, pre_df)
future_df['key'] = int(key)
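A likely explanation for the empty columns (my reading, not stated in the original post): map() is lazy in Python 3, and the lambda inside the loop captures the loop variable col_ by reference, so by the time pd.DataFrame() consumes the iterators every column sees the final value of col_. toolz.valmap avoids this because f(x) binds the column name at call time. The same effect can be had without toolz by materialising each column eagerly; a small sketch, assuming columns, row_list and key are defined as in the question:

# Sketch: build each column as a list inside the loop, so the current value of
# col_ is used immediately instead of being captured by a lazy map object.
future_df = {'key': int(key),
             'date': [row['date'] for row in row_list]}
for col_ in columns:
    future_df[col_] = [row[col_] for row in row_list]

result_df = pd.DataFrame(future_df)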

Quantile across rows and down columns using selected columns only [duplicate]

I have a dataframe with column names, and I want to find the one that contains a certain string, but does not exactly match it. I'm searching for 'spike' in column names like 'spike-2', 'hey spike', 'spiked-in' (the 'spike' part is always continuous).
I want the column name to be returned as a string or a variable, so I access the column later with df['name'] or df[name] as normal. I've tried to find ways to do this, to no avail. Any tips?
Just iterate over DataFrame.columns; here is an example in which you end up with a list of the column names that match:
import pandas as pd
data = {'spike-2': [1,2,3], 'hey spke': [4,5,6], 'spiked-in': [7,8,9], 'no': [10,11,12]}
df = pd.DataFrame(data)
spike_cols = [col for col in df.columns if 'spike' in col]
print(list(df.columns))
print(spike_cols)
Output:
['hey spke', 'no', 'spike-2', 'spiked-in']
['spike-2', 'spiked-in']
Explanation:
df.columns returns an Index of the column names
[col for col in df.columns if 'spike' in col] iterates over df.columns with the variable col and adds col to the resulting list if it contains 'spike'. This syntax is a list comprehension.
If you only want the resulting data set with the columns that match you can do this:
df2 = df.filter(regex='spike')
print(df2)
Output:
spike-2 spiked-in
0 1 7
1 2 8
2 3 9
This answer uses the DataFrame.filter method to do this without list comprehension:
import pandas as pd
data = {'spike-2': [1,2,3], 'hey spke': [4,5,6]}
df = pd.DataFrame(data)
print(df.filter(like='spike').columns)
Will output just 'spike-2'. You can also use regex, as some people suggested in comments above:
print(df.filter(regex='spike|spke').columns)
Will output both columns: ['spike-2', 'hey spke']
You can also use df.columns[df.columns.str.contains(pat = 'spike')]
data = {'spike-2': [1,2,3], 'hey spke': [4,5,6], 'spiked-in': [7,8,9], 'no': [10,11,12]}
df = pd.DataFrame(data)
colNames = df.columns[df.columns.str.contains(pat = 'spike')]
print(colNames)
This will output the column names: 'spike-2', 'spiked-in'
More about pandas.Series.str.contains.
# select columns containing 'spike'
df.filter(like='spike', axis=1)
You can also select by exact name or by regular expression; refer to pandas.DataFrame.filter.
df.loc[:,df.columns.str.contains("spike")]
Another solution that returns a subset of the df with the desired columns:
df[df.columns[df.columns.str.contains("spike|spke")]]
You also can use this code:
spike_cols =[x for x in df.columns[df.columns.str.contains('spike')]]
Getting name and subsetting based on Start, Contains, and Ends:
# from: https://stackoverflow.com/questions/21285380/find-column-whose-name-contains-a-specific-string
# from: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html
# from: https://cmdlinetips.com/2019/04/how-to-select-columns-using-prefix-suffix-of-column-names-in-pandas/
# from: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html
import pandas as pd
data = {'spike_starts': [1,2,3], 'ends_spike_starts': [4,5,6], 'ends_spike': [7,8,9], 'not': [10,11,12]}
df = pd.DataFrame(data)
print("\n")
print("----------------------------------------")
colNames_contains = df.columns[df.columns.str.contains(pat = 'spike')].tolist()
print("Contains")
print(colNames_contains)
print("\n")
print("----------------------------------------")
colNames_starts = df.columns[df.columns.str.contains(pat = '^spike')].tolist()
print("Starts")
print(colNames_starts)
print("\n")
print("----------------------------------------")
colNames_ends = df.columns[df.columns.str.contains(pat = 'spike$')].tolist()
print("Ends")
print(colNames_ends)
print("\n")
print("----------------------------------------")
df_subset_start = df.filter(regex='^spike',axis=1)
print("Starts")
print(df_subset_start)
print("\n")
print("----------------------------------------")
df_subset_contains = df.filter(regex='spike',axis=1)
print("Contains")
print(df_subset_contains)
print("\n")
print("----------------------------------------")
df_subset_ends = df.filter(regex='spike$',axis=1)
print("Ends")
print(df_subset_ends)