Getting minimum of rolling window in pandas - pandas

I am trying to obtain the minimum value of three consecutive cells in pandas. The calculation should take into account the one cell above and one below.
I have tried scipy's argrelextrema, but I have a feeling it does not perform a rolling window.
Thanks
This is a wild approach but it did not perform as expected.
def pivot_swing_low(df):
    """Flag pivot (swing) lows in the ``Close`` column.

    A row is a pivot low when its close equals the minimum of the close
    above it, its own close, and the close below it.

    Args:
        df: DataFrame with a numeric ``Close`` column. It is not modified.

    Returns:
        A copy of ``df`` with three extra columns:
        ``minPL`` (minimum close over the centered three-row window),
        ``PL`` (the close on pivot-low rows, NaN elsewhere) and
        ``recentPL`` (``PL`` shifted down two rows and forward-filled).
    """
    data = df.copy()
    close = data['Close']
    # Centered 3-row window = previous, current and next close.
    # min_periods=1 lets the first and last rows use the one neighbour they have.
    data['minPL'] = close.rolling(3, center=True, min_periods=1).min()
    # Keep PL numeric: a real NaN instead of the string "NaN", so no
    # string -> float round trip is needed afterwards.
    data['PL'] = close.where(close == data['minPL'])
    # .ffill() replaces the deprecated fillna(method='ffill').
    data['recentPL'] = data['PL'].shift(2).ffill()
    return data
It will always capture the row number 33, but to me row 31 is relevant as well.
38.78 1671068699999 2022-12-15 01:44:59.999 NaN NaN -0.37 0.00 0.37 0.023571 0.054286 0.023125 0.057698 0.400805 28.612474 NaN NaN 38.78 38.78 39.15
30 38.79 1671068999999 2022-12-15 01:49:59.999 NaN NaN 0.01 0.01 0.00 0.022857 0.054286 0.022188 0.053576 0.414137 29.285496 NaN NaN 38.48 NaN 39.15
31 38.48 1671069299999 2022-12-15 01:54:59.999 NaN NaN -0.31 0.00 0.31 0.021429 0.076429 0.020603 0.071892 0.286583 22.274722 22.274722 NaN 38.48 38.48 38.78
32 38.67 1671069599999 2022-12-15 01:59:59.999 NaN NaN 0.19 0.19 0.00 0.035000 0.074286 0.032703 0.066757 0.489878 32.880419 NaN NaN 38.37 NaN 38.78
33 38.37 1671069899999 2022-12-15 02:04:59.999 38.37000000 NaN -0.30 0.00 0.30 0.035000 0.093571 0.030367 0.083417 0.364036 26.688174 NaN NaN 38.37 38.37 38.48
34 38.58 1671070199999 2022-12-15 02:09:59.999 NaN NaN 0.21 0.21 0.00 0.050000 0.090000 0.043198 0.077459 0.557687 35.802263 NaN NaN 38.37 NaN 38.48
35 38.70 1671070499999 2022-12-15 02:14:59.999 NaN NaN 0.12 0.12 0.00 0.058571 0.090000 0.048684 0.071926 0.676857 40.364625 NaN 40.364625 38.58 NaN 38.37

import pandas as pd
# Load the data into a dataframe
df = pd.read_csv('data.csv')
# Calculate the minimum of the current cell, the cell above, and the cell below.
# center=True anchors the 3-row window on the current row; without it the
# window would cover the current row and the two rows above it.
# min_periods=1 lets the first and last rows use the neighbours that exist.
min_three_cells = df['value'].rolling(3, center=True, min_periods=1).min()
# View the results
print(min_three_cells)
This might help.

Related

Scraping Table across Multiple WebPages Using BeautifulSoup

Link to table: https://home.treasury.gov/resource-center/data-chart-center/interest-rates/TextView?type=daily_treasury_yield_curve&field_tdr_date_value=all&page=0
This table goes from page 0 to page 27.
I have successfully scraped the table into a pandas df for page 0:
url = 'https://home.treasury.gov/resource-center/data-chart-center/interest-rates/TextView?type=daily_treasury_yield_curve&field_tdr_date_value=all&page=0'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
#getting the table
table = soup.find('table', {'class':'views-table views-view-table cols-20'})
# Column titles are the stripped text of the <th> cells.
headers = [th.text.strip() for th in table.find_all('th')]
# Collect every data row first (the first <tr> is skipped, as before) and
# build the frame once; appending with df.loc[len(df)] re-allocates the
# frame on every row.
rows = [
    [td.text.strip() for td in tr.find_all('td')]
    for tr in table.find_all('tr')[1:]
]
df = pd.DataFrame(rows, columns=headers)
Now I need to do the same for all the pages and store the results in a single df.
You can use pandas.read_html to parse tables to dataframes and then concat them:
import pandas as pd

url = "https://home.treasury.gov/resource-center/data-chart-center/interest-rates/TextView?type=daily_treasury_yield_curve&field_tdr_date_value=all&page={}"

# Read the first HTML table of every page, then stack the pages into one frame.
all_df = []
for page in range(10): # <-- increase number of pages here
    print("Getting page", page)
    page_tables = pd.read_html(url.format(page))
    all_df.append(page_tables[0])

# ignore_index=True renumbers the combined rows 0..n-1.
final_df = pd.concat(all_df, ignore_index=True)
print(final_df.tail(10).to_markdown(index=False))
Date
20 YR
30 YR
Extrapolation Factor
8 WEEKS BANK DISCOUNT
COUPON EQUIVALENT
52 WEEKS BANK DISCOUNT
COUPON EQUIVALENT.1
1 Mo
2 Mo
3 Mo
6 Mo
1 Yr
2 Yr
3 Yr
5 Yr
7 Yr
10 Yr
20 Yr
30 Yr
12/13/2001
nan
nan
nan
nan
nan
nan
nan
1.69
nan
1.69
1.78
2.2
3.09
3.62
4.4
4.9
5.13
5.81
5.53
12/14/2001
nan
nan
nan
nan
nan
nan
nan
1.7
nan
1.73
1.81
2.22
3.2
3.73
4.52
5.01
5.24
5.89
5.59
12/17/2001
nan
nan
nan
nan
nan
nan
nan
1.72
nan
1.74
1.84
2.24
3.21
3.74
4.54
5.03
5.26
5.91
5.61
12/18/2001
nan
nan
nan
nan
nan
nan
nan
1.72
nan
1.71
1.81
2.24
3.13
3.66
4.46
4.93
5.16
5.81
5.52
12/19/2001
nan
nan
nan
nan
nan
nan
nan
1.69
nan
1.69
1.8
2.23
3.11
3.63
4.38
4.84
5.08
5.73
5.45
12/20/2001
nan
nan
nan
nan
nan
nan
nan
1.67
nan
1.69
1.79
2.22
3.15
3.67
4.42
4.86
5.08
5.73
5.43
12/21/2001
nan
nan
nan
nan
nan
nan
nan
1.67
nan
1.71
1.81
2.23
3.17
3.69
4.45
4.89
5.12
5.76
5.45
12/24/2001
nan
nan
nan
nan
nan
nan
nan
1.66
nan
1.72
1.83
2.24
3.22
3.74
4.49
4.95
5.18
5.81
5.49
12/26/2001
nan
nan
nan
nan
nan
nan
nan
1.77
nan
1.75
1.87
2.34
3.26
3.8
4.55
5
5.22
5.84
5.52
12/27/2001
nan
nan
nan
nan
nan
nan
nan
1.75
nan
1.74
1.84
2.27
3.19
3.71
4.46
4.9
5.13
5.78
5.49

Plot stacked (100%) bar chart for multiple categories on multiple dates

I have the following initial dataframe:
Post ID
Submission_Date
Flair
0
row1
01.12.2020
NaN
1
row2
03.12.2020
Discussion
2
row3
03.12.2020
News
3
row4
03.12.2020
Discussion
4
row5
06.12.2020
Due Diligence
5
row6
07.12.2020
Discussion
6
row7
31.12.2020
Discussion
1
row8
01.01.2021
Hedge Fund Tears
Multiple Dates with missing dates in between
Multiple categories on dates
I grouped the dataframe with:
import pandas as pd
import numpy as np # for test data
data = {'Post ID': ['row1', 'row2', 'row3', 'row4', 'row5', 'row6', 'row7', 'row8'], 'Submission_Date': ['01.12.2020', '03.12.2020', '03.12.2020', '03.12.2020', '06.12.2020', '07.12.2020', '31.12.2020', '01.01.2021'], 'Flair': [np.nan, 'Discussion', 'News', 'Discussion', 'Due Diligence', 'Discussion', 'Discussion', 'Hedge Fund Tears']}
df = pd.DataFrame(data)
df['Submission_Date'] = pd.to_datetime(df['Submission_Date'], format = "%Y-%m-%d %H:%M:%S").dt.strftime('%Y-%m-%d')
df = df.groupby('Submission_Date')['Flair'].value_counts(normalize=True).unstack()
The result is this:
I want to fill the dates with "empty" bars and plot like this
something like this:
I already tried this:
fig, ax = plt.subplots(figsize=(20,10))
df.plot(kind='bar',ax=ax, stacked=True, width=1)
plt.xlabel('Submission_Date', fontsize=16)
plt.ylabel('Ratio of Flairs used', fontsize=16)
But the dates are incorrect since the empty days are not displayed
Assuming this input as df2 (the output of your groupby operation):
Flair Discussion Due Diligence Hedge Fund Tears News
Submission_Date
01.01.2021 NaN NaN 1.0 NaN
03.12.2020 0.666667 NaN NaN 0.333333
06.12.2020 NaN 1.0 NaN NaN
07.12.2020 1.000000 NaN NaN NaN
31.12.2020 1.000000 NaN NaN NaN
You can reindex from pd.date_range:
df2.index = pd.to_datetime(df2.index, format='%d.%m.%Y')
df2 = df2.reindex(pd.date_range(df2.index.min(), df2.index.max()))
df2.index = df2.index.strftime('%Y-%m-%d')
Flair Discussion Due Diligence Hedge Fund Tears News
2020-12-03 0.666667 NaN NaN 0.333333
2020-12-04 NaN NaN NaN NaN
2020-12-05 NaN NaN NaN NaN
2020-12-06 NaN 1.0 NaN NaN
2020-12-07 1.000000 NaN NaN NaN
...
2020-12-30 NaN NaN NaN NaN
2020-12-31 1.000000 NaN NaN NaN
2021-01-01 NaN NaN 1.0 NaN
graphical outcome (small size):

Converting Annual and Monthly data to weekly in Python

My current data has variables recorded at different time intervals, and I want to have all variables cleaned and nicely aligned in a weekly format, either by redistributing the value (weekly = monthly/4) or by filling in the monthly value for each week (weekly = monthly).
df=pd.DataFrame({
'Date':['2020-06-03','2020-06-08','2020-06-15','2020-06-22','2020-06-29','2020-07-15','2020-08-15','2020-09-15','2020-10-14','2020-11-15','2020-12-15','2020-12-31','2021-01-15'],
'Date_Type':['Week_start_Mon','Week_start_Mon','Week_start_Mon','Week_start_Mon','Week_start_Mon','Monthly','Monthly','Monthly','Monthly','Monthly','Annual','Annual','Annual'],
'Var_Name':['A','A','A','A','B','C','C','C','E','F','G','G','H'],
'Var_Value':
[150,50,0,200,800,5000,2000,6000.15000,2300,3300,650000,980000,1240000]})
Date Date_Type Var_Name Var_Value
0 2020-06-03 Week_start_Mon A 150.0
1 2020-06-08 Week_start_Mon A 50.0
2 2020-06-15 Week_start_Mon A 0.0
3 2020-06-22 Week_start_Mon A 200.0
4 2020-06-29 Week_start_Mon B 800.0
5 2020-07-15 Monthly C 5000.0
6 2020-08-15 Monthly C 2000.0
7 2020-09-15 Monthly C 6000.15
8 2020-10-14 Monthly E 2300.0
9 2020-11-15 Monthly F 3300.0
10 2020-12-15 Annual G 650000.0
11 2020-12-31 Annual G 980000.0
12 2021-01-15 Annual H 1240000.0
An ideal output will look like this:
For variable C, the date range will run from the start date to the end date of the master df. All dates are aligned to the Monday of their week. The monthly variable value is evenly distributed over 4 weeks, and there would be 0 for each week in June.
Similarly annual variables will be distributed to 52 weeks.
Date Date_Type Var_Name Var_Value
0 2020-06-01 Monthly C 0
1 2020-06-08 Monthly C 0
2 2020-06-15 Monthly C 0
3 2020-06-22 Monthly C 0
4 2020-06-29 Monthly C 0
5 2020-07-06 Monthly C 1250
6 2020-07-13 Monthly C 1250
7 2020-07-20 Monthly C 1250
8 2020-07-27 Monthly C 1250
9 2020-08-03 Monthly C 400
10 2020-08-10 Monthly C 400
11 2020-08-17 Monthly C 400
12 2020-08-24 Monthly C 400
13 2020-08-31 Monthly C 400
.
.
.
to the end date
For variable E, a percentage value that needs to be filled in for every week where it applies, the output would look like this:
Date Date_Type Var_Name Var_Value
0 2020-06-01 Monthly E 0
1 2020-06-08 Monthly E 0
2 2020-06-15 Monthly E 0
3 2020-06-22 Monthly E 0
.
.
.
5 2020-09-28 Monthly E 0
6 2020-10-05 Monthly E 0.35
7 2020-10-12 Monthly E 0.35
8 2020-10-19 Monthly E 0.35
9 2020-10-26 Monthly E 0.35
10 2020-11-02 Monthly E 0
11 2020-11-09 Monthly E 0
12 2020-11-16 Monthly E 0
Ultimately my goal is to create a loop for treating this kind of data
if weekly
xxxxx
if monthly
xxxxx
if annual
xxxxx
Please help!
This is a partial answer, I need some explanation.
Set Date as index and realign all dates to Monday (I assume Date is already a datetime64 dtype)
df = df.set_index("Date")
df.index = df.index.map(lambda d: d - pd.tseries.offsets.Day(d.weekday()))
>>> df
Date_Type Var_Name Var_Value
Date
2020-06-01 Weekly A 150.00
2020-06-08 Weekly A 50.00
2020-06-15 Weekly A 0.00
2020-06-22 Weekly A 200.00
2020-06-29 Weekly B 800.00
2020-07-13 Monthly C 5000.00
2020-08-10 Monthly C 2000.00
2020-09-14 Monthly C 6000.15
2020-10-12 Monthly E 2300.00
2020-11-09 Monthly F 3300.00
2020-12-14 Annual G 650000.00
2020-12-28 Annual G 980000.00
2021-01-11 Annual H 1240000.00
Create the index for each variable from 2020-06-01 to 2021-01-11 with a frequency of 7 days:
dti = pd.date_range(df.index.min(), df.index.max(), freq="7D", name="Date")
>>> dti
DatetimeIndex(['2020-06-01', '2020-06-08', '2020-06-15', '2020-06-22',
'2020-06-29', '2020-07-06', '2020-07-13', '2020-07-20',
'2020-07-27', '2020-08-03', '2020-08-10', '2020-08-17',
'2020-08-24', '2020-08-31', '2020-09-07', '2020-09-14',
'2020-09-21', '2020-09-28', '2020-10-05', '2020-10-12',
'2020-10-19', '2020-10-26', '2020-11-02', '2020-11-09',
'2020-11-16', '2020-11-23', '2020-11-30', '2020-12-07',
'2020-12-14', '2020-12-21', '2020-12-28', '2021-01-04',
'2021-01-11'],
dtype='datetime64[ns]', name='Date', freq='7D')
Reindex your dataframe with the new index (pivot for a better display):
df = df.pivot(columns=["Date_Type", "Var_Name"], values="Var_Value").reindex(dti)
>>> df
Date_Type Weekly Monthly Annual
Var_Name A B C E F G H
Date
2020-06-01 150.0 NaN NaN NaN NaN NaN NaN
2020-06-08 50.0 NaN NaN NaN NaN NaN NaN
2020-06-15 0.0 NaN NaN NaN NaN NaN NaN
2020-06-22 200.0 NaN NaN NaN NaN NaN NaN
2020-06-29 NaN 800.0 NaN NaN NaN NaN NaN
2020-07-06 NaN NaN NaN NaN NaN NaN NaN
2020-07-13 NaN NaN 5000.00 NaN NaN NaN NaN
2020-07-20 NaN NaN NaN NaN NaN NaN NaN
2020-07-27 NaN NaN NaN NaN NaN NaN NaN
2020-08-03 NaN NaN NaN NaN NaN NaN NaN
2020-08-10 NaN NaN 2000.00 NaN NaN NaN NaN
2020-08-17 NaN NaN NaN NaN NaN NaN NaN
2020-08-24 NaN NaN NaN NaN NaN NaN NaN
2020-08-31 NaN NaN NaN NaN NaN NaN NaN
2020-09-07 NaN NaN NaN NaN NaN NaN NaN
2020-09-14 NaN NaN 6000.15 NaN NaN NaN NaN
2020-09-21 NaN NaN NaN NaN NaN NaN NaN
2020-09-28 NaN NaN NaN NaN NaN NaN NaN
2020-10-05 NaN NaN NaN NaN NaN NaN NaN
2020-10-12 NaN NaN NaN 2300.0 NaN NaN NaN
2020-10-19 NaN NaN NaN NaN NaN NaN NaN
2020-10-26 NaN NaN NaN NaN NaN NaN NaN
2020-11-02 NaN NaN NaN NaN NaN NaN NaN
2020-11-09 NaN NaN NaN NaN 3300.0 NaN NaN
2020-11-16 NaN NaN NaN NaN NaN NaN NaN
2020-11-23 NaN NaN NaN NaN NaN NaN NaN
2020-11-30 NaN NaN NaN NaN NaN NaN NaN
2020-12-07 NaN NaN NaN NaN NaN NaN NaN
2020-12-14 NaN NaN NaN NaN NaN 650000.0 NaN
2020-12-21 NaN NaN NaN NaN NaN NaN NaN
2020-12-28 NaN NaN NaN NaN NaN 980000.0 NaN
2021-01-04 NaN NaN NaN NaN NaN NaN NaN
2021-01-11 NaN NaN NaN NaN NaN NaN 1240000.0
It only remains to fill in the missing values. It can be easy if I know how to deal with:
if weekly
xxxxx
if monthly
xxxxx
if annual
xxxxx

Converting a list of list of tuples to a DataFrame (First argument: column, Second argument: value)

So I need a DataFrame out of a list_of_list_of_tuples:
My data looks like this:
tuples = [[(5,0.45),(6,0.56)],[(1,0.23),(2,0.54),(6,0.63)],[(3,0.86),(6,0.36)]]
What I need is this:
index
1
2
3
4
5
6
1
nan
nan
nan
nan
0.45
0.56
2
0.23
0.54
nan
nan
nan
0.63
3
nan
nan
0.86
nan
nan
0.36
So that the first argument in the tuple is the column, and the second is the value.
An index would be nice also.
Can anyone help me?
I have no idea how to formulate the code.
Convert each inner list of tuples to a dictionary, pass the dictionaries to the DataFrame constructor, and finally use DataFrame.reindex to sort the columns and add the missing ones:
df = pd.DataFrame([dict(x) for x in tuples])
df = df.reindex(range(df.columns.min(), df.columns.max() + 1), axis=1)
print (df)
1 2 3 4 5 6
0 NaN NaN NaN NaN 0.45 0.56
1 0.23 0.54 NaN NaN NaN 0.63
2 NaN NaN 0.86 NaN NaN 0.36
tuples = [[(5,0.45),(6,0.56)],[(1,0.23),(2,0.54),(6,0.63)],[(3,0.86),(6,0.36)]]
# For every row, split its (column, value) pairs into two parallel lists.
for x in tuples:
    print(x)
    index = []
    values = []
    # Unpack each pair directly; a loop variable named ``tuple`` would
    # shadow the builtin for the rest of the script.
    for column, value in x:
        print(column, value)
        index.append(column)
        values.append(value)
    print(index, values)

Column names appearing against the whole dataframe

I am trying to add column names to a dataframe which has no header.
Dataframe
1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00
2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00
3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00
Trying to add column names:
col_names=['Id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Glass Type']
uci=pd.read_csv('UCI.csv', delimiter=',',header=None, names=col_names)
but the first column name appears against the whole row, and the rest of the column names contain NaN
O/P:
Id RI Na Mg Al Si K Ca Ba Fe Glass Type
0 1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
I get NaNs only in the last columns, because the names list has more entries than the data has fields:
from io import StringIO

import pandas as pd

temp=u"""1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00
2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00
3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00"""
#after testing replace 'StringIO(temp)' to 'filename.csv'
col_names=['Id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Glass Type']
# pd.compat.StringIO is no longer available in pandas; io.StringIO from the
# standard library gives read_csv the same in-memory text buffer.
df = pd.read_csv(StringIO(temp), names=col_names)
print (df)
Id RI Na Mg Al Si K Ca Ba Fe \
0 1.52101 13.64000 4.49 1.10 71.78 0.06 8.75 0.00 NaN NaN
1 2.00000 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0.0 NaN
2 3.00000 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0.0 NaN
Glass Type
0 NaN
1 NaN
2 NaN
But it seems your data are different: each line has a leading and a trailing ", so it is necessary to add the quoting parameter:
from io import StringIO

temp=u'''"1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00"
"2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00"
"3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00"'''
#after testing replace 'StringIO(temp)' to 'filename.csv'
col_names=['Id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Glass Type']
# pd.compat.StringIO is no longer available in pandas; use io.StringIO.
# quoting=3 is csv.QUOTE_NONE: the " characters are kept as ordinary text,
# so the commas inside each quoted line still split the fields.
df = pd.read_csv(StringIO(temp), names=col_names, quoting=3)
print (df)
Id RI Na Mg Al Si K Ca Ba Fe Glass Type
0 "1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0.00" NaN NaN
1 "2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0.00" NaN NaN
2 "3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0.00" NaN NaN
#last manually remove trailing "
df['Id'] = df['Id'].str.strip('"')
df['Ba'] = df['Ba'].str.strip('"').astype(float)
print (df)
Id RI Na Mg Al Si K Ca Ba Fe Glass Type
0 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0.00 NaN NaN
1 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0.00 NaN NaN
2 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0.00 NaN NaN
Verify problem:
from io import StringIO

col_names=['Id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Glass Type']
# pd.compat.StringIO is no longer available in pandas; use io.StringIO.
# With default quoting each fully quoted line is read as a single field,
# which reproduces the "everything lands in the first column" symptom.
print (pd.read_csv(StringIO(temp), names=col_names))
Id RI Na Mg Al Si K Ca \
0 1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00 NaN NaN NaN NaN NaN NaN NaN
1 2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00 NaN NaN NaN NaN NaN NaN NaN
2 3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00 NaN NaN NaN NaN NaN NaN NaN
Ba Fe Glass Type
0 NaN NaN NaN
1 NaN NaN NaN
2 NaN NaN NaN