Convert a dict to a DataFrame in pandas - pandas

I am using the following code:
import pandas as pd
from yahoofinancials import YahooFinancials
# Tickers of the mutual funds to download.
mutual_funds = ['PRLAX', 'QASGX', 'HISFX']
yahoo_financials_mutualfunds = YahooFinancials(mutual_funds)
# Daily price history for the requested date range; the result is a dictionary.
daily_mutualfund_prices = yahoo_financials_mutualfunds.get_historical_price_data('2015-01-01', '2021-01-30', 'daily')
I get a dictionary as the output. I would like to get a pandas DataFrame with the columns data, PRLAX, QASGX and HISFX, where data is the formatted_date and each ticker column holds the Open price for that ticker.
pandas dataframe

What you can do is this:
# Build one inner dict per ticker that maps formatted_date -> adjusted close;
# the outer keys become the columns and the dates become the row index.
prices_by_ticker = {}
for ticker in mutual_funds:
    quotes = daily_mutualfund_prices[ticker]['prices']
    prices_by_ticker[ticker] = {
        quote['formatted_date']: quote['adjclose'] for quote in quotes
    }
df = pd.DataFrame(prices_by_ticker)
which gives:
PRLAX QASGX HISFX
2015-01-02 19.694817 17.877445 11.852874
2015-01-05 19.203604 17.606575 11.665626
2015-01-06 19.444574 17.316357 11.450289
2015-01-07 19.963596 17.616247 11.525190
2015-01-08 20.260176 18.003208 11.665626
... ... ... ...
2021-01-25 21.799999 33.700001 14.350000
2021-01-26 22.000000 33.139999 14.090000
2021-01-27 21.620001 32.000000 13.590000
2021-01-28 22.120001 32.360001 13.990000
2021-01-29 21.379999 31.709999 13.590000
[1530 rows x 3 columns]
Here 'adjclose' can be replaced by 'open' (the price asked for in the question) or by any other key of the price entries in the dict.

Related

Create a bar plot in plt when having the bins and heights

I have the following ranges of bins and their desired heights
Range | Height
-------------------------
0.0-0.0905 | 0.02601
0.0905-0.1811| 0.13678
0.1811-0.2716| 0.22647
0.2716-0.3621| 0.31481
0.3621-0.4527| 0.40681
0.4527-0.5432| 0.50200
0.5432-0.6337| 0.58746
0.6337-0.7243| 0.68153
0.7243-0.8148| 0.76208
0.8148-0.9053| 0.86030
0.9053-0.9958| 0.95027
0.9958-1 | 0.99584
The desired outcome is a histogram/bar plot with the edges according to Range and the heights according to Height.
You can split your Range and explode to get the edges of the bins:
import pandas as pd
from io import StringIO
data = StringIO("""Range | Height
-------------------------
0.0-0.0905 | 0.02601
0.0905-0.1811| 0.13678
0.1811-0.2716| 0.22647
0.2716-0.3621| 0.31481
0.3621-0.4527| 0.40681
0.4527-0.5432| 0.50200
0.5432-0.6337| 0.58746
0.6337-0.7243| 0.68153
0.7243-0.8148| 0.76208
0.8148-0.9053| 0.86030
0.9053-0.9958| 0.95027
0.9958-1 | 0.99584""")
# The separator is a regex, so it is written as a raw string: "\s" and "\|"
# are regex escapes, not Python string escapes (a non-raw literal triggers an
# invalid-escape-sequence warning). A regex separator needs the python parser
# engine; skiprows=[1] drops the dashed rule under the header.
df = pd.read_csv(data, sep=r"\s*\|\s*", engine="python", skiprows=[1])
# Turn each "lo-hi" string into the list [lo, hi].
df['Range'] = df['Range'].str.split('-')
# One row per edge; interior edges appear twice (end of one bin, start of the
# next), so only the first occurrence is kept before converting to float.
df = df.explode('Range').drop_duplicates('Range').astype(float)
This will give you:
Range Height
0 0.0000 0.02601
0 0.0905 0.02601
1 0.1811 0.13678
2 0.2716 0.22647
3 0.3621 0.31481
4 0.4527 0.40681
5 0.5432 0.50200
6 0.6337 0.58746
7 0.7243 0.68153
8 0.8148 0.76208
9 0.9053 0.86030
10 0.9958 0.95027
11 1.0000 0.99584
Then use plt.stairs:
import matplotlib.pyplot as plt
# stairs takes one more edge than heights. The first row of df repeats the
# first bin's height (both edges of the first bin survive), so it is skipped.
plt.stairs(df['Height'].iloc[1:], edges=df['Range'].values, fill=True)
plt.show()
Output:

Python np select to create a new column by applying condition on other columns

I am trying to create a new column in a data frame, but the new column seems to contain incorrect results. The data is below:
# Ten daily rows starting 2018-01-01 with a single random integer column.
df = pd.DataFrame(np.random.randint(0,30,size=10),
columns=["Random"],
index=pd.date_range("20180101", periods=10))
# Move the dates out of the index into a regular column named 'index'.
df=df.reset_index()
# Overwrite every 'Random' value with the string '20'.
df.loc[:,'Random'] = '20'
df['Recommandation']=['No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No']
df['diff']=[3,2,4,1,6,1,2,2,3,1]
df
I am trying to create another column, 'new', using the following conditions:
If the 'index' is one of the first three dates, then 'new' = 'Random';
elif 'Recommandation' is 'Yes', then 'new' = value of the previous row of 'new' + 'diff';
else 'new' = value of the previous row of 'new'.
My code is below:
import numpy as np
df['new'] = 0
# NOTE(review): both choices and the default are built from the all-zero
# 'new' column assigned above, so they cannot see values computed for
# earlier rows within this same call.
df['new'] = np.select([df['index'].isin(df['index'].iloc[:3]), df['Recommandation'].eq('Yes')],
[df['new'], df['diff']+df['new'].shift(1)],
df['new'].shift(1)
)
#The expected output
# NOTE(review): `new` is unquoted here; presumably df['new'] was intended.
df[new]=[20,20,20,21,27,28,28,28,31,31]
df
try this:
import pandas as pd
import numpy as np
# Rebuild the sample frame from the question: ten daily rows, 'Random' fixed at 20.
df = pd.DataFrame(np.random.randint(0,30,size=10),
                  columns=["Random"],
                  index=pd.date_range("20180101", periods=10))
df = df.reset_index()
df.loc[:,'Random'] = 20
df['Recommandation'] = ['No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No']
df['diff'] = [3,2,4,1,6,1,2,2,3,1]
df.loc[5, 'index'] = pd.to_datetime('2018-01-02') # I modified this data
# Start from 'diff' on the 'Yes' rows; every other row becomes NaN.
df['new'] = df['diff'].where(df['Recommandation'].eq('Yes'))
# the mask that 'index' is in the first three date
m = df['index'].isin(df['index'][:3])
# Masked rows restart the running total at 'Random'.
df.loc[m, 'new'] = df['Random']
# Every masked row except the very first row of the frame opens a new segment;
# the cumulative count of those starts labels each row with its segment.
segment_start = m & (df.index != df.index.min())
segment_id = segment_start.cumsum()
# Running total inside each segment, then carry the last total across the NaN
# ('No') rows. groupby is used instead of np.split because np.split on a
# pandas Series relies on the deprecated Series.swapaxes.
running = df['new'].groupby(segment_id).cumsum()
df['new'] = running.groupby(segment_id).ffill()
df
>>>
index Random Recommandation diff new
0 2018-01-01 20 No 3 20.0
1 2018-01-02 20 Yes 2 20.0
2 2018-01-03 20 No 4 20.0
3 2018-01-04 20 Yes 1 21.0
4 2018-01-05 20 Yes 6 27.0
5 2018-01-02 20 Yes 1 20.0
6 2018-01-07 20 No 2 20.0
7 2018-01-08 20 No 2 20.0
8 2018-01-09 20 Yes 3 23.0
9 2018-01-10 20 No 1 23.0

Mapping selective data with other dataframe in pandas

I want to map data( df2 & df1 ) with selective columns
import pandas as pd
# Task records; 'filename' is the column the second frame has to be matched on.
df_data = [{'id':'1234','task':'data_trasnfer','filename':'orance_bank','date':'17-3-22'},{'id':'234','task':'data2trasnfer','filename':'ftr_data','date':'16-03-2022'},{'id':'4567','task':'data3_transfer','filename':'trnienn_data','date':'15-2-22'}]
df1 = pd.DataFrame(df_data)
df1
id task filename date
0 1234 data_trasnfer orance_bank 17-3-22
1 234 data2trasnfer ftr_data 16-03-2022
2 4567 data3_transfer trnienn_data 15-2-22
# Status records keyed by 'target'; they carry no 'filename' column of their own.
df_data1 = [{'target':'ed34','status':'sucess','flow_in':'ntfc_to_pad'},{'target':'der456','status':'error','flow_in':'htr_tokid'}]
df2 = pd.DataFrame(df_data1)
df2
target status flow_in
0 ed34 sucess ntfc_to_pad
1 der456 error htr_tokid
expected output :
The df2 row with target ed34 should map only to the filename orance_bank, and the row with target der456 should map only to trnienn_data:
id task filename date target status flow_in
0 1234 data_trasnfer orance_bank 17-3-22 ed34 sucess ntfc_to_pad
1 234 data2trasnfer ftr_data 16-03-2022
2 4567 data3_transfer trnienn_data 15-2-22 der456 error htr_tokid
First build a mapping dictionary from target to filename and apply it, like this:
# Look-up table: df2 target -> the df1 filename it belongs to.
filemap = dict(ed34="orance_bank", der456="trnienn_data")
# Translate every target through the table; targets missing from it get NaN.
df2['filename'] = df2.target.map(filemap)
Then merge the 2 dataframes:
# A left join keeps every df1 row in df1's original order, as in the expected
# output. how='outer' does not guarantee df1's row order (it sorts the join keys)
# and would also add any df2 row whose filename is absent from df1.
# Cells with no match are NaN after the merge and are shown as ''.
df1.merge(df2, on='filename', how='left').fillna('')

How to loop through pandas data frame to make boxplots at once

I have a data frame like this with thousand of entries and I want to make box plot to check the outliers in my data.
HR
O2Sat
Temp
SBP
DBP
Resp
110.9
102.5
57.21
165.2
64.0
15.2
97.0
95.0
38.72
98.0
72.0
19.0
89.0
99.0
45.02
112.0
62.5
22.0
90.0
95.0
36.7
175.0
105.0
30.0
103.0
88.5
37.47
122.0
104.0
24.5
I am using seaborn library to make Boxplots. But I have to write 6 different code lines for each column like this:
import seaborn as sns
# One hand-written call per column of box_df_1.
sns.boxplot(y = 'HR', data = box_df_1)
sns.boxplot(y = 'O2Sat', data = box_df_1)
sns.boxplot(y = 'Temp', data = box_df_1)
sns.boxplot(y = 'SBP', data = box_df_1)
sns.boxplot(y = 'DBP', data = box_df_1)
sns.boxplot(y = 'Resp', data = box_df_1)
Can someone help me with some code in which Loop is used and a loop will make the boxplots at once using seaborn, and I don't have to write separate line of code for each column?
Regards,
Huzaifa
create a list for the columns:
# Columns of box_df_1 to draw, one boxplot each.
cols = ['HR', 'O2Sat', 'Temp', 'SBP', 'DBP', 'Resp']
iterate over the list
import matplotlib.pyplot as plt

# plt.show() sits inside the loop so every column is rendered as its own
# figure instead of all boxplots being drawn onto the same axes.
for item in cols:
    sns.boxplot(x = box_df_1[item])
    plt.show()

Using Datetime indexing to analyse dataframe data

I'm trying to write a program that computes the average pressure, temperature and humidity within a specified date and time range, but I'm not sure why I'm getting 'nan' values. Here is my code; any ideas?
import pandas as pd
import numpy as np
# NOTE(review): DataFrame.from_csv was removed in pandas 1.0; pd.read_csv is its replacement.
df = pd.DataFrame.from_csv('C:\Users\Joey\Desktop\Python\CSV\TM4CVC.csv',index_col = None)
# Copy the three channel columns under readable names and keep Date/Time for the index.
df2 = pd.DataFrame({'temp':df['Ch1_Value'],
'press':df['Ch2_Value'],
'humid':df['Ch3_Value'], 'Date' : df['Date'], 'Time' : df['Time']})
# Join Date and Time into one string per row and parse it as a timestamp.
df2['DateTime'] = pd.to_datetime(df2.apply(lambda x: x['Date']+ ' '+ x['Time'], 1))
# Use the parsed timestamps as the index; pop removes the helper column.
df2.index = pd.to_datetime(df2.pop('DateTime'))
df3 = df2.drop(['Date', 'Time'], 1)
#------------------------------------------------------------------------------
# Print the mean and standard deviation of temp, press and humid for the rows
# of df3 selected by the slice datetime_i:datetime_e.
def TempPressHumid(datetime_i, datetime_e):
index = df3[datetime_i:datetime_e]
out = {'temp_avg':np.mean(index['temp']),
'temp_std':np.std(index['temp']),
'press_avg':np.mean(index['press']),
'press_std':np.std(index['press']),
'humid_avg':np.mean(index['humid']),
'humid_std':np.std(index['humid'])}
print out
# NOTE(review): datetime_i (2012-06-25) is later than datetime_e (2012-01-25),
# so the slice selects no rows; presumably this is why every statistic is nan.
TempPressHumid(datetime_i = '2012-06-25 08:27:19', datetime_e = '2012-01-25 10:59:33')
My output is:
{'humid_std': nan, 'press_std': nan, 'humid_avg': nan, 'temp_avg': nan, 'temp_std': nan, 'press_avg': nan}
print df3 gives me:
humid press temp
DateTime
2012-06-25 08:21:19 1004.0 21.2 26.0
2012-06-25 08:22:19 1004.0 21.2 26.0
2012-06-25 08:23:19 1004.1 21.3 26.0
-----------------------------------------
etc...
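The nan values come from an empty selection: in your call the start ('2012-06-25 08:27:19') is later than the end ('2012-01-25 10:59:33'), so the slice contains no rows. With the bounds in chronological order, you could try something like this: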
# Three columns of 1000 uniform random samples, drawn in the order
# temp, press, humid.
df = pd.DataFrame({
    name: pd.Series(np.random.random_sample(1000))
    for name in ("temp", "press", "humid")
})
# Hourly timestamps starting at 2012-06-25 become the index.
df.index = pd.to_datetime(pd.date_range('20120625', periods=1000, freq="h"))
At this point data frame df looks like
humid press temp
2012-06-25 00:00:00 0.910517 0.588777 ...
2012-06-25 01:00:00 0.742219 0.501180
2012-06-25 02:00:00 0.810515 0.172370
2012-06-25 03:00:00 0.215735 0.046797
2012-06-25 04:00:00 0.094144 0.822310
2012-06-25 05:00:00 0.662934 0.629981
2012-06-25 06:00:00 0.876086 0.586799
...
Now let's calculate the mean and standard deviation of the desired date ranges
def TempPressHumid(start, end, df):
    """Print and return the mean and standard deviation of temp, press and humid.

    Parameters
    ----------
    start, end
        Index labels bounding the rows to use; they are applied as the
        label slice ``df.loc[start:end]``.
    df
        DataFrame with 'temp', 'press' and 'humid' columns.

    Returns
    -------
    dict
        Keys '<column>_mean' and '<column>_std' for each of the three
        columns, in the order temp, press, humid.
    """
    # Slice the rows once instead of once per statistic.
    window = df.loc[start:end]
    values = {}
    for column in ('temp', 'press', 'humid'):
        values[column + '_mean'] = np.mean(window[column])
        values[column + '_std'] = np.std(window[column])
    print(values)
    # Returned as well as printed so callers can use the numbers directly.
    return values
So if you call TempPressHumid('2012-06-25 08:00:00', '2012-07-25 10:00:00', df) you should see the dictionary of desired values.