I'm using the world bank API to analyze data and I want to create multiple data frames with the same indicators for different countries.
import wbgapi as wb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time as t_lib
# Variables
indicators = [
    'AG.PRD.LVSK.XD',
    'AG.YLD.CREL.KG',
    'NY.GDP.MINR.RT.ZS',
    'SP.POP.TOTL.FE.ZS',
]
# Economy codes requested from the API. Brazil is 'BRA'; the 'BRL' originally
# written here is what triggered the APIResponseError quoted further down.
countries = ['PRT', 'BRA', 'ARG']
# Years 1995 through 2020 (the end of range() is exclusive).
time = range(1995, 2021)
#Code
def create_df(country):
    """Download the configured indicators for one economy and transpose them.

    Args:
        country: economy code handed straight to ``wb.data.DataFrame``
            (for example ``'PRT'``).

    Returns:
        A DataFrame whose columns are named after the values of the
        downloaded frame's 'Series' column and whose first column, 'index',
        holds the former column labels (presumably the years — TODO confirm
        against the wbgapi output).

    Relies on the module-level ``indicators`` and ``time``.
    """
    raw = wb.data.DataFrame(indicators, country, time, labels=True).reset_index()
    # Remember the indicator labels before the transpose turns them into a row.
    series_labels = list(raw['Series'])
    transposed = raw.T
    transposed.columns = series_labels
    # After the transpose the two series identifier columns are plain rows;
    # they are now redundant with the column names.
    transposed = transposed.drop(['Series', 'series'], axis=0)
    return transposed.reset_index()
# One DataFrame per country, in the same order as `countries`.
list_of_dfs = [create_df(country) for country in countries]
What I really wanted is to create a data frame with a different name for each country and to store them in a list or dict like: [df_1, df_2, df_3...]
EDIT:
I'm trying this now:
# Store each country's DataFrame under "key0", "key1", ... in `countries` order.
a_dictionary = {
    f"key{n}": create_df(country) for n, country in enumerate(countries)
}
It was supposed to work, but I still get the same error on the 2nd iteration of the loop:
APIResponseError: APIError: JSON decoding error (https://api.worldbank.org/v2/en/sources/2/series/AG.PRD.LVSK.XD;AG.YLD.CREL.KG;NY.GDP.MINR.RT.ZS;SP.POP.TOTL.FE.ZS/country/BRL/time/YR1995;YR1996;YR1997;YR1998;YR1999;YR2000;YR2001;YR2002;YR2003;YR2004;YR2005;YR2006;YR2007;YR2008;YR2009;YR2010;YR2011;YR2012;YR2013;YR2014;YR2015;YR2016;YR2017;YR2018;YR2019;YR2020?per_page=1000&page=1&format=json)
UPDATE:
Thanks to notiv, I noticed the problem was the country code: it should be "BRA" instead of "BRL".
I'm also putting here a new approach that works as well by creating a master dataframe and then slicing it by country to create the desired dataframes:
# Fetch every indicator for every country in one request, then transpose so
# that each (indicator, country) pair becomes a column named after its
# 'Series' label.
df = wb.data.DataFrame(indicators, countries, time, labels=True).reset_index()
columns = df['Series'].tolist()
df = df.T
df.columns = columns
# Drop the two series identifier rows; the 'economy' and 'Country' rows stay
# because the per-country slicing relies on them.
df = df.drop(index=['Series', 'series']).reset_index()
df
a_dictionary = {}
for n, country in enumerate(countries):
    # Keep the columns in which some cell equals this country's code.
    belongs_to_country = (df == country).any()
    # Copy the selection so the column assignment below writes to an
    # independent frame instead of a slice of `df` (SettingWithCopyWarning).
    new_df = df.loc[:, belongs_to_country].copy()
    new_df['index'] = df['index']
    new_df = new_df.set_index('index')
    # The identifier rows are no longer needed once the columns are selected.
    new_df = new_df.drop(['economy', 'Country'], axis=0)
    a_dictionary[f"eco_df{n}"] = new_df
# The theme is global state, so it only needs to be set once.
sns.set_theme(style="dark")
for loop, country in enumerate(countries):
    eco_df = a_dictionary[f'eco_df{loop}']
    # One line chart per column of this country's frame.
    for n in range(len(eco_df.columns)):
        g = sns.relplot(
            data=eco_df,
            x=eco_df.index,
            y=eco_df.iloc[:, n],
            kind="line",
            palette="crest",
            height=5,
            aspect=1.61,
            legend=False,
        ).set(title=country)
        g.set_axis_labels("Years")
        g.set_xticklabels(rotation=45)
        g.tight_layout()
At the end I used the dataframes to create a chart for each indicator for each country.
Many thanks for the help
Related
I have a CSV file for the code I wrote.
I have an assignment to filter and plot data. I didn't really understand, so I just copied the code from my lecturer's presentation file, but I made the CSV file myself. When I tried to run the plot, it didn't work. This is what it said.
I want to make a bar chart to show the number of people with the same age. If it's possible, how do I make a pie chart instead, and show the percentage?
btw, "Umur" means Age
import pandas as pd
from pathlib import Path
# Load the source data and write a copy of it out as CSV.
df = pd.read_csv('inicsvanakbisdig.txt')
filepath = Path('tugaspertemuan12afk.csv')
df.to_csv(filepath)

# Column selection: only the 'Nama' column.
df1 = df.loc[:, 'Nama']
print(df1)

# Row filter: the rows whose 'Umur' equals 20.
df2 = df[df['Umur'].eq(20)]
print(df2)

# The same table without the 'Umur' column.
df3 = df.drop(columns=['Umur'])
print(df3)

# Summary statistics as computed by describe().
df4 = df.describe()
print(df4)
#Plot
import matplotlib.pyplot as plt
# df.loc[['Umur']] looks for a ROW labelled 'Umur' and raises KeyError when no
# such row exists; selecting the column needs df[['Umur']].
df5 = df[['Umur']]
# Count how many people share each age, ordered by age, so the bar heights
# match the "Counter" axis label.
p = df5['Umur'].value_counts().sort_index()
p.plot(kind = 'bar',title = 'Umur anak bisdig', xlabel = "Umur", ylabel = "Counter")
plt.show()
With a single indexer, `.loc` selects rows, not columns; you probably want:
# Select the 'Umur' column with .loc: every row (:), then the column list.
df5 = df.loc[:, ['Umur']]
# or, equivalently, pass the list of column names straight to []:
df5 = df[['Umur']]
I'm a newbie and wanna do a line graph.
I wanna do a line graph of decades on the x axis and # of religion on the y-axis but two lines, one religious schools, and one non-religious.
here is my csv file.
https://drive.google.com/file/d/16XuvoQKSrSMaUPsfHOWY6cBy1ry6UNz6/view?usp=sharing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('ReligiousRate.csv', dtype='string')
df_religious = df[['Religious', 'founded1']]
df_non_religious = df[['Non-Religious', 'founded2']]
dfs = [df_religious, df_non_religious]
names = ['Religious Schools', 'Non Religious Schools']

counts = []
for schools, name in zip(dfs, names):
    # Work on a copy under its own name: the loop must neither rebind `df`
    # nor assign columns on a slice of it (SettingWithCopyWarning).
    schools = schools.copy()
    schools.columns = ['Name', 'Founded']
    schools['Founded'] = pd.to_datetime(schools['Founded'], yearfirst=True)
    # Count the schools founded in each 10-year bin.
    df_decades = schools.set_index('Founded').resample('10AS').count()
    df_decades.columns = [name]
    counts.append(df_decades)

# One colour per line, in the order the columns were appended.
sns.set_palette(["#090364", "#ff0000"])
sns.lineplot(data=pd.concat(counts), dashes=False)
plt.show()
Output:
I've got dataframe like this
original data
and I hope to have new dataframe like below
new data
How can I create code for this modification?
It needs to consolidate the data of the first series and create a new dataframe.
Some imports:
import pandas as pd
import numpy as np
Here we create dataframe from data you provided:
# Raw single-column data exactly as provided: part number, description and
# comma-separated reference designators, each on its own row.
raw_rows = [
    'A2C02158300',
    'D REC/BAS16-03W 100V 250mA SOD323 0s SMD',
    'D201,D206,D218,D219,D222,D302,D308,D408',
    'D409,D501,D502,D505,D506,D507,D508',
    'A2C02250500',
    'T BIP/PUMD3,SOT363,SMD SOLDERING',
    'T209,T501,T502',
]
df = pd.DataFrame({"a": raw_rows})
df.head(10)
Output:
Then we prepare dataframe with first 2 columns:
# Read everything from the raw column up front: `df` is rebound to the result
# below, so the designator rows can no longer be taken from it afterwards.
raw = df['a'].reset_index(drop=True)

# Classify the rows instead of assuming a fixed stride of 4, so a part whose
# designators span several rows (or only one) is handled.
# NOTE(review): assumes descriptions are the only rows containing whitespace
# and that each description directly follows its part number — confirm on the
# full data set.
is_description = raw.str.contains(r'\s', na=False).astype(bool)
is_part_number = is_description.shift(-1, fill_value=False).astype(bool)
# 0-based record number of every raw row (a record starts at its part number).
record_id = is_part_number.cumsum() - 1

s1 = raw[is_part_number].reset_index(drop=True)
s2 = raw[is_description].reset_index(drop=True)
df = pd.DataFrame({
    'a': s1,
    'b': s2
})
# After that prepare and add third column:
is_designators = ~(is_part_number | is_description)
s3 = raw[is_designators].str.split(',')
# Label every designator row with its record number so the join below lines
# it up with the matching part number / description row.
s3.index = record_id[is_designators].to_numpy()
s3 = s3.explode()  # one row per designator
s3.name = 'c'
df = df.join(s3)
df.reset_index(drop=True, inplace=True)
df
Output:
I have dataframe with 2 columns:
Col1- managers' name
Col2 - their profit
I want to plot a pie chart that shows the 5 most profitable managers separately, and all the others in one slice labelled 'others'.
How about that:
With automatic labeling of the pie pieces using autopct argument.
import pandas as pd
import matplotlib.pyplot as plt
data = {
    'managers': ['mike1', 'mike2', 'mike3', 'mike4', 'mike5', 'mike6', 'mike7'],
    'profit': [110, 60, 40, 30, 10, 5, 5],
}
df = pd.DataFrame(data)
# Most profitable first, so the top five are simply the first five rows.
df = df.sort_values(by='profit', ascending=False)
top_5 = df.iloc[:5]
# Everything after the top five is folded into a single 'others' row.
others = df.iloc[5:]['profit'].sum()
df2 = pd.DataFrame([['others', others]], columns=['managers', 'profit'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
all_data = pd.concat([top_5, df2], ignore_index=True)
# Index by manager name so the pie slices are labelled with it.
all_data.index = all_data['managers']
# Function to label the pieces
def auto_func(val):
    """Return *val* rounded to the nearest integer.

    Used as the ``autopct`` callback of the pie plot; ``val`` is presumably
    the slice's share in percent — TODO confirm against the matplotlib docs.
    """
    return round(val)
# Draw the pie from the 'profit' column; auto_func supplies the text placed
# on each slice through the autopct argument.
all_data.plot(kind='pie', y='profit', autopct=auto_func)
plt.show()
To keep track of all simulation results in a parametric run, I create a MultiIndex DataFrame named dfParRun in pandas as follows:
import pandas as pd
import numpy as np
import itertools
# Option lists for every parameter of the sweep.
limOpt = [0.1, 1, 10]
reimbOpt = ['Cash', 'Time']
xOpt = [0.1, .02, .03, .04, .05, .06, .07, .08]
zOpt = [1, 5, 10]  # was "[1,5n10]", which is a syntax error
arrays = [limOpt, reimbOpt, xOpt, zOpt]
# Every combination of the options becomes one row of the index.
parameters = list(itertools.product(*arrays))
nPar = len(parameters)
variables = ['X', 'Y', 'Z']
nVar = len(variables)
index = pd.MultiIndex.from_tuples(parameters, names=['lim', 'reimb', 'xMax', 'zMax'])
# np.random.rand takes the dimensions as separate ints, not as one tuple.
dfParRun = pd.DataFrame(np.random.rand(nPar, nVar), index=index, columns=variables)
To analyse my parametric run, i want to slice this dataframe but this seems a burden. For example, i want to have all results for xMax above 0.5 and lim equal to 10. At this moment, the only working method i find is:
# Turn the index levels into ordinary columns so a plain boolean mask works.
df = dfParRun.reset_index()
df[(df['xMax'] > 0.5) & (df['lim'] == 10)]
and i wonder if there is a method without resetting the index of the DataFrame ?
option 1
use pd.IndexSlice
caveat: requires sort_index
# Label slicing on a MultiIndex requires a sorted index, hence sort_index().
# Levels in order: lim == 10, any reimb, xMax from .0500001 upwards, any zMax;
# the trailing ':' selects every column.
idx = pd.IndexSlice
dfParRun.sort_index().loc[idx[10, :, .0500001:, :], :]
option 2
use your df after having reset_index
# Filter the reset-index frame `df` with a query string; there the former
# index levels 'xMax' and 'lim' are ordinary columns.
df.query('xMax > 0.05 & lim == 10')
setup
import pandas as pd
import numpy as np
import itertools
# Option lists for every parameter of the sweep.
limOpt = [0.1, 1, 10]
reimbOpt = ['Cash', 'Time']
xOpt = [0.1, .02, .03, .04, .05, .06, .07, .08]
zOpt = [1, 5, 10]
arrays = [limOpt, reimbOpt, xOpt, zOpt]

# Cartesian product of the option lists: one tuple per parameter combination.
parameters = list(itertools.product(limOpt, reimbOpt, xOpt, zOpt))
nPar = len(parameters)
variables = list('XYZ')
nVar = len(variables)

level_names = ['lim', 'reimb', 'xMax', 'zMax']
index = pd.MultiIndex.from_tuples(parameters, names=level_names)
# Random placeholder results, one row per combination and one column per variable.
random_results = np.random.rand(nPar, nVar)
dfParRun = pd.DataFrame(random_results, index=index, columns=variables)
# Flat copy with the index levels as ordinary columns.
df = dfParRun.reset_index()