Get 2 DataFrames into 1 - pandas

Hi everyone so I have a DataFrame about Pokemon data
data = pd.read_csv('pokemon.csv')
And I'm only interested in 2 columns 'type1' 'type2' (type2 can be null) as the same way the original videogame does. What I need is to get a DataFrame that looks like this:
data.type1 looks like this:
data.type2:
So basically I need to take a single DataFrames using those 2 columns.
I've code this stuff trying to get 2 DataFrame that I can turn into the final one I am asked to reach:
tabla = {}
def contar(tipo):
buscando=tipo
if tipo == np.NaN:
pass
else:
if tipo in tabla:
tabla[tipo] += 1
else:
tabla[tipo] = 1
tabla2 = {}
def contar2(tipo):
buscando=tipo
if tipo == np.NaN:
pass
else:
if tipo in tabla2:
tabla2[tipo] += 1
else:
tabla2[tipo] = 1
def reset_tabla():
tabla = {}
tabla2 = {}
data['type1'].apply(contar)
df_type1 = pd.DataFrame.from_dict(tabla, orient='index')
reset_tabla()
data['type2'].apply(contar2)
df_type2 = pd.DataFrame.from_dict(tabla2, orient='index')
df_types = pd.concat([df_type1, df_type2])
df_type1
So with above code I get the data I want but no the way I need it.
I expected:
Instead, this was the output:
img continues and data appears 2 times due to 2 types columns
I think what I am doing wrong is the concat because type1 and 2 look like this separately:
and
Finally, if you know how to combine these 2 DataFrames or you think you can solve this problem better let me know.
Thanks you all :).

I've solved this issue, so if it's useful for somebody the solution is here:
tabla = {}
def contar(tipo):
buscando=tipo
if tipo in tabla:
tabla[tipo] += 1
else:
tabla[tipo] = 1
tabla2 = {}
def contar2(tipo):
buscando=tipo
if tipo == np.NaN:
pass
else:
if tipo in tabla2:
tabla2[tipo] += 1
else:
tabla2[tipo] = 1
def reset_tabla():
tabla = {}
tabla2 = {}
reset_tabla()
data['type1'].apply(contar)
data['type2'].apply(contar2)
for x in tabla2.keys():
if type(x)==float:
delete = x
del tabla2[delete]
types = {"type1": tabla,
"type2": tabla2}
df_types = pd.DataFrame(types)
df_types
So I get

Related

Capping multiples columns

I found an interesting snippet (vrana95) that caps multiple columns, however this function works on the main "df" as well instead to work only on "final_df". Someone knows why?
def cap_data(df):
for col in df.columns:
print("capping the ",col)
if (((df[col].dtype)=='float64') | ((df[col].dtype)=='int64')):
percentiles = df[col].quantile([0.01,0.99]).values
df[col][df[col] <= percentiles[0]] = percentiles[0]
df[col][df[col] >= percentiles[1]] = percentiles[1]
else:
df[col]=df[col]
return df
final_df=cap_data(df)
As I wanted to cap only a few columns I changed the for loop of the original snippet. It works, but I would to know why this function is working with both dataframes.
cols = ['score_3', 'score_6', 'credit_limit', 'last_amount_borrowed', 'reported_income', 'income']
def cap_data(df):
for col in cols:
print("capping the column:",col)
if (((df[col].dtype)=='float64') | ((df[col].dtype)=='int64')):
percentiles = df[col].quantile([0.01,0.99]).values
df[col][df[col] <= percentiles[0]] = percentiles[0]
df[col][df[col] >= percentiles[1]] = percentiles[1]
else:
df[col]=df[col]
return df
final_df=cap_data(df)

Extract 1st Column data and update 2ndColumn based on 1st Column data

I have an excel file with the following data:
LogID
T-1111
P-09899
P-09189,T-0011
T-111,T-2111
P-09099,P-7897
RCT-0989,RCT-099
I need to extract the first column LogID before the delimiter "-" and then populate a second column 'LogType' based on the string extracted (T is Tank LogType, P is Pump LogType)
For the above input, the output should be
LogID
LogType
T-1111
Tank
P-09899
Pump
P-09189,T-0011
Multiple
T-111,T-2111
Tank
P-09099,P-7897
Pump
RCT-0989,RCT-099
Reactor
I have written a function to do this in python:
def log_parser(log_string):
log_dict = { "T":"Tank","P":"Pump" }
log_list = log_string.split(",")
for i in log_list:
str_extract = i.upper().split("-",1)
if len(log_list) ==1:
result = log_dict[str_extract[0]]
return result
break
else:
idx = log_list.index(i)
for j in range(len(log_list)):
if (idx == j):
continue
str_extract_j = log_list[j].upper().split("-",1)
if str_extract_j[0] != str_extract[0]:
result = "Multiple"
return result
break
else:
result = log_dict[str_extract[0]]
return result
I am not sure how to implement this function in pandas..
Can i define the function in pandas and then use the lamba apply funtion like this:
test_df['LogType'] = test_df[['LogID']].apply(lambda x:log_parser(x), axis=1)
You can use:
# mapping dictionary for types
d = {'T': 'Tank', 'P': 'Pump'}
# extract letters before -
s = df['LogID'].str.extractall('([A-Z])-')[0]
# group by index
g = s.groupby(level=0)
df['LogType'] = (g.first() # get first match
.map(d) # map type name
# mask if several types
.mask(g.nunique().gt(1),
'Multiple')
)
Output:
LogID LogType
0 T-1111 Tank
1 P-09899 Pump
2 P-09189,T-0011 Multiple

return pandas dataframe from function

I want to return a dataframe from this function, which can be used elsewhere (for plotly graph to be exact).
My idea is to use the dataframe I can create with points_sum(), save it as the team name, and then use that dataframe in my px.line(dataframe = team_name).
In essence, I want to use the men_points_df variable after I created it.
def points_sum(team):
points = 0
men_points = []
for index, row in menscore_df.iterrows():
if row['hometeam'] == team:
if row['homegoals'] > row['awaygoals']:
points += 2
elif row['homegoals'] == row['awaygoals']:
points += 1
elif row['homegoals'] < row['awaygoals']:
points == points
date = str(row['date'])
men_points.append([date, points])
if row['awayteam'] == team:
if row['homegoals'] < row['awaygoals']:
points += 2
elif row['homegoals'] == row['awaygoals']:
points += 1
elif row['homegoals'] > row['awaygoals']:
points == points
date = str(row['date'])
men_points.append([date, points])
men_points_df = pd.DataFrame(men_points, columns = ["Date", 'Points'])
return men_points_df
In plotly, I am trying to use my new dataframe (men_points_df), like below, but I get the error undefined name, even though I can print it (for example: test = points_sum("FIF") (FIF is one of the team names) and it shows the correct dataframe in the console (when I type test):
elif pathname == "/page-3":
return [html.H1('Seasonal performance',
style={'textAlign':'center'}),
html.Div(
children=[
html.H2('Select team',style={'textAlign':'center'}),
html.Br(),
html.Br(),
dcc.Dropdown(
id='team_dd',
options=[{'label': v, 'value': k} for k,v in teams_all.items()],
)]),
dcc.Graph(id="performance_graph")
]
Output(component_id="performance_graph", component_property="figure"),
Input(component_id="team_dd", component_property="value")
def update_graph(option_selected):
title = "none selected"
if option_selected:
title = option_selected
line_fig = px.line(
test, # <------------ THIS IS THE ISSUE
title = f"{title}",
x = "Date", y = "Points")
return line_fig
Just call points_sum in the update_graph function, before you use test:
def update_graph(option_selected):
title = "none selected"
if option_selected:
title = option_selected
# vvv Here vvv
test = points_sum("FIF")
line_fig = px.line(
test, #THIS IS THE ISSUE
title = f"{title}",
x = "Date", y = "Points")
return line_fig

jDE(Adaptive Differential Evolution)

In jDE, each individual has its own F and CR values. How to assign these values to each individuals programmatically. How to update these values.
A pseudo-code will help.
If you want each individual to have its own F and CR values, you can simply save it in a list. (Pseudo-code: Python)
ID_POS = 0
ID_FIT = 1
ID_F = 2
ID_CR = 3
def create_solution(problem_size):
pos = np.random.uniform(lower_bound, upper_bound, problem_size)
fit = fitness_function(pos)
F = your_values
CR = your values
return [pos, fit, F, CR]
def training(problem_size, pop_size, max_iteration):
# Initialization
pop = [create_solution(problem_size) for _ in range(0, pop_size)]
# Evolution process
for iteration in range(0, max_iteration):
for i in range(0, pop_size):
# Do your stuff here
pos_new = ....
fit_new = ....
F_new = ...
CR_new = ...
if pop[i][ID_FIT] < fit_new: # meaning the new solution has better fitness than the old one.
pop[i][ID_F] = F_new
pop[i][ID_CR] = CR_new # This is how you update F and CR for every individual.
...
You can check out my repo's contains most of the state-of-the-art meta-heuristics here.
https://github.com/thieunguyen5991/metaheuristics

Pandas Apply(), Transform() ERROR = invalid dtype determination in get_concat_dtype

Flowing on from this question, which i link as background, but question is standalone.
4 questions:
I cannot understand the error I see when using apply or transform:
"invalid dtype determination in get_concat_dtype"
Why does ClipNetMean work but the other 2 methods not?
Unsure if or why i need the .copy(deep=True)
Why the slightly different syntax needed to call the InnerFoo function
The DataFrame:
cost
section item
11 1 25
2 100
3 77
4 10
12 5 50
1 39
2 7
3 32
13 4 19
1 21
2 27
The code:
import pandas as pd
import numpy as np
df = pd.DataFrame(data = {'section' : [11,11,11,11,12,12,12,12,13,13,13]
,'item' : [1,2,3,4,5,1,2,3,4,1,2]
,'cost' : [25.,100.,77.,10.,50.,39.,7.,32.,19.,21.,27.]
})
df.set_index(['section','item'],inplace=True)
upper =50
lower = 10
def ClipAndNetMean(cost,upper,lower):
avg = cost.mean()
new_cost = (cost- avg).clip(lower,upper)
return new_cost
def MiniMean(cost,upper,lower):
cost_clone = cost.copy(deep=True)
cost_clone['A'] = lower
cost_clone['B'] = upper
v = cost_clone.apply(np.mean,axis=1)
return v.to_frame()
def InnerFoo(lower,upper):
def inner(group):
group_clone = group.copy(deep=True)
group_clone['lwr'] = lower
group_clone['upr'] = upper
v = group_clone.apply(np.mean,axis=1)
return v.to_frame()
return inner
#These 2 work fine.
print df.groupby(level = 'section').apply(ClipAndNetMean,lower,upper)
print df.groupby(level = 'section').transform(ClipAndNetMean,lower,upper)
#apply works but not transform
print df.groupby(level = 'section').apply(MiniMean,lower,upper)
print df.groupby(level = 'section').transform(MiniMean,lower,upper)
#apply works but not transform
print df.groupby(level = 'section').apply(InnerFoo(lower,upper))
print df.groupby(level = 'section').transform(InnerFoo(lower,upper))
exit()
So to Chris's answer, note that if I add back the column header the methods will work in a Transform call.
see v.columns = ['cost']
def MiniMean(cost,upper,lower):
cost_clone = cost.copy(deep=True)
cost_clone['A'] = lower
cost_clone['B'] = upper
v = cost_clone.apply(np.mean,axis=1)
v = v.to_frame()
v.columns = ['cost']
return v
def InnerFoo(lower,upper):
def inner(group):
group_clone = group.copy(deep=True)
group_clone['lwr'] = lower
group_clone['upr'] = upper
v = group_clone.apply(np.mean,axis=1)
v = v.to_frame()
v.columns = ['cost']
return v
return inner
1 & 2) transform expects something "like-indexed", while apply is flexible. The two failing functions are adding additional columns.
3) In some cases, (e.g. if you're passing a whole DataFrame into a function) it can be necessary to copy to avoid mutating the original. It should not be necessary here.
4) The first two functions take a DataFrame with two parameters and returns data. InnerFoo actually returns another function, so it needs to be called before being passed into apply.