pandas dataframe multiplication with missing values

I have a dataframe with 2 columns (floating type), but one of them has missing data represented by the string ".."
When performing a multiplication operation, an exception is raised and the whole operation is aborted.
What I'm trying to achieve is to perform the multiplication for the float values and leave ".." for the missing ones.
2 * 6
.. * 4
should give [12, ..]
I found a naive solution consisting of replacing ".." with 0, performing the multiplication, then replacing the 0 back with "..".
It doesn't seem very optimized. Any other solution?
df['x'] = pd.to_numeric(df['x'], errors='coerce').fillna(0)
mg['x'] = df['x'] * df["Value"]

def update(v):
    if v == 0:
        return ".."
    return v

for col in mg.columns:
    mg[col] = mg[col].apply(update)

You can use np.where and Series.isna:
import numpy as np
mg['x'] = np.where(df['X'].isna(), df['X'], df['X']*df['Value'])
If you want to replace the null with '..' and multiply others:
mg['x'] = np.where(df['X'].isna(), '..', df['X']*df['Value'])
So anywhere the value of column X is null it remains the same; otherwise it is multiplied by the value of the corresponding row of the Value column.
In your solution you can also do a fillna(1):
df['x'] = pd.to_numeric(df['x'], errors='coerce').fillna(1)
mg['x'] = df['x'] * df["Value"]
This is how I tried:
df = pd.DataFrame({'X': [2, np.nan], 'Value': [6, 4]})
df
     X  Value
0  2.0      6
1  NaN      4
np.where(df['X'].isna(), df['X'], df['X']*df['Value'])
array([12., nan])
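For completeness, here is a minimal sketch that handles the ".." strings from the original question end to end (the two-row frame below is hypothetical); it coerces ".." to NaN, multiplies, and puts ".." back in one pass:
import pandas as pd

# hypothetical data matching the question: '..' marks missing values
df = pd.DataFrame({'x': [2, '..'], 'Value': [6, 4]})

x = pd.to_numeric(df['x'], errors='coerce')  # '..' becomes NaN
prod = x * df['Value']
# cast to object first so the column can hold both floats and the '..' string
df['x'] = prod.astype(object).mask(x.isna(), '..')
print(df['x'].tolist())
# [12.0, '..']
Note the column ends up with object dtype; if ".." is only needed for display, keeping the NaN and formatting at output time is usually preferable.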

Related

How to transform columns with method chaining?

What's the most fluent (or easy to read) method chaining solution for transforming columns in Pandas?
(“method chaining” or “fluent” is the coding style made popular by Tom Augspurger among others.)
For the sake of the example, let's set up some example data:
import pandas as pd
import seaborn as sns
df = sns.load_dataset("iris").astype(str) # Just for this example
df.loc[1, :] = "NA"
df.head()
#   sepal_length sepal_width petal_length petal_width species
# 0          5.1         3.5          1.4         0.2  setosa
# 1           NA          NA           NA          NA      NA
# 2          4.7         3.2          1.3         0.2  setosa
# 3          4.6         3.1          1.5         0.2  setosa
# 4          5.0         3.6          1.4         0.2  setosa
Just for this example: I want to map certain columns through a function - sepal_length using pd.to_numeric - while keeping the other columns as they were. What's the easiest way to do that in a method chaining style?
I can already use assign, but I'm repeating the column name here, which I don't want.
new_result = (
    df.assign(sepal_length=lambda df_: pd.to_numeric(df_.sepal_length, errors="coerce"))
    .head()  # Further chaining methods, whatever they may be
)
I can use transform, but transform drops(!) the unmentioned columns. Transform with passthrough for the other columns would be ideal:
# Columns not mentioned in transform are lost
new_result = (
    df.transform({'sepal_length': lambda series: pd.to_numeric(series, errors="coerce")})
    .head()  # Further chaining methods...
)
Is there a “best” way to apply transformations to certain columns, in a fluent style, and pass the other columns along?
Edit: Below this line, a suggestion after reading Laurent's ideas.
Add a helper function that allows applying a mapping to just one column:
import functools

coerce_numeric = functools.partial(pd.to_numeric, errors='coerce')

def on_column(column, mapping):
    """
    Adaptor that takes a column transformation and returns a
    "whole dataframe" function suitable for .pipe().

    Notice that the column takes the name of the returned series, if applicable.
    Columns mapped to None are removed from the result.
    """
    def on_column_(df):
        df = df.copy(deep=False)
        res = mapping(df[column])
        # drop the column if it was mapped to None
        if res is None:
            df.pop(column)
            return df
        df[column] = res
        # update the column name if the mapper changes it
        if hasattr(res, 'name') and res.name != column:
            df = df.rename(columns={column: res.name})
        return df
    return on_column_
This now allows the following neat chaining in the previous example:
new_result = (
    df.pipe(on_column('sepal_length', coerce_numeric))
    .head()  # Further chaining methods...
)
However, I'm still open to ways how to do this just in native pandas without the glue code.
Edit 2 to further adapt Laurent's ideas, as an alternative. Self-contained example:
import pandas as pd
df = pd.DataFrame(
    {"col1": ["4", "1", "3", "2"], "col2": [9, 7, 6, 5], "col3": ["w", "z", "x", "y"]}
)
def map_columns(mapping=None, /, **kwargs):
    """
    Transform the specified columns and let the rest pass through.

    Examples:
        df.pipe(map_columns(a=lambda x: x + 1, b=str.upper))

        # dict for non-string column names
        df.pipe(map_columns({(0, 0): np.sqrt, (0, 1): np.log10}))
    """
    if mapping is not None and kwargs:
        raise ValueError("Only one of a dict and kwargs can be used at the same time")
    mapping = mapping or kwargs

    def map_columns_(df: pd.DataFrame) -> pd.DataFrame:
        mapping_funcs = {**{k: lambda x: x for k in df.columns}, **mapping}
        # preserve the original order of columns
        return df.transform({key: mapping_funcs[key] for key in df.columns})

    return map_columns_
df2 = (
    df
    .pipe(map_columns(col2=pd.to_numeric))
    .sort_values(by="col1")
    .pipe(map_columns(col1=lambda x: x.astype(str) + "0"))
    .pipe(map_columns({'col2': lambda x: -x, 'col3': str.upper}))
    .reset_index(drop=True)
)
df2
#   col1  col2 col3
# 0   10    -7    Z
# 1   20    -5    Y
# 2   30    -6    X
# 3   40    -9    W
Here is my take on your interesting question.
I don't know of a more idiomatic way in Pandas to do method chaining than combining pipe, assign, or transform. But I understand that "transform with passthrough for the other columns would be ideal".
So, I suggest using it with a higher-order function to deal with other columns, doing even more functional-like coding by taking advantage of Python standard library functools module.
For example, with the following toy dataframe:
df = pd.DataFrame(
    {"col1": ["4", "1", "3", "2"], "col2": [9, 7, 6, 5], "col3": ["w", "z", "x", "y"]}
)
You can define the following partial object:
from functools import partial
from typing import Any, Callable
import pandas as pd
def helper(df: pd.DataFrame, col: str, method: Callable[..., Any]) -> dict:
    funcs = {col: method} | {k: lambda x: x for k in df.columns if k != col}
    # preserve the original order of columns
    return {key: funcs[key] for key in df.columns}

on = partial(helper, df)
And then do all sorts of chain assignments using transform, for instance:
df = (
    df
    .transform(on("col1", pd.to_numeric))
    .sort_values(by="col1")
    .transform(on("col2", lambda x: x.astype(str) + "0"))
    .transform(on("col3", str.upper))
    .reset_index(drop=True)
)
print(df)
# Output
  col1 col2 col3
0    1   70    Z
1    2   50    Y
2    3   60    X
3    4   90    W
If I understand the question correctly, perhaps using ** within assign will be helpful. For example, if you just wanted to convert the numeric data types using pd.to_numeric the following should work.
import numpy as np

df.assign(**df.select_dtypes(include=np.number).apply(pd.to_numeric, errors='coerce'))
By unpacking the df, you are essentially giving assign what it needs to assign each column. This would be equivalent to writing sepal_length=pd.to_numeric(df['sepal_length'], errors='coerce'), sepal_width=... for each column.
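Note that the iris example above stores everything as strings, so select_dtypes(include=np.number) would match no columns there. A hedged variant of the same ** unpacking that names the target columns explicitly (the numeric_cols list below is hypothetical):
import pandas as pd

# hypothetical all-string frame in the spirit of the question
df = pd.DataFrame({"sepal_length": ["5.1", "NA"], "species": ["setosa", "NA"]})

numeric_cols = ["sepal_length"]  # the columns we want coerced
new_result = df.assign(
    **{c: pd.to_numeric(df[c], errors="coerce") for c in numeric_cols}
)
print(new_result.dtypes)
# sepal_length    float64
# species          object
# dtype: object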

Why does pandas.DataFrame.apply produce a Series instead of a DataFrame?

I do not really understand why, for the following code, pandas returns a Series and not a DataFrame.
import pandas as pd

df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])

def plus_2(x):
    y = []
    for i in range(0, len(x)):
        y.append(x[i] + 2)
    return y

df_row = df.apply(plus_2, axis=1)  # Applied to each row
df_row
While if I change axis=0 it produces a DataFrame, as expected:
import pandas as pd

df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])

def plus_2(x):
    y = []
    for i in range(0, len(x)):
        y.append(x[i] + 2)
    return y

df_row = df.apply(plus_2, axis=0)  # Applied to each column
df_row
Here is the output:
   A   B
0  6  11
1  6  11
2  6  11
In the first example, where you use axis=1, the function is applied at the row level.
It means that for each row, plus_2 returns y, which is a list of two elements (but the list as a whole is a single element, so the result is a pd.Series).
Based on your example, 3 lists (2 elements each) are returned; here a single list corresponds to a single row.
You could expand this result and create two columns (each element of the list becomes a new column) by adding result_type="expand" to apply:
df_row = df.apply(lambda x: plus_2(x), axis=1, result_type="expand")
# output
   0   1
0  6  11
1  6  11
2  6  11
In the second approach you have axis=0, so the function is applied at the column level.
It means that for each column, plus_2 returns y, so plus_2 is applied twice, separately to column A and column B. This is why it returns a DataFrame: the input is a DataFrame with columns A and B, plus_2 is applied to each column, and the results come back as columns A and B.
Based on your example, 2 lists (3 elements each) are returned; here a single list corresponds to a single column.
So the main difference between axis=1 and axis=0 is that:
if applied at the row level, apply will return:
[6, 11]
[6, 11]
[6, 11]
if applied at the column level, apply will return:
[6, 6, 6]
[11, 11, 11]
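To see both behaviours side by side, here is a small self-contained sketch (same toy frame as in the question):
import pandas as pd

df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])

# axis=1: each call sees one row; the returned list stays a single object
print(df.apply(lambda row: [v + 2 for v in row], axis=1))
# 0    [6, 11]
# 1    [6, 11]
# 2    [6, 11]
# dtype: object

# axis=0: each call sees one column; the equal-length lists line up
# column-wise and reassemble into a DataFrame of the original shape
print(df.apply(lambda col: [v + 2 for v in col], axis=0))
#    A   B
# 0  6  11
# 1  6  11
# 2  6  11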

Multiple operations iterating over dataframe columns (apply function?)

I have a pandas dataframe with thousands of columns and I would like to perform the following operations for each column of the dataframe:
check if the i-th and (i-1)-th values are in a range (between x and y);
if #1 is satisfied, compute log(value_i / value_{i-1}) ** 2 for the column;
if #1 is not satisfied, assume 0;
find the total of #2 for each column.
Here is a dataframe with a single column:
d = {'col1': [10, 15, 23, 16, 5, 14, 11, 4]}
df = pd.DataFrame(data=d)
df
x = 10 and y = 20
Here is what I can do for this single column:
df["IsIn"] = "NA"
for i in range(1, len(df.col1)):
if (x < df.col1[i] < y) & (x < df.col1[i - 1] < y):
df.IsIn[i] = 1
else:
df.IsIn[i] = 0
df["rets"] = np.log(df["col1"] / df["col1"].shift(1))
df["var"] = df["IsIn"] * df["rets"]**2
Total = df["var"].sum()
Total
Ideally, I would have a (1 by n-cols) dataframe of Totals for each column. How can I best achieve this? I would also appreciate if you can supplement your answer with detailed explanation.
Yes, this is an instance where apply works. You only need to wrap your logic in a function. Also, consider between and shift on the condition to eliminate the first loop:
def func(s, x=10, y=20):
    '''
    compute the value given a series
    '''
    # mask where values are between x and y
    valid = s.between(x, y)
    # shift `valid` and double check
    valid = valid & valid.shift(fill_value=False)
    # squared log, mask with `valid`, and sum
    return (np.log(s / s.shift()) ** 2 * valid).sum()

# apply `func` on the columns
df.apply(func, x=10, y=20)
Output:
col1    0.222561
dtype: float64
Note that between is inclusive on both ends by default; pass inclusive="neither" if you need the strict x < value < y check from the question's loop.
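Because apply visits every column, the same func scales directly to the thousands-of-columns case; a short sketch with a second, hypothetical column:
import pandas as pd

# assumes `func` from the answer above is already defined
df2 = pd.DataFrame({
    'col1': [10, 15, 23, 16, 5, 14, 11, 4],
    'col2': [12, 18, 11, 19, 15, 2, 13, 17],  # hypothetical extra column
})

totals = df2.apply(func, x=10, y=20)  # one total per column
print(totals)  # a Series indexed by column name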

Assert an integer is in list on pandas series

I have a DataFrame with two pandas Series as follow:
   value accepted_values
0      1    [1, 2, 3, 4]
1      2    [5, 6, 7, 8]
I would like to efficiently check if the value is in accepted_values using pandas methods.
I already know I can do something like the following, but I'm interested in a faster approach if there is one (this took around 27 seconds on a 1-million-row DataFrame):
import pandas as pd

df = pd.DataFrame({"value": [1, 2], "accepted_values": [[1, 2, 3, 4], [5, 6, 7, 8]]})

def check_first_in_second(values: pd.Series):
    # positional lookup: first column's value, second column's list
    return values.iloc[0] in values.iloc[1]

are_in_accepted_values = df[["value", "accepted_values"]].apply(
    check_first_in_second, axis=1
)

if not are_in_accepted_values.all():
    raise AssertionError("Not all values in accepted_values")
If you create a DataFrame from the list column, you can compare with DataFrame.eq and test whether at least one value per row matches with DataFrame.any:
df1 = pd.DataFrame(df["accepted_values"].tolist(), index=df.index)
are_in_accepted_values = df1.eq(df["value"], axis=0).any(axis=1).all()
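To make the trick concrete, here is a minimal sketch on the question's two-row frame, showing the expanded frame and the row-wise comparison:
import pandas as pd

df = pd.DataFrame({"value": [1, 2], "accepted_values": [[1, 2, 3, 4], [5, 6, 7, 8]]})

# expand the lists into their own frame: one accepted value per cell
df1 = pd.DataFrame(df["accepted_values"].tolist(), index=df.index)
print(df1)
#    0  1  2  3
# 0  1  2  3  4
# 1  5  6  7  8

# compare each row against the scalar in `value` (aligned on the index)
print(df1.eq(df["value"], axis=0).any(axis=1))
# 0     True
# 1    False
# dtype: bool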
Another idea:
are_in_accepted_values = all(v in a for v, a in df[["value", "accepted_values"]].to_numpy())
I found a small optimisation to your second idea. Using a bit more numpy than pandas makes it faster (more than 3x, tested with time.perf_counter()).
values = df["value"].values
accepted_values = df["accepted_values"].values
are_in_accepted_values = all(s in e for s, e in np.column_stack([values, accepted_values]))

pandas divide column by another one safely (zeros, None, str)

How can I divide one column by another, safely handling zeros and strings?
For some reason I don't want to create new 'A' and 'B' columns with the zeros and strings removed. If division is not possible, I want to get None.
df = pd.DataFrame({'A': [0, None, 2, 1 ,5], 'B': [1, 3,'', 'cat', 4]})
I tried:
df['C'] = df['B'].divide(df['A'], fill_value=None)  # error with zero division
In fact, the following works, but maybe there is a more elegant way?
import numbers

df['C'] = df.apply(
    lambda row: row['B'] / row['A']
    if isinstance(row['A'], numbers.Number) and isinstance(row['B'], numbers.Number) and row['A'] != 0
    else None,
    axis=1,
)  # this works perfectly but looks ugly
Use pd.to_numeric to coerce non-numeric types:
import pandas as pd
import numpy as np
df['C'] = pd.to_numeric(df['B'], errors='coerce').divide(pd.to_numeric(df['A'], errors='coerce'))
#      A    B    C
# 0  0.0    1  inf
# 1  NaN    3  NaN
# 2  2.0       NaN
# 3  1.0  cat  NaN
# 4  5.0    4  0.8
If you don't want np.inf then:
df['C'] = df['C'].replace(np.inf, np.nan)
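As a minimal alternative sketch (assuming the same df from the question), you can mask zero denominators before dividing, so no inf ever appears:
import numpy as np
import pandas as pd

a = pd.to_numeric(df['A'], errors='coerce').replace(0, np.nan)  # 0 -> NaN so 1/0 never happens
b = pd.to_numeric(df['B'], errors='coerce')
df['C'] = b / a  # NaN wherever division is not possible
print(df)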