Discrepancies in Pandas groupby aggregates vs dataframe, particularly on axis=1 - pandas

import pandas as pd
import numpy as np

def main():
    df = pd.DataFrame([["a", "b", "c", "k"], ["d", "e", "f", "l"], ["g", "h", "i", "J"]],
                      columns=["ay", "be", "ce", "jay"])
    print(df)
    gb1 = df.groupby({"ay": "x", "be": "x"}, axis=1)
    gb2 = df.groupby({"ay": "x", "be": "x", "ce": "y", "jay": "y"}, axis=1)
    print("apply sum by axis 0")
    # print(df.apply(sum))
    print("fails")
    print("apply sum by axis 1")
    # print(df.apply(sum, axis=1))
    print("fails")
    print("agg sum by axis 0")
    print(df.agg(sum))
    print("agg sum by axis 1")
    print(df.agg(sum, axis=1))
    print("gb1 apply sum axis 1")
    print(gb1.apply(sum))
    print("gb1 agg sum axis 1")
    print(gb1.agg(sum))
    print("gb2 apply sum axis 1")
    # print(gb2.apply(sum))
    print("fails")
    print("gb2 agg sum axis 1")
    print(gb2.agg(sum))
    print(gb1.agg(lambda x: ";".join([x[0], x[1]]))

if __name__ == "__main__":
    main()
I don't understand the failures that occur, and I don't understand why apply on the groups fails with two groups but not with one.
I've solved my overall goal (I was trying to concatenate some strings of columns together) but I am concerned that I am somewhat bewildered by these failures.
The driving goal for reference was to be able to do
gb1.agg(lambda x: ";".join(x))
and I also don't understand why that doesn't work, especially since
gb1.agg(lambda x: ";".join([x[0], x[1]])
does.

There's a lot to unpack in there.
print("apply sum by axis 0")
#print(df.apply(sum))
print("fails")
print("apply sum by axis 1")
# print(df.apply(sum, axis=1))
print("fails")
...the above fail because you're apply-ing Python's built-in sum, which starts from 0 and therefore can't add strings. You could use either of the following to fix that (which, I think, relies under the hood on numpy's ability to handle the object dtype that pandas stores the strings as):
df.apply(np.sum)
df.sum()
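For the string frame above, both of these concatenate within each column, so the result should look roughly like:
ay     adg
be     beh
ce     cfi
jay    klJ
dtype: object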
Next, these two items say axis=1 in the print statement, but the calls don't actually pass it:
print("gb1 apply sum axis 1")
print(gb1.apply(sum))
print("gb2 apply sum axis 1")
# print(gb2.apply(sum))
print("fails")
...if you add axis=1 they'll work and give sensible results.
Note that you have a missing closing parenthesis in:
gb1.agg(lambda x: ";".join([x[0], x[1]])
...both in the sample code and in the later comment about it.
It seems like you're saying that the final bit of code is what accomplishes your goal. The previous attempt:
gb1.agg(lambda x: ";".join(x))
...is joining the items in the index of the one group that is present instead of the individual series. Examine:
print(gb1.groups)
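...that should print something like {'x': ['ay', 'be']} - i.e., the one group maps to the column labels, which is what ";".join(x) ends up joining ('ay;be') instead of the row values.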
Finally, given your dataframe if what you wanted to do was concatenate columns with ";" between them, you could also do:
cols = ['ay','be']
df.apply(lambda x: ";".join((x[c] for c in cols)), axis=1)
or for a small number of items,
df['concat'] = df['ay'] + ";" + df['be']
...rather than using groupby.
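For the example frame, either version should produce (roughly):
0    a;b
1    d;e
2    g;h
dtype: object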

Related

How to transform columns with method chaining?

What's the most fluent (or easy to read) method chaining solution for transforming columns in Pandas?
(“method chaining” or “fluent” is the coding style made popular by Tom Augspurger among others.)
For the sake of the example, let's set up some example data:
import pandas as pd
import seaborn as sns
df = sns.load_dataset("iris").astype(str) # Just for this example
df.loc[1, :] = "NA"
df.head()
#
# sepal_length sepal_width petal_length petal_width species
# 0 5.1 3.5 1.4 0.2 setosa
# 1 NA NA NA NA NA
# 2 4.7 3.2 1.3 0.2 setosa
# 3 4.6 3.1 1.5 0.2 setosa
# 4 5.0 3.6 1.4 0.2 setosa
Just for this example: I want to map certain columns through a function - sepal_length using pd.to_numeric - while keeping the other columns as they were. What's the easiest way to do that in a method chaining style?
I can already use assign, but I'm repeating the column name here, which I don't want.
new_result = (
    df.assign(sepal_length=lambda df_: pd.to_numeric(df_.sepal_length, errors="coerce"))
    .head()  # Further chaining methods, whatever they may be
)
I can use transform, but transform drops(!) the unmentioned columns. Transform with passthrough for the other columns would be ideal:
# Columns not mentioned in transform are lost
new_result = (
    df.transform({'sepal_length': lambda series: pd.to_numeric(series, errors="coerce")})
    .head()  # Further chaining methods...
)
Is there a “best” way to apply transformations to certain columns, in a fluent style, and pass the other columns along?
Edit: Below this line, a suggestion after reading Laurent's ideas.
Add a helper function that allows applying a mapping to just one column:
import functools

coerce_numeric = functools.partial(pd.to_numeric, errors='coerce')

def on_column(column, mapping):
    """
    Adaptor that takes a column transformation and returns a "whole dataframe" function suitable for .pipe()

    Notice that columns take the name of the returned series, if applicable.
    Columns mapped to None are removed from the result.
    """
    def on_column_(df):
        df = df.copy(deep=False)
        res = mapping(df[column])
        # drop column if mapped to None
        if res is None:
            df.pop(column)
            return df
        df[column] = res
        # update column name if the mapper changes its name
        if hasattr(res, 'name') and res.name != column:
            df = df.rename(columns={column: res.name})
        return df
    return on_column_
This now allows the following neat chaining in the previous example:
new_result = (
    df.pipe(on_column('sepal_length', coerce_numeric))
    .head()  # Further chaining methods...
)
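The None convention in the helper also lets you drop a column mid-chain, e.g. (a hypothetical usage with the iris frame):
df.pipe(on_column('species', lambda s: None)).head()  # removes 'species' from the result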
However, I'm still open to ways to do this in native pandas, without the glue code.
Edit 2 to further adapt Laurent's ideas, as an alternative. Self-contained example:
import pandas as pd

df = pd.DataFrame(
    {"col1": ["4", "1", "3", "2"], "col2": [9, 7, 6, 5], "col3": ["w", "z", "x", "y"]}
)
def map_columns(mapping=None, /, **kwargs):
    """
    Transform the specified columns and let the rest pass through.

    Examples:
    df.pipe(map_columns(a=lambda x: x + 1, b=str.upper))

    # dict for non-string column names
    df.pipe(map_columns({(0, 0): np.sqrt, (0, 1): np.log10}))
    """
    if mapping is not None and kwargs:
        raise ValueError("Only one of a dict and kwargs can be used at the same time")
    mapping = mapping or kwargs

    def map_columns_(df: pd.DataFrame) -> pd.DataFrame:
        mapping_funcs = {**{k: lambda x: x for k in df.columns}, **mapping}
        # preserve the original order of columns
        return df.transform({key: mapping_funcs[key] for key in df.columns})

    return map_columns_
df2 = (
    df
    .pipe(map_columns(col2=pd.to_numeric))
    .sort_values(by="col1")
    .pipe(map_columns(col1=lambda x: x.astype(str) + "0"))
    .pipe(map_columns({'col2': lambda x: -x, 'col3': str.upper}))
    .reset_index(drop=True)
)
df2
# col1 col2 col3
# 0 10 -7 Z
# 1 20 -5 Y
# 2 30 -6 X
# 3 40 -9 W
Here is my take on your interesting question.
I don't know of a more idiomatic way in Pandas to do method chaining than combining pipe, assign, or transform. But I understand that "transform with passthrough for the other columns would be ideal".
So, I suggest using it with a higher-order function to deal with other columns, doing even more functional-like coding by taking advantage of Python standard library functools module.
For example, with the following toy dataframe:
df = pd.DataFrame(
    {"col1": ["4", "1", "3", "2"], "col2": [9, 7, 6, 5], "col3": ["w", "z", "x", "y"]}
)
You can define the following partial object:
from functools import partial
from typing import Any, Callable

import pandas as pd

def helper(df: pd.DataFrame, col: str, method: Callable[..., Any]) -> dict:
    funcs = {col: method} | {k: lambda x: x for k in df.columns if k != col}
    # preserve the original order of columns
    return {key: funcs[key] for key in df.columns}

on = partial(helper, df)
And then do all sorts of chain assignments using transform, for instance:
df = (
    df
    .transform(on("col1", pd.to_numeric))
    .sort_values(by="col1")
    .transform(on("col2", lambda x: x.astype(str) + "0"))
    .transform(on("col3", str.upper))
    .reset_index(drop=True)
)
print(df)
# Output
col1 col2 col3
0 1 70 Z
1 2 50 Y
2 3 60 X
3 4 90 W
If I understand the question correctly, perhaps using ** within assign will be helpful. For example, if you just wanted to convert the numeric data types using pd.to_numeric, the following should work (np being numpy):
df.assign(**df.select_dtypes(include=np.number).apply(pd.to_numeric, errors='coerce'))
By unpacking the resulting frame, you are essentially giving assign what it needs to assign each column. This would be equivalent to writing sepal_length=pd.to_numeric(df['sepal_length'], errors='coerce'), sepal_width=... for each column.
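Note that in the iris example above every column is still a string, so select_dtypes(include=np.number) would match nothing; one workaround is to list the columns explicitly (a sketch with hand-picked column names):
df.assign(**df[['sepal_length', 'sepal_width']].apply(pd.to_numeric, errors='coerce'))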

Smart / pythonic way to find which columns contain / match another column

My question title sounds a little cryptic, so I hope the example makes it clear.
I have a value in the column "FindMe", and I want to know if it is in either of the options of "Search1" or "Search2". The logic I have works (though if it's present in both Search1 and Search2, I know I have an issue).
import pandas as pd
import numpy as np
data = {"Search1":["one_two","two_ten", "five_ten"],
"Search2":["three_four","one_four","two_twelve"],
"FindMe":["three","one","nine"]}
df =pd.DataFrame(data)
df["Present1"] = df.apply(lambda x: str(x.FindMe) in str(x.Search1), axis =1)
df["Present2"] = df.apply(lambda x: str(x.FindMe) in str(x.Search2), axis =1)
df["Present"] = np.where(df.apply(lambda x: str(x.FindMe) in str(x.Search1), axis =1) ==1,
df.Search1,
np.where(df.apply(lambda x: str(x.FindMe) in str(x.Search2), axis =1) ==1,
df.Search2,""))
print(df)
Like I say my "Present" column works as it should, returning the value of the column where its found. In reality, I have far more columns that I need to check and so yes I can create nested where's but this feels like there should be a better solution.
Any thoughts?
J
A list comprehension would do the job
df['Present'] = [[s for s in l if w in s]
                 for l, w in zip(df.filter(like='Search').to_numpy(), df['FindMe'])]
Search1 Search2 FindMe Present
0 one_two three_four three [three_four]
1 two_ten one_four one [one_four]
2 five_ten two_twelve nine []
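If you'd rather store a single string than a list (a sketch, assuming at most one match per row matters):
df['Present'] = [next((s for s in l if w in s), "") for l, w in
                 zip(df.filter(like='Search').to_numpy(), df['FindMe'])]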

How to make a scatter plot based on the values of a column in the data set?

I am given a data set that looks something like this
and I am trying to graph all the points with a 1 on the first column separate from the points with a 0, but I want to put them in the same chart.
I know the final result should be something similar to this
But I can't find a way to filter the points in Julia. I'm using LinearAlgebra, CSV, Plots, and DataFrames for my project, and so far I haven't found a way to make DataFrames storage types work nicely with Plots functions. I keep running into errors like "Cannot convert Float64 to series data for plotting" when I try to plot the points individually, using a for loop as a filter, as shown in the code below:
filter = select(data, :1)
newData = select(data, 2:3)

# graph one initial point to create the plot
plot(newData[1, 1], newData[1, 2], seriestype = :scatter, title = "My Scatter Plot")

# add the additional points with the 1 in front
for i in 2:size(newData)
    if filter[i] == 1
        plot!(newData[i, 1], newData[i, 2], seriestype = :scatter, title = "My Scatter Plot")
    end
end
Other approaches have given me other errors, but I haven't recorded those.
I'm using Julia 1.4.0 and the latest versions of all of the packages mentioned.
Quick Edit:
It might help to know that I am trying to replicate the Nonlinear dimensionality reduction section of this article https://sebastianraschka.com/Articles/2014_kernel_pca.html#principal-component-analysis
With Plots.jl you can do the following (I am passing a fully reproducible code):
julia> df = DataFrame(c=rand(Bool, 100), x = 2 .* rand(100) .- 1);
julia> df.y = ifelse.(df.c, 1, -1) .* df.x .^ 2;
julia> plot(df.x, df.y, color=ifelse.(df.c, "blue", "red"), seriestype=:scatter, legend=nothing)
However, in this case I would additionally use StatsPlots.jl as then you can just write:
julia> using StatsPlots
julia> @df df plot(:x, :y, group=:c, seriestype=:scatter, legend=nothing)
If you want to do it manually by groups it is easiest to use the groupby function:
julia> gdf = groupby(df, :c);
julia> summary(gdf) # check that we have 2 groups in data
"GroupedDataFrame with 2 groups based on key: c"
julia> plot(gdf[1].x, gdf[1].y, seriestype=:scatter, legend=nothing)
julia> plot!(gdf[2].x, gdf[2].y, seriestype=:scatter)
Note that the gdf variable is bound to a GroupedDataFrame object, from which you can get the groups defined by the grouping column (:c in this case).

Create matplotlib subplots without manually counting number of subplots?

When doing ad-hoc analysis in Jupyter Notebook, I often want to view sequences of transformations to some Pandas DataFrame as vertically stacked subplots. My usual quick-and-dirty method is to not use subplots at all, but create a new figure for each plot:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.DataFrame({"a": range(100)}) # Some arbitrary DataFrame
df.plot(title="0 to 100")
plt.show()
df = df * -1 # Some transformation
df.plot(title="0 to -100")
plt.show()
df = df * 2 # Some other transformation
df.plot(title="0 to -200")
plt.show()
This method has limitations. The x-axis ticks are unaligned even when identically indexed (because the x-axis width depends on y-axis labels) and the Jupyter cell output contains several separate inline images, not a single one that I can save or copy-and-paste.
As far as I know, the proper solution is to use plt.subplots():
fig, axes = plt.subplots(3, figsize=(20, 9))
df = pd.DataFrame({"a": range(100)}) # Arbitrary DataFrame
df.plot(ax=axes[0], title="0 to 100")
df = df * -1 # Some transformation
df.plot(ax=axes[1], title="0 to -100")
df = df * 2 # Some other transformation
df.plot(ax=axes[2], title="0 to -200")
plt.tight_layout()
plt.show()
This yields exactly the output I'd like. However, it also introduces an annoyance that makes me use the first method by default: I have to manually count the number of subplots I've created and update this count in several different places as the code changes.
In the multi-figure case, adding a fourth plot is as simple as calling df.plot() and plt.show() a fourth time. With subplots, the equivalent change requires updating the subplot count, plus arithmetic to resize the output figure, replacing plt.subplots(3, figsize=(20, 9)) with plt.subplots(4, figsize=(20, 12)). Each newly added subplot needs to know how many other subplots already exist (ax=axes[0], ax=axes[1], ax=axes[2], etc.), so any additions or removals require cascading changes to the plots below.
This seems like it should be trivial to automate — it's just counting and multiplication — but I'm finding it impossible to implement with the matplotlib/pyplot API. The closest I can get is the following partial solution, which is terse enough but still requires explicit counting:
n_subplots = 3 # Must still be updated manually as code changes
fig, axes = plt.subplots(n_subplots, figsize=(20, 3 * n_subplots))
i = 0 # Counts how many subplots have been added so far
df = pd.DataFrame({"a": range(100)}) # Arbitrary DataFrame
df.plot(ax=axes[i], title="0 to 100")
i += 1
df = df * -1 # Arbitrary transformation
df.plot(ax=axes[i], title="0 to -100")
i += 1
df = df * 2 # Arbitrary transformation
df.plot(ax=axes[i], title="0 to -200")
i += 1
plt.tight_layout()
plt.show()
The root problem is that any time df.plot() is called, there must exist an axes list of known size. I considered delaying the execution of df.plot() somehow, e.g. by appending to a list of lambda functions that can be counted before they're called in sequence, but this seems like an extreme amount of ceremony just to avoid updating an integer by hand.
Is there a more convenient way to do this? Specifically, is there a way to create a figure with an "expandable" number of subplots, suitable for ad-hoc/interactive contexts where the count is not known in advance?
(Note: This question may appear to be a duplicate of either this question or this one, but the accepted answers to both questions contain exactly the problem I'm trying to solve — that the nrows= parameter of plt.subplots() must be declared before adding subplots.)
First create an empty figure and then add subplots using add_subplot. Update the subplotspecs of the existing subplots in the figure using a new GridSpec for the new geometry (the figure keyword is only needed if you're using constrained layout instead of tight layout).
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
def append_axes(fig, as_cols=False):
    """Append new Axes to Figure."""
    n = len(fig.axes) + 1
    nrows, ncols = (1, n) if as_cols else (n, 1)
    gs = mpl.gridspec.GridSpec(nrows, ncols, figure=fig)
    for i, ax in enumerate(fig.axes):
        ax.set_subplotspec(mpl.gridspec.SubplotSpec(gs, i))
    return fig.add_subplot(nrows, ncols, n)
fig = plt.figure(layout='tight')
df = pd.DataFrame({"a": range(100)}) # Arbitrary DataFrame
df.plot(ax=append_axes(fig), title="0 to 100")
df = df * -1 # Some transformation
df.plot(ax=append_axes(fig), title="0 to -100")
df = df * 2 # Some other transformation
df.plot(ax=append_axes(fig), title="0 to -200")
Example for adding the new subplots as columns (and using constrained layout for a change):
fig = plt.figure(layout='constrained')
df = pd.DataFrame({"a": range(100)}) # Arbitrary DataFrame
df.plot(ax=append_axes(fig, True), title="0 to 100")
df = df + 10 # Some transformation
df.plot(ax=append_axes(fig, True), title="10 to 110")
IIUC you need some container for your transformations to achieve this - a list for example. Something like:
arbitrary_trx = [
    lambda x: x,       # No transformation
    lambda x: x * -1,  # Arbitrary transformation
    lambda x: x * 2]   # Arbitrary transformation

fig, axes = plt.subplots(nrows=len(arbitrary_trx))

for ax, f in zip(axes, arbitrary_trx):
    df = df.apply(f)
    df.plot(ax=ax)
You can create an object that stores the data and only creates the figure once you tell it to do so.
import pandas as pd
import matplotlib.pyplot as plt
class AxesStacker():
    def __init__(self):
        self.data = []
        self.titles = []

    def append(self, data, title=""):
        self.data.append(data)
        self.titles.append(title)

    def create(self):
        nrows = len(self.data)
        self.fig, self.axs = plt.subplots(nrows=nrows)
        for d, t, ax in zip(self.data, self.titles, self.axs.flat):
            d.plot(ax=ax, title=t)
stacker = AxesStacker()
df = pd.DataFrame({"a": range(100)}) # Some arbitrary DataFrame
stacker.append(df, title="0 to 100")
df = df * -1 # Some transformation
stacker.append(df, title="0 to -100")
df = df * 2 # Some other transformation
stacker.append(df, title="0 to -200")
stacker.create()
plt.show()

Combining Pandas Subplots into a Single Figure

I'm having trouble understanding Pandas subplots - and how to create axes so that all subplots are shown (not over-written by subsequent plot).
For each "Site", I want to make a time-series plot of all columns in the dataframe.
The "Sites" here are 'shark' and 'unicorn', both with 2 variables. The output should be be 4 plotted lines - the time-indexed plot for Var 1 and Var2 at each site.
Make Time-Indexed Data with Nans:
df = pd.DataFrame({
    # some ways to create random data
    'Var1': pd.np.random.randn(100),
    'Var2': pd.np.random.randn(100),
    'Site': pd.np.random.choice(['unicorn', 'shark'], 100),
    # a date range and set of random dates
    'Date': pd.date_range('1/1/2011', periods=100, freq='D'),
    # 'f': pd.np.random.choice(pd.date_range('1/1/2011', periods=365,
    #                          freq='D'), 100, replace=False)
})
df.set_index('Date', inplace=True)
df['Var2'] = df.Var2.cumsum()
df.loc['2011-01-31':'2011-04-01', 'Var1'] = pd.np.nan
Make a figure with a sub-plot for each site:
fig, ax = plt.subplots(len(df.Site.unique()), 1)
counter = 0
for site in df.Site.unique():
    print(site)
    sitedat = df[df.Site == site]
    sitedat.plot(subplots=True, ax=ax[counter], sharex=True)
    ax[0].title = site  # Set title of the plot to the name of the site
    counter = counter + 1
plt.show()
However, this is not working as written. The second sub-plot ends up overwriting the first. In my actual use case, I have a variable number of sites (14, currently) in each dataframe, as well as a variable number of 'Var1', 'Var2', ... columns. Thus, I need a solution that does not require creating each axis (ax0, ax1, ...) by hand.
As a bonus, I would love a title of each 'site' above that set of plots.
The current code over-writes the first 'Site' plot with the second. What I missing with the axes here?!
When you are using DataFrame.plot(..., subplots=True) you need to provide the correct number of axes that will be used for each column (and with the right geometry, if using layout=). In your example, you have 2 columns, so plot() needs two axes, but you are only passing one in ax=, therefore pandas has no choice but to delete all the axes and create the appropriate number of axes itself.
Therefore, you need to pass an array of axes of length corresponding to the number of columns you have in your dataframe.
# the grouper function is from the itertools cookbook
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

fig, axs = plt.subplots(len(df.Site.unique()) * (len(df.columns) - 1), 1, sharex=True)
for (site, sitedat), axList in zip(df.groupby('Site'), grouper(axs, len(df.columns) - 1)):
    sitedat.plot(subplots=True, ax=axList)
    axList[0].set_title(site)
plt.tight_layout()