In Pandas, how can a DataFrame be binned by two columns, with the other columns changed to the means within those bins? - pandas

I've got the standard iris dataset projected down to two dimensions using UMAP, with the UMAP dimensions for the x and y positions of the 2D plot added as columns to the dataframe:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
import umap # pip install umap-learn
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Series(iris.target).map(dict(zip(range(3), iris.target_names)))
_umap = umap.UMAP().fit_transform(iris.data)
iris_df['UMAP_x'] = _umap[:,0]
iris_df['UMAP_y'] = _umap[:,1]
iris_df.head()
I'd like to bin both the UMAP_x and UMAP_y columns into like 25 bins and then the other columns in the dataframe change to being the mean values of the columns in each of the bins. How might this be done? It feels like cut or resampling might lead to the answer, but I'm not sure how.

You can use cut to define bins and then use groupby with transform to calculate mean value for each bin.
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
import umap
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Series(iris.target).map(dict(zip(range(3), iris.target_names)))
_umap = umap.UMAP().fit_transform(iris.data)
iris_df['UMAP_x'] = _umap[:,0]
iris_df['UMAP_y'] = _umap[:,1]
# Define bins for UMAP_x and UMAP_y params
iris_df['UMAP_x_bin'] = pd.cut(iris_df['UMAP_x'], bins=25)
iris_df['UMAP_y_bin'] = pd.cut(iris_df['UMAP_y'], bins=25)
# Calculate mean value for each bin
iris_df['UMAP_x_mean'] = iris_df.groupby('UMAP_x_bin')['UMAP_x'].transform('mean')
iris_df['UMAP_y_mean'] = iris_df.groupby('UMAP_y_bin')['UMAP_y'].transform('mean')
iris_df.head()

Related

how to display netcdf raster values over map?

I'm trying to plot netcdf raster values of snowfall data in a text format overlaying what I currently have (mentioned further below). Example, something like this below:
Example
This is all the relevant code I have so far. I excluded the non relevant code. I tried plt.text and it gave me "ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()"
What I have plotted so far
import numpy
from datetime import datetime
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.mpl.ticker as cticker
import matplotlib.pyplot as plt
from matplotlib import ticker, patheffects
from metpy.units import units
import numpy as np
import numpy.ma as ma
from scipy.ndimage import gaussian_filter, maximum_filter, minimum_filter
import xarray as xr
from metpy.plots import USCOUNTIES
from gradient import Gradient
import pandas as pd
import matplotlib.colors as col
#Open NOAA Snowfall dataset
ds = xr.open_dataset('sfav2_CONUS_2021093012_to_2022042512.nc')
ds
lat = ds.lat
lon = ds.lon
#converts snowfall data to inches
snowdata = ds['Data'] * 39
plt.text(lon, lat, snowdata, transform=datacrs)
As far as I know there isn't a vectorized way of plotting text (plt.text or plt.annotated). So you'll have to loop over the arrays and plot each point.
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import cartopy.crs as ccrs
import numpy as np
data = np.random.rand(18, 9)
lons, lats = np.mgrid[-17:18:2, 8:-9:-2]
lons = lons * 10
lats = lats * 10
fig, ax = plt.subplots(figsize=(10, 5), dpi=86, facecolor="w", subplot_kw=dict(projection=ccrs.EqualEarth()))
ax.pcolormesh(lons, lats, data, cmap="coolwarm", alpha=.2, transform=ccrs.PlateCarree())
ax.coastlines()
for val, lat, lon in zip(data.flat, lats.flat, lons.flat):
ax.text(
lon, lat, f"{val:1.1f}", ha="center", va="center", transform=ccrs.PlateCarree(),
path_effects=[PathEffects.withStroke(linewidth=3, foreground="w", alpha=.5)],
)

Creating 3D scatter chart in Taipy

I was wondering how one would create a 3D scatter chart in Taipy.
I tried this code initially:
import pandas as pd
import numpy as np
from taipy import Gui
df = pd.DataFrame(np.random.randint(0,100,size=(100, 3)), columns=list('xyz'))
df['cluster1']=np.random.randint(0,3,100)
my_page ="""
Creation of a 3-D chart:
<|{df}|chart|type=Scatter3D|x=x|y=y|z=z|mode=markers|color=cluster|>
"""
Gui(page=my_page).run()
This does indeed display a 3D plot, but the colors (clusters) do not show up.
Any hint?
Yes, you need some massaging of your dataframes to do it.
Here's a sample code that achieves this:
import pandas as pd
import numpy as np
from taipy import Gui
df = pd.DataFrame(np.random.randint(0,100,size=(100, 3)), columns=list('xyz'))
df['cluster1']=np.random.randint(0,3,100)
# Create a list of 3 dataframes, one per cluster
datas = [df[df['cluster1']==i] for i in range(3)]
properties = {
}
# create dynamically the property list.
# str(i) points to a dataframe index
# "/x" points to the column value in the selected dataframe
for i in range(len(datas)):
properties[f"x[{i+1}]"] = str(i)+"/x"
properties[f"y[{i+1}]"] = str(i)+"/y"
properties[f"z[{i+1}]"] = str(i)+"/z"
properties[f'name[{i+1}]'] = str(i+1)
print(properties)
chart = "<|{datas}|chart|type=Scatter3D|properties={properties}|mode=markers|height=800px|>"
Gui(page=chart).run()
In fact, with the new release: Taipy 1.1, this is very easy to do in a few lines of code:
import pandas as pd
import numpy as np
from taipy import Gui
color_map={0:"blue",1:'green', 2:"red"}
df = pd.DataFrame(np.random.randint(0,100,size=(100, 3)), columns=list('xyz'))
df['cluster1'] = np.random.randint(0,3,100)
df['cluster_colors'] = df.apply(lambda row: color_map[row.cluster1], axis=1)
marker = {"color":"cluster_colors"}
chart = "<|{df}|chart|type=Scatter3D|x=x|y=y|z=z|marker={marker}|mode=markers|height=800px|>"
Gui(page=chart).run()
If you want to leave it to Taipy to pick the colors for you, then you can simply use:
import pandas as pd
import numpy as np
from taipy import Gui
df = pd.DataFrame(np.random.randint(0,100,size=(100, 3)), columns=list('xyz'))
df['cluster1'] = np.random.randint(0,3,100)
marker = {"color":"cluster1"}
chart = "<|{df}|chart|type=Scatter3D|x=x|y=y|z=z|marker={marker}|mode=markers|height=800px|>"
Gui(page=chart).run()

How to show the peaks of pmf by matplotlib and scipy?

this is the code(I want to know the peak of the picture but I don't know how to add this kind of code)
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from scipy import stats
n=25
p=0.6
k=np.arange(0,50)
#the pmf forming
picture=stats.binom.pmf(k,n,p)
print(picture)
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
mean,var,skew,kurt=stats.binom.stats(n,p,moments='mvsk')
print(mean,var,skew,kurt)
#the picture forming
plt.plot(k,picture,'o-')
plt.grid(True)
plt.show()
You can use scatter
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from scipy import stats
n=25
p=0.6
k=np.arange(0,50)
#the pmf forming
picture=stats.binom.pmf(k,n,p)
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
mean,var,skew,kurt=stats.binom.stats(n,p,moments='mvsk')
print(mean,var,skew,kurt)
#the picture forming
plt.plot(k,picture,'o-')
plt.grid(True)
# the two new lines
max_ind = np.argmax(picture)
plt.scatter(x=k[max_ind],y=picture[max_ind],c='r',s=100,zorder=10)
and this produces

How to convert normal to uniform in sns pair plot

Code is below
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
text = '''id,revenue ,profit,Label
101,779183,281257,1
102,144829,838451,1
103,766465,757565,-1'''
df = pd.read_csv(StringIO(text))
df = df[df.columns[1::]]
sns_plot = sns.pairplot(df,hue='Label')
My Picture is below
How to change the normal distribution to uniform distribution in the sns pairplot

seaborn.swarmplot problem with symlog scale: zero's are not expanded

I have a data set of positive values and zero's that I would like to show on the log scale. To represent zero's I use 'symlog' option, but all zero values are mapped into one point on swarmplot. How to fix it?
import numpy as np
import seaborn as sns
import pandas as pd
import random
import matplotlib.pyplot as plt
n = 100
x = np.concatenate(([0]*n,np.linspace(0,1,n),[5]*n,np.linspace(10,100,n),np.linspace(100,1000,n)),axis=None)
data = pd.DataFrame({'value': x, 'category': random.choices([0,1,2,3], k=len(x))})
f, ax = plt.subplots(figsize=(10, 6))
ax.set_yscale("symlog",linthreshy=1.e-2)
ax.set_ylim(ymax=1000)
sns.swarmplot(x="category", y="value", data=data)
sns.despine(left=True)
link to the resulting plot