how to do binning in discretization method and label it high low i have done this following method is there any short cut for binning and label itt - data-science

iris dataset
data.describe()
#WE USE DISCRETIZATION BECAUSE IT CONVERT CONTINUOUS DATA INTO DICRETE DATA
#WE DOING DISTRETIZATION FOR EACH COLUMN
data['Sepal.Length'] = pd.cut(data['Sepal.Length'], bins = [data['Sepal.Length'].min(), data['Sepal.Length'].mean(), data['Sepal.Length'].max()], labels = ["low","high"])
data['Sepal.Width'] = pd.cut(data['Sepal.Width'], bins = [data['Sepal.Width'].min(), data['Sepal.Width'].mean(), data['Sepal.Width'].max()], labels = ["low","high"])
data['Petal.Length'] = pd.cut(data['Petal.Length'], bins = [data['Petal.Length'].min(), data['Petal.Length'].mean(), data['Petal.Length'].max()], labels = ["low","high"])
data['Petal.Width'] = pd.cut(data['Petal.Width'], bins = [data['Petal.Width'].min(), data['Petal.Width'].mean(), data['Petal.Width'].max()], labels = ["low","high"])
#is there any method or short cut for this or by using for loop to discretized all columns at once

cols1 = ['Petal.Width','Petal.Length','Sepal.Width','Sepal.Length']
for i in cols1:
data[i] = pd.cut(data[i], bins = [data[i].min(), data[i].mean(), data[i].max()],labels = ["low","high"])
try to do it using a for loop

Related

mplcursors on multiaxis graph

In my program, im using mplcursors on a matplotlib graph so I can identify certain points precisely.
mplcursors.cursor(multiple=True).connect("add", lambda sel: sel.annotation.draggable(False))
Now I made a complex graph with multiple axis:
first = 1
offset = 60
for x in range(len(cat_list)):
if "Time" not in cat_list[x]:
if first and not cat_list[x].startswith("EngineSpeed"):
parasites[x] = ParasiteAxes(host, sharex = host)
host.parasites.append(parasites[x])
parasites[x].axis["right"].set_visible(True)
parasites[x].set_ylabel(cat_list[x])
parasites[x].axis["right"].major_ticklabels.set_visible(True)
parasites[x].axis["right"].label.set_visible(True)
p_plot, = parasites[x].plot(t, t_num_list[x], label = cat_list[x])
#parasites[x].axis["right"+str(x+1)].label.set_color(p_plot.get_color())
parasites[x].axis["right"].label.set_color(p_plot.get_color())
first = 0
elif not cat_list[x].startswith("EngineSpeed"):
parasites[x] = ParasiteAxes(host, sharex = host)
host.parasites.append(parasites[x])
parasites[x].set_ylabel(cat_list[x])
new_axisline = parasites[x].get_grid_helper().new_fixed_axis
parasites[x].axis["right"+str(x+1)] = new_axisline(loc = "right",
axes = parasites[x],
offset = (offset, 0))
p_plot, = parasites[x].plot(t, t_num_list[x])
parasites[x].axis["right"+str(x+1)].label.set_color(p_plot.get_color())
offset = offset + 60
host.legend()
fig.add_axes(host)
plt.show()
This code results in the following graph:
https://i.stack.imgur.com/Wl7yC.png
Now I have to somehow be able to select certain points by selecting which axis im using. How do I make a selection menu for choosing an active axis and how do I then use mplcursors to select my points?
Thanks,
Ziga

How to show the class distribution in Dataset object in Tensorflow

I am working on a multi-class classification task using my own images.
filenames = [] # a list of filenames
labels = [] # a list of labels corresponding to the filenames
full_ds = tf.data.Dataset.from_tensor_slices((filenames, labels))
This full dataset will be shuffled and split into train, valid and test dataset
full_ds_size = len(filenames)
full_ds = full_ds.shuffle(buffer_size=full_ds_size*2, seed=128) # seed is used for reproducibility
train_ds_size = int(0.64 * full_ds_size)
valid_ds_size = int(0.16 * full_ds_size)
train_ds = full_ds.take(train_ds_size)
remaining = full_ds.skip(train_ds_size)
valid_ds = remaining.take(valid_ds_size)
test_ds = remaining.skip(valid_ds_size)
Now I am struggling to understand how each class is distributed in train_ds, valid_ds and test_ds. An ugly solution is to iterate all the element in the dataset and count the occurrence of each class. Is there any better way to solve it?
My ugly solution:
def get_class_distribution(dataset):
class_distribution = {}
for element in dataset.as_numpy_iterator():
label = element[1]
if label in class_distribution.keys():
class_distribution[label] += 1
else:
class_distribution[label] = 0
# sort dict by key
class_distribution = collections.OrderedDict(sorted(class_distribution.items()))
return class_distribution
train_ds_class_dist = get_class_distribution(train_ds)
valid_ds_class_dist = get_class_distribution(valid_ds)
test_ds_class_dist = get_class_distribution(test_ds)
print(train_ds_class_dist)
print(valid_ds_class_dist)
print(test_ds_class_dist)
The answer below assumes:
there are five classes.
labels are integers from 0 to 4.
It can be modified to suit your needs.
Define a counter function:
def count_class(counts, batch, num_classes=5):
labels = batch['label']
for i in range(num_classes):
cc = tf.cast(labels == i, tf.int32)
counts[i] += tf.reduce_sum(cc)
return counts
Use the reduce operation:
initial_state = dict((i, 0) for i in range(5))
counts = train_ds.reduce(initial_state=initial_state,
reduce_func=count_class)
print([(k, v.numpy()) for k, v in counts.items()])
A solution inspired by user650654 's answer, only using TensorFlow primitives (with tf.unique_with_counts instead of for loop):
In theory, this should have better performance and scale better to large datasets, batches or class count.
num_classes = 5
#tf.function
def count_class(counts, batch):
y, _, c = tf.unique_with_counts(batch[1])
return tf.tensor_scatter_nd_add(counts, tf.expand_dims(y, axis=1), c)
counts = train_ds.reduce(
initial_state=tf.zeros(num_classes, tf.int32),
reduce_func=count_class)
print(counts.numpy())
Similar and simpler version with numpy that actually had better performances for my simple use-case:
count = np.zeros(num_classes, dtype=np.int32)
for _, labels in train_ds:
y, _, c = tf.unique_with_counts(labels)
count[y.numpy()] += c.numpy()
print(count)

how to make a memory efficient multiple dimension groupby/stack using xarray?

I have a large time series of np.float64 with a 5-min frequency (size is ~2,500,000 ~=24 years).
I'm using Xarray to represent it in-memory and the time-dimension is named 'time'.
I want to group-by 'time.hour' and then 'time.dayofyear' (or vice-versa) and remove both their mean from the time-series.
In order to do that efficiently, i need to reorder the time-series into a new xr.DataArray with the dimensions of ['hour', 'dayofyear', 'rest'].
I wrote a function that plays with the GroupBy objects of Xarray and manages to do just that although it takes a lot of memory to do that...
I have a machine with 32GB RAM and i still get the MemoryError from numpy.
I know the code works because i used it on an hourly re-sampled version of my original time-series. so here's the code:
def time_series_stack(time_da, time_dim='time', grp1='hour', grp2='dayofyear'):
"""Takes a time-series xr.DataArray objects and reshapes it using
grp1 and grp2. outout is a xr.Dataset that includes the reshaped DataArray
, its datetime-series and the grps."""
import xarray as xr
import numpy as np
import pandas as pd
# try to infer the freq and put it into attrs for later reconstruction:
freq = pd.infer_freq(time_da[time_dim].values)
name = time_da.name
time_da.attrs['freq'] = freq
attrs = time_da.attrs
# drop all NaNs:
time_da = time_da.dropna(time_dim)
# group grp1 and concat:
grp_obj1 = time_da.groupby(time_dim + '.' + grp1)
s_list = []
for grp_name, grp_inds in grp_obj1.groups.items():
da = time_da.isel({time_dim: grp_inds})
s_list.append(da)
grps1 = [x for x in grp_obj1.groups.keys()]
stacked_da = xr.concat(s_list, dim=grp1)
stacked_da[grp1] = grps1
# group over the concatenated da and concat again:
grp_obj2 = stacked_da.groupby(time_dim + '.' + grp2)
s_list = []
for grp_name, grp_inds in grp_obj2.groups.items():
da = stacked_da.isel({time_dim: grp_inds})
s_list.append(da)
grps2 = [x for x in grp_obj2.groups.keys()]
stacked_da = xr.concat(s_list, dim=grp2)
stacked_da[grp2] = grps2
# numpy part:
# first, loop over both dims and drop NaNs, append values and datetimes:
vals = []
dts = []
for i, grp1_val in enumerate(stacked_da[grp1]):
da = stacked_da.sel({grp1: grp1_val})
for j, grp2_val in enumerate(da[grp2]):
val = da.sel({grp2: grp2_val}).dropna(time_dim)
vals.append(val.values)
dts.append(val[time_dim].values)
# second, we get the max of the vals after the second groupby:
max_size = max([len(x) for x in vals])
# we fill NaNs and NaT for the remainder of them:
concat_sizes = [max_size - len(x) for x in vals]
concat_arrys = [np.empty((x)) * np.nan for x in concat_sizes]
concat_vals = [np.concatenate(x) for x in list(zip(vals, concat_arrys))]
# 1970-01-01 is the NaT for this time-series:
concat_arrys = [np.zeros((x), dtype='datetime64[ns]')
for x in concat_sizes]
concat_dts = [np.concatenate(x) for x in list(zip(dts, concat_arrys))]
concat_vals = np.array(concat_vals)
concat_dts = np.array(concat_dts)
# finally , we reshape them:
concat_vals = concat_vals.reshape((stacked_da[grp1].shape[0],
stacked_da[grp2].shape[0],
max_size))
concat_dts = concat_dts.reshape((stacked_da[grp1].shape[0],
stacked_da[grp2].shape[0],
max_size))
# create a Dataset and DataArrays for them:
sda = xr.Dataset()
sda.attrs = attrs
sda[name] = xr.DataArray(concat_vals, dims=[grp1, grp2, 'rest'])
sda[time_dim] = xr.DataArray(concat_dts, dims=[grp1, grp2, 'rest'])
sda[grp1] = grps1
sda[grp2] = grps2
sda['rest'] = range(max_size)
return sda
So for the 2,500,000 items time-series, numpy throws the MemoryError so I'm guessing this has to be my memory bottle-neck. What can i do to solve this ?
Would Dask help me ? and if so how can i implement it ?
Like you, I ran it without issue when inputting a small time series (10,000 long). However, when inputting a 100,000 long time series xr.DataArraythe grp_obj2 for loop ran away and used all the memory of the system.
This is what I used to generate the time series xr.DataArray:
n = 10**5
times = np.datetime64('2000-01-01') + np.arange(n) * np.timedelta64(5,'m')
data = np.random.randn(n)
time_da = xr.DataArray(data, name='rand_data', dims=('time'), coords={'time': times})
# time_da.to_netcdf('rand_time_series.nc')
As you point out, Dask would be a way to solve it but I can't see a clear path at the moment...
Typically, the kind of problem with Dask would be to:
Make the input a dataset from a file (like NetCDF). This will not load the file in memory but allow Dask to pull data from disk one chunk at a time.
Define all calculations with dask.delayed or dask.futures methods for entire body of code up until the writing the output. This is what allows Dask to chunk a small piece of data to read then write.
Calculate one chunk of work and immediately write output to new dataset file. Effectively you ending up steaming one chunk of input to one chunk of output at a time (but also threaded/parallelized).
I tried importing Dask and breaking the input time_da xr.DataArray into chunks for Dask to work on but it didn't help. From what I can tell, the line stacked_da = xr.concat(s_list, dim=grp1) forces Dask to make a full copy of stacked_da in memory and much more...
One workaround to this is to write stacked_da to disk then immediately read it again:
##For group1
xr.concat(s_list, dim=grp1).to_netcdf('stacked_da1.nc')
stacked_da = xr.load_dataset('stacked_da1.nc')
stacked_da[grp1] = grps1
##For group2
xr.concat(s_list, dim=grp2).to_netcdf('stacked_da2.nc')
stacked_da = xr.load_dataset('stacked_da2.nc')
stacked_da[grp2] = grps2
However, the file size for stacked_da1.nc is 19MB and stacked_da2.nc gets huge at 6.5GB. This is for time_da with 100,000 elements... so there's clearly something amiss...
Originally, it sounded like you want to subtract the mean of the groups from the time series data. It looks like Xarray docs has an example for that. http://xarray.pydata.org/en/stable/groupby.html#grouped-arithmetic
The key is to group once and loop over the groups and then group again on each of the groups and append it to list.
Next i concat and use pd.MultiIndex.from_product for the groups.
No Memory problems and no Dask needed and it only takes a few seconds to run.
here's the code, enjoy:
def time_series_stack(time_da, time_dim='time', grp1='hour', grp2='month',
plot=True):
"""Takes a time-series xr.DataArray objects and reshapes it using
grp1 and grp2. output is a xr.Dataset that includes the reshaped DataArray
, its datetime-series and the grps. plots the mean also"""
import xarray as xr
import pandas as pd
# try to infer the freq and put it into attrs for later reconstruction:
freq = pd.infer_freq(time_da[time_dim].values)
name = time_da.name
time_da.attrs['freq'] = freq
attrs = time_da.attrs
# drop all NaNs:
time_da = time_da.dropna(time_dim)
# first grouping:
grp_obj1 = time_da.groupby(time_dim + '.' + grp1)
da_list = []
t_list = []
for grp1_name, grp1_inds in grp_obj1.groups.items():
da = time_da.isel({time_dim: grp1_inds})
# second grouping:
grp_obj2 = da.groupby(time_dim + '.' + grp2)
for grp2_name, grp2_inds in grp_obj2.groups.items():
da2 = da.isel({time_dim: grp2_inds})
# extract datetimes and rewrite time coord to 'rest':
times = da2[time_dim]
times = times.rename({time_dim: 'rest'})
times.coords['rest'] = range(len(times))
t_list.append(times)
da2 = da2.rename({time_dim: 'rest'})
da2.coords['rest'] = range(len(da2))
da_list.append(da2)
# get group keys:
grps1 = [x for x in grp_obj1.groups.keys()]
grps2 = [x for x in grp_obj2.groups.keys()]
# concat and convert to dataset:
stacked_ds = xr.concat(da_list, dim='all').to_dataset(name=name)
stacked_ds[time_dim] = xr.concat(t_list, 'all')
# create a multiindex for the groups:
mindex = pd.MultiIndex.from_product([grps1, grps2], names=[grp1, grp2])
stacked_ds.coords['all'] = mindex
# unstack:
ds = stacked_ds.unstack('all')
ds.attrs = attrs
return ds

add more space to the color_row in sns.clustermap

How to add more spacing to column/row_colors? It looks it gets really dense after adding 3 or more rows
#modified example data is from this question: https://stackoverflow.com/questions/48173798/additional-row-colors-in-seaborn-cluster-map
matrix = pd.DataFrame(np.random.random_integers(0,1, size=(50,4)))
labels = np.random.random_integers(0,5, size=50)
lut = dict(zip(set(labels), sns.hls_palette(len(set(labels)), l=0.5, s=0.8)))
row_colors = pd.DataFrame(labels)[0].map(lut)
#Create additional row_colors here
labels2 = np.random.random_integers(0,1, size=50)
lut2 = dict(zip(set(labels2), sns.hls_palette(len(set(labels2)), l=0.5, s=0.8)))
row_colors2 = pd.DataFrame(labels2)[0].map(lut2)
# add more rows
row_colors = pd.concat([row_colors,row_colors,row_colors2,row_colors2,row_colors2],axis=1)
g=sns.clustermap(matrix, col_cluster=False, linewidths=0.1, cmap='coolwarm', row_colors=row_colors)
plt.show()
cluster_example
You can adjust the linewidths parameter.
g=sns.clustermap(matrix, col_cluster=False, linewidths=0.9, cmap='coolwarm', row_colors=row_colors) will give you more spacing.

Numpy: regrid by averaging?

I'm trying to regrid a numpy array onto a new grid. In this specific case, I'm trying to regrid a power spectrum onto a logarithmic grid so that the data are evenly spaced logarithmically for plotting purposes.
Doing this with straight interpolation using np.interp results in some of the original data being ignored entirely. Using digitize gets the result I want, but I have to use some ugly loops to get it to work:
xfreq = np.fft.fftfreq(100)[1:50] # only positive, nonzero freqs
psw = np.arange(xfreq.size) # dummy array for MWE
# new logarithmic grid
logfreq = np.logspace(np.log10(np.min(xfreq)), np.log10(np.max(xfreq)), 100)
inds = np.digitize(xfreq,logfreq)
# interpolation: ignores data *but* populates all points
logpsw = np.interp(logfreq, xfreq, psw)
# so average down where available...
logpsw[np.unique(inds)] = [psw[inds==i].mean() for i in np.unique(inds)]
# the new plot
loglog(logfreq, logpsw, linewidth=0.5, color='k')
Is there a nicer way to accomplish this in numpy? I'd be satisfied with just a replacement of the inline loop step.
You can use bincount() twice to calculate the average value of every bins:
logpsw2 = np.interp(logfreq, xfreq, psw)
counts = np.bincount(inds)
mask = counts != 0
logpsw2[mask] = np.bincount(inds, psw)[mask] / counts[mask]
or use unique(inds, return_inverse=True) and bincount() twice:
logpsw4 = np.interp(logfreq, xfreq, psw)
uinds, inv_index = np.unique(inds, return_inverse=True)
logpsw4[uinds] = np.bincount(inv_index, psw) / np.bincount(inv_index)
Or if you use Pandas:
import pandas as pd
logpsw4 = np.interp(logfreq, xfreq, psw)
s = pd.groupby(pd.Series(psw), inds).mean()
logpsw4[s.index] = s.values