Custom Class raises error when trying to call fit_transform - pandas

I have created custom classes that I want to use with scikit-learn Pipelines and FeatureUnions.
Each class takes as input a data frame with 2 columns: 0, 1.
Each row in the first column is a numpy array, and each row in the second column is a number.
Given these two inputs, I have created a static method that calculates 40 values for each row of the data frame.
So if the original data frame is of shape 100 x 2, the output should be 100 x 40.
I've used the BaseEstimator and TransformerMixin base classes to get the fit_transform method easily.
When I call the fit_transform method I get the following error:
ValueError: Shape of passed values is (288, 40), indices imply (288, 2)
class MelFrequencyCepstralCoefficientsCalculator(BaseEstimator, TransformerMixin):
    """Outputs Mel-frequency cepstral coefficients"""

    def __init__(self):
        pass

    @staticmethod
    def calculate_coef(file_array: np.array, sample_rate):
        """
        :param file_array:
        :param sample_rate:
        :return:
        """
        # mfcc: Mel-frequency cepstral coefficients for each row;
        # this output is an array of shape (40,)
        mfccs = np.mean(librosa.feature.mfcc(y=file_array,
                                             sr=sample_rate,
                                             n_mfcc=40).T, axis=0)
        return mfccs

    def transform(self, X, y=None) -> np.array:
        print('Calculating Mel-frequency cepstral coefficients')
        res = X.progress_apply(lambda row: self.calculate_coef(row[0], row[1]),
                               axis=1).values
        print('Output shape: {}'.format(res.shape))
        return res

    def fit(self, X, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

mfccs = MelFrequencyCepstralCoefficientsCalculator()
x_val = mfccs.fit_transform(x_val_raw)
Shouldn't I be able to have a custom class that creates more than one feature in a sklearn Pipeline?
PS: The progress_apply is valid. I'm using the tqdm module, which extends apply, in order to have a progress bar for each calculation.
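For reference, progress_apply is what tqdm registers on pandas objects; a minimal sketch of that setup (assuming a recent tqdm version):

from tqdm import tqdm
import pandas as pd

tqdm.pandas()  # registers progress_apply on DataFrame / Series / GroupBy objects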
The workaround that is much slower but works is the following:
def transform(self, X: pd.DataFrame, y=None) -> np.array:
    """
    :param X:
    :param y:
    :return:
    """
    print('Calculating Mel-frequency cepstral coefficients')
    # res = X.apply(lambda row: self.calculate_coef(row[0], row[1]), axis=1)
    features = np.empty((0, 40))
    rows = X.iterrows()
    for row in tqdm(rows):
        ext_features = self.calculate_coef(row[1][0], row[1][1])
        features = np.vstack([features, ext_features])
    print('Output shape: {}'.format(features.shape))
    return features
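For what it's worth, a possible middle ground (a sketch under the same assumptions about X's layout, untested on the original data): keep progress_apply for the progress bar, but stack the resulting Series of (40,) arrays into one 2-D array instead of returning .values of an object Series.

def transform(self, X: pd.DataFrame, y=None) -> np.array:
    print('Calculating Mel-frequency cepstral coefficients')
    # each element of `res` is a (40,) array; np.vstack gives (n_rows, 40)
    res = X.progress_apply(lambda row: self.calculate_coef(row[0], row[1]), axis=1)
    return np.vstack(res.values)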

Related

Receiving the same (not random) augmentations of image dataset

dataset = tf.data.Dataset.range(1, 6)

def aug(y):
    x = np.random.uniform(0, 1)
    if x > 0.5:
        y = 100
    return y

dataset = dataset.map(aug)
print(list(dataset))
If you run this code, the elements in the dataset come out either all unchanged or all equal to 100. How do I make it so each element is individually transformed?
My more specific question below is basically asking this
I create my segmentation training set by:
dataset = tf.data.Dataset.from_tensor_slices((image_paths, mask_paths))
I then apply my augmentation function to the dataset:
def augment(image_path, mask_path):
    # use tf.io.read_file and tf.io.decode_jpeg to convert paths to tensors
    x = np.random.choice([0, 1])
    if x == 1:
        image = tf.image.flip_up_down(image)
        mask = tf.image.flip_up_down(mask)
    return image, mask

training_dataset = dataset.map(augment)
BATCH_SIZE=2
training_dataset = training_dataset.shuffle(100, reshuffle_each_iteration=True)
training_dataset = training_dataset.batch(BATCH_SIZE)
training_dataset = training_dataset.repeat()
training_dataset = training_dataset.prefetch(-1)
However, when I visualise my training dataset, all the images have the same flip applied: they are all either flipped upside down or not flipped at all, whereas I'm expecting a mix of different flips.
Why is this happening?
You need to use TensorFlow operations (not numpy or plain Python) because tf.data.Dataset.map() executes the mapped function as a graph. When a function is converted to a graph, numpy calls and plain Python values are baked in as constants. So your augmentation function runs np.random.uniform(0, 1) only once and stores the result as a constant.
Note that irrespective of the context in which map_func is defined (eager vs. graph), tf.data traces the function and executes it as a graph.
The source for the above is here.
One solution is to use tensorflow operations. I have included an example below. Note that the y value in the if has to be cast to the same dtype as the input.
dataset = tf.data.Dataset.range(1, 6)

def aug(y):
    x = tf.random.uniform([], 0, 1)
    if x > 0.5:
        y = tf.cast(100, y.dtype)
    return y

dataset = dataset.map(aug)
print(list(dataset))
You can use a uniform random function or another probability distribution:
tf.random.uniform(
    shape, minval=0, maxval=None, dtype=tf.dtypes.float32, seed=None, name=None
)
You can even use a prebuilt method in TensorFlow or Keras for flipping:
tf.keras.layers.experimental.preprocessing.RandomFlip(
    mode=HORIZONTAL_AND_VERTICAL, seed=None, name=None, **kwargs
)
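Applying the same idea to the segmentation case in the question, a minimal sketch (assuming image and mask are already decoded tensors inside the mapped function) that draws one TensorFlow random number per element and flips both consistently:

def augment(image, mask):
    # one TF random draw per element; traced correctly by dataset.map
    flip = tf.random.uniform([], 0, 1)
    image = tf.cond(flip > 0.5,
                    lambda: tf.image.flip_up_down(image),
                    lambda: image)
    mask = tf.cond(flip > 0.5,
                   lambda: tf.image.flip_up_down(mask),
                   lambda: mask)
    return image, mask

training_dataset = dataset.map(augment)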

Decision boundary in perceptron not correct

I was preparing some code for a lecture and re-implemented a simple perceptron: 2 inputs and 1 output. Aim: a linear classifier.
Here's the code that creates the data, sets up the perceptron, and trains it:
from ipywidgets import interact
import numpy as np
import matplotlib.pyplot as plt

# Two random clouds
x = [(1, 3)] * 10 + [(3, 1)] * 10
x = np.asarray([(i + np.random.rand(), j + np.random.rand()) for i, j in x])

# Colors
cs = "m" * 10 + "b" * 10

# Classes
y = [0] * 10 + [1] * 10

class Perceptron:
    def __init__(self):
        self.w = np.random.randn(3)
        self.lr = 0.01

    def train(self, x, y, verbose=False):
        errs = 0.
        for xi, yi in zip(x, y):
            x_ = np.insert(xi, 0, 1)
            r = self.w @ x_
            ######## HERE IS THE MAGIC HAPPENING #####
            r = r >= 0
            ##########################################
            err = float(yi) - float(r)
            errs += np.abs(err)
            if verbose:
                print(yi, r)
            self.w = self.w + self.lr * err * x_
        return errs

    def predict(self, x):
        return np.round(self.w @ np.insert(x, 0, 1, 1).T)

    def decisionLine(self):
        w = self.w
        slope = -(w[0] / w[2]) / (w[0] / w[1])
        intercept = -w[0] / w[2]
        return slope, intercept

p = Perceptron()

line_properties = []
errs = []
for i in range(20):
    errs.append(p.train(x, y, True if i == 999 else False))
    line_properties.append(p.decisionLine())

print(p.predict(x))  # works like a charm!

@interact
def showLine(i: (0, len(line_properties) - 1, 1) = 0):
    xs = np.linspace(1, 4)
    a, b = line_properties[i]
    ys = a * xs + b
    plt.scatter(*x.T)
    plt.plot(xs, ys, "k--")
At the end, I am calculating the decision boundary, i.e. the linear equation separating classes 0 and 1. However, it seems to be off. I tried inverting it etc., but have no clue what is wrong. Interestingly, if I change the learning rule to
self.w = self.w + self.lr * err / x_
i.e. dividing by x_, it works properly. I am totally confused. Does anyone have an idea?
Solved for real
Now I added one small but very important part to the Perceptron that I had simply forgotten (and maybe others may forget it as well): you have to apply the thresholded activation, r = r >= 0, so that the decision is centered on 0; then it works. This is basically the answer below. If you don't do this, you have to change the classes to get the center back at 0. Currently, I prefer having the classes -1 and 1, as this gives a better (centered) decision line instead of a line that is very close to one of the data clouds.
Before: (plot omitted)
Now: (plot omitted)
You are creating a linear regression (not logistic regression!) with targets 0 and 1. And the line you plot is the line where the model predicts 0, so it should ideally cut through the cloud of points labeled 0, as in your first plot.
If you don't want to implement the sigmoid for logistic regression, then at least you will want to display a boundary line that corresponds to a value of 0.5 rather than 0.
As for inverting the weights providing a plot that looks like what you want, I think that's just a coincidence of this data.
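To make the 0.5 suggestion concrete, here is a hypothetical helper (same weight layout w = [bias, w1, w2] as the code above): solving w[0] + w[1]*x + w[2]*y = level for y gives the boundary line at any prediction level.

def decision_line(w, level=0.5):
    # w[0] + w[1]*x + w[2]*y = level  =>  y = (level - w[0]) / w[2] - (w[1] / w[2]) * x
    slope = -w[1] / w[2]
    intercept = (level - w[0]) / w[2]
    return slope, intercept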

How to show percentage in Seaborn countplot [duplicate]

I was wondering if it is possible to create a Seaborn count plot, but instead of actual counts on the y-axis, show the relative frequency (percentage) within its group (as specified with the hue parameter).
I sort of fixed this with the following approach, but I can't imagine this is the easiest approach:
# Plot percentage of occupation per income class
grouped = df.groupby(['income'], sort=False)
occupation_counts = grouped['occupation'].value_counts(normalize=True, sort=False)
occupation_data = [
    {'occupation': occupation, 'income': income, 'percentage': percentage * 100} for
    (income, occupation), percentage in dict(occupation_counts).items()
]
df_occupation = pd.DataFrame(occupation_data)
p = sns.barplot(x="occupation", y="percentage", hue="income", data=df_occupation)
_ = plt.setp(p.get_xticklabels(), rotation=90)  # Rotate labels
Result: (plot omitted)
I'm using the well-known adult data set from the UCI Machine Learning Repository. The pandas dataframe is created like this:
# Read the adult dataset
df = pd.read_csv(
    "data/adult.data",
    engine='c',
    lineterminator='\n',
    names=['age', 'workclass', 'fnlwgt', 'education', 'education_num',
           'marital_status', 'occupation', 'relationship', 'race', 'sex',
           'capital_gain', 'capital_loss', 'hours_per_week',
           'native_country', 'income'],
    header=None,
    skipinitialspace=True,
    na_values="?"
)
This question is sort of related, but does not make use of the hue parameter. And in my case I cannot just change the labels on the y-axis, because the height of the bar must depend on the group.
With newer versions of seaborn you can do the following:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

df = sns.load_dataset('titanic')
df.head()

x, y = 'class', 'survived'

(df
 .groupby(x)[y]
 .value_counts(normalize=True)
 .mul(100)
 .rename('percent')
 .reset_index()
 .pipe((sns.catplot, 'data'), x=x, y='percent', hue=y, kind='bar'))
Output: (plot omitted)
Update: Also show percentages on top of barplots
If you also want the percentage labels, you can do the following:
import numpy as np
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')
df.head()

x, y = 'class', 'survived'

df1 = df.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()

g = sns.catplot(x=x, y='percent', hue=y, kind='bar', data=df1)
g.ax.set_ylim(0, 100)

for p in g.ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x()
    txt_y = p.get_height()
    g.ax.text(txt_x, txt_y, txt)
I might be confused. The difference between your output and the output of
occupation_counts = (df.groupby(['income'])['occupation']
                     .value_counts(normalize=True)
                     .rename('percentage')
                     .mul(100)
                     .reset_index()
                     .sort_values('occupation'))
p = sns.barplot(x="occupation", y="percentage", hue="income", data=occupation_counts)
_ = plt.setp(p.get_xticklabels(), rotation=90)  # Rotate labels
is, it seems to me, only the order of the columns.
And you seem to care about that, since you pass sort=False. But then, in your code, the order is determined purely by chance (and the order in which the dictionary is iterated even changes from run to run with Python 3.5).
You could do this with sns.histplot by setting the following properties:
stat = 'density' (this will make the y-axis the density rather than count)
common_norm = False (this will normalize each density independently)
See the simple example below:
import numpy as np
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')

ax = sns.histplot(x=df['class'], hue=df['survived'], multiple="dodge",
                  stat='density', shrink=0.8, common_norm=False)
You can use the library Dexplot to do counting as well as normalizing over any variable to get relative frequencies.
Pass the count function the name of the variable you would like to count and it will automatically produce a bar plot of the counts of all unique values. Use split to subdivide the counts by another variable. Notice that Dexplot automatically wraps the x-tick labels.
dxp.count('occupation', data=df, split='income')
Use the normalize parameter to normalize the counts over any variable (or combination of variables with a list). You can also use True to normalize over the grand total of counts.
dxp.count('occupation', data=df, split='income', normalize='income')
It boggled my mind that Seaborn doesn't provide anything like this out of the box.
Still, it was pretty easy to tweak the source code to get what you wanted.
The following code, with the function "percentageplot(x, hue, data)", works just like sns.countplot, but norms each bar per group (i.e. divides each green bar's value by the sum of all green bars).
In effect, it turns a plain sns.countplot, which is hard to interpret because of the different N of Apple vs. Android, into a percentage plot, normed so that the bars reflect the proportion of the total for Apple vs. Android. (Before/after plots omitted.)
Hope this helps!!
# NOTE: the snippet also relies on numpy, pandas, matplotlib.pyplot and
# seaborn's bootstrap/ci helpers; those imports are added here for completeness
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from seaborn.categorical import _CategoricalPlotter, remove_na
from seaborn.algorithms import bootstrap
from seaborn import utils

class _CategoricalStatPlotter(_CategoricalPlotter):

    @property
    def nested_width(self):
        """A float with the width of plot elements when hue nesting is used."""
        return self.width / len(self.hue_names)

    def estimate_statistic(self, estimator, ci, n_boot):
        if self.hue_names is None:
            statistic = []
            confint = []
        else:
            statistic = [[] for _ in self.plot_data]
            confint = [[] for _ in self.plot_data]

        for i, group_data in enumerate(self.plot_data):

            # Option 1: we have a single layer of grouping
            # --------------------------------------------
            if self.plot_hues is None:
                if self.plot_units is None:
                    stat_data = remove_na(group_data)
                    unit_data = None
                else:
                    unit_data = self.plot_units[i]
                    have = pd.notnull(np.c_[group_data, unit_data]).all(axis=1)
                    stat_data = group_data[have]
                    unit_data = unit_data[have]

                # Estimate a statistic from the vector of data
                if not stat_data.size:
                    statistic.append(np.nan)
                else:
                    statistic.append(estimator(stat_data, len(np.concatenate(self.plot_data))))

                # Get a confidence interval for this estimate
                if ci is not None:
                    if stat_data.size < 2:
                        confint.append([np.nan, np.nan])
                        continue
                    boots = bootstrap(stat_data, func=estimator,
                                      n_boot=n_boot,
                                      units=unit_data)
                    confint.append(utils.ci(boots, ci))

            # Option 2: we are grouping by a hue layer
            # ----------------------------------------
            else:
                for j, hue_level in enumerate(self.hue_names):
                    if not self.plot_hues[i].size:
                        statistic[i].append(np.nan)
                        if ci is not None:
                            confint[i].append((np.nan, np.nan))
                        continue

                    hue_mask = self.plot_hues[i] == hue_level
                    group_total_n = (np.concatenate(self.plot_hues) == hue_level).sum()
                    if self.plot_units is None:
                        stat_data = remove_na(group_data[hue_mask])
                        unit_data = None
                    else:
                        group_units = self.plot_units[i]
                        have = pd.notnull(
                            np.c_[group_data, group_units]
                        ).all(axis=1)
                        stat_data = group_data[hue_mask & have]
                        unit_data = group_units[hue_mask & have]

                    # Estimate a statistic from the vector of data
                    if not stat_data.size:
                        statistic[i].append(np.nan)
                    else:
                        statistic[i].append(estimator(stat_data, group_total_n))

                    # Get a confidence interval for this estimate
                    if ci is not None:
                        if stat_data.size < 2:
                            confint[i].append([np.nan, np.nan])
                            continue
                        boots = bootstrap(stat_data, func=estimator,
                                          n_boot=n_boot,
                                          units=unit_data)
                        confint[i].append(utils.ci(boots, ci))

        # Save the resulting values for plotting
        self.statistic = np.array(statistic)
        self.confint = np.array(confint)

        # Rename the value label to reflect the estimation
        if self.value_label is not None:
            self.value_label = "{}({})".format(estimator.__name__,
                                               self.value_label)

    def draw_confints(self, ax, at_group, confint, colors,
                      errwidth=None, capsize=None, **kws):

        if errwidth is not None:
            kws.setdefault("lw", errwidth)
        else:
            kws.setdefault("lw", mpl.rcParams["lines.linewidth"] * 1.8)

        for at, (ci_low, ci_high), color in zip(at_group,
                                                confint,
                                                colors):
            if self.orient == "v":
                ax.plot([at, at], [ci_low, ci_high], color=color, **kws)
                if capsize is not None:
                    ax.plot([at - capsize / 2, at + capsize / 2],
                            [ci_low, ci_low], color=color, **kws)
                    ax.plot([at - capsize / 2, at + capsize / 2],
                            [ci_high, ci_high], color=color, **kws)
            else:
                ax.plot([ci_low, ci_high], [at, at], color=color, **kws)
                if capsize is not None:
                    ax.plot([ci_low, ci_low],
                            [at - capsize / 2, at + capsize / 2],
                            color=color, **kws)
                    ax.plot([ci_high, ci_high],
                            [at - capsize / 2, at + capsize / 2],
                            color=color, **kws)

class _BarPlotter(_CategoricalStatPlotter):
    """Show point estimates and confidence intervals with bars."""

    def __init__(self, x, y, hue, data, order, hue_order,
                 estimator, ci, n_boot, units,
                 orient, color, palette, saturation, errcolor, errwidth=None,
                 capsize=None):
        """Initialize the plotter."""
        self.establish_variables(x, y, hue, data, orient,
                                 order, hue_order, units)
        self.establish_colors(color, palette, saturation)
        self.estimate_statistic(estimator, ci, n_boot)

        self.errcolor = errcolor
        self.errwidth = errwidth
        self.capsize = capsize

    def draw_bars(self, ax, kws):
        """Draw the bars onto `ax`."""
        # Get the right matplotlib function depending on the orientation
        barfunc = ax.bar if self.orient == "v" else ax.barh
        barpos = np.arange(len(self.statistic))

        if self.plot_hues is None:
            # Draw the bars
            barfunc(barpos, self.statistic, self.width,
                    color=self.colors, align="center", **kws)

            # Draw the confidence intervals
            errcolors = [self.errcolor] * len(barpos)
            self.draw_confints(ax,
                               barpos,
                               self.confint,
                               errcolors,
                               self.errwidth,
                               self.capsize)
        else:
            for j, hue_level in enumerate(self.hue_names):
                # Draw the bars
                offpos = barpos + self.hue_offsets[j]
                barfunc(offpos, self.statistic[:, j], self.nested_width,
                        color=self.colors[j], align="center",
                        label=hue_level, **kws)

                # Draw the confidence intervals
                if self.confint.size:
                    confint = self.confint[:, j]
                    errcolors = [self.errcolor] * len(offpos)
                    self.draw_confints(ax,
                                       offpos,
                                       confint,
                                       errcolors,
                                       self.errwidth,
                                       self.capsize)

    def plot(self, ax, bar_kws):
        """Make the plot."""
        self.draw_bars(ax, bar_kws)
        self.annotate_axes(ax)
        if self.orient == "h":
            ax.invert_yaxis()

def percentageplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None,
                   orient=None, color=None, palette=None, saturation=.75,
                   ax=None, **kwargs):

    # Estimator calculates required statistic (proportion)
    estimator = lambda x, y: (float(len(x)) / y) * 100
    ci = None
    n_boot = 0
    units = None
    errcolor = None

    if x is None and y is not None:
        orient = "h"
        x = y
    elif y is None and x is not None:
        orient = "v"
        y = x
    elif x is not None and y is not None:
        raise TypeError("Cannot pass values for both `x` and `y`")
    else:
        raise TypeError("Must pass values for either `x` or `y`")

    plotter = _BarPlotter(x, y, hue, data, order, hue_order,
                          estimator, ci, n_boot, units,
                          orient, color, palette, saturation,
                          errcolor)
    plotter.value_label = "Percentage"

    if ax is None:
        ax = plt.gca()

    plotter.plot(ax, kwargs)
    return ax
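A hypothetical call against the question's adult dataframe (reusing the df, occupation, and income columns defined earlier) could look like this:

import matplotlib.pyplot as plt

ax = percentageplot(x='occupation', hue='income', data=df)
_ = plt.setp(ax.get_xticklabels(), rotation=90)  # rotate labels as in the question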
You can provide an estimator for the height of the bars (along the y-axis) in a seaborn barplot by using the estimator keyword:
ax = sns.barplot(x="x", y="x", data=df, estimator=lambda x: len(x) / len(df) * 100)
The above code snippet is from https://github.com/mwaskom/seaborn/issues/1027
They have a whole discussion about how to provide percentages in a countplot. This answer is based on the same thread linked above.
In the context of your specific problem, you can probably do something like this:
ax = sb.barplot(x='occupation', y='some_numeric_column', data=raw_data,
                estimator=lambda x: len(x) / len(raw_data) * 100, hue='income')
ax.set(ylabel="Percent")
The above code worked for me (on a different dataset with different attributes). Note that you need to put some numeric column in for y; otherwise it gives the error: "ValueError: Neither the x nor y variable appears to be numeric."
Building on this answer, using "probability" worked best for me.
Taken from the sns.histplot documentation on the "stat" parameter:
Aggregate statistic to compute in each bin.
count: show the number of observations in each bin
frequency: show the number of observations divided by the bin width
probability or proportion: normalize such that bar heights sum to 1
percent: normalize such that bar heights sum to 100
density: normalize such that the total area of the histogram equals 1
import seaborn as sns

df = sns.load_dataset('titanic')

ax = sns.histplot(
    x=df['class'],
    hue=df['survived'],
    multiple="dodge",
    stat='probability',
    shrink=0.5,
    common_norm=False
)

Abysmal tf.GradientTape performance compared to tf.gradients() for computing jacobians

SOLUTION BELOW:
Scenario:
I am trying to compute the jacobian of a user-defined function many, many times in a loop. I am able to do this with TF 2's GradientTape as well as the older session-based tf.gradients() method. The problem is that GradientTape is terribly slow (100x slower) than tf.gradients(). It has features I'd like to use (batch_jacobian, hessian support, etc.), but if it's 100x slower then I can't use it.
The Question:
It's not clear to me if I'm simply misusing GradientTape, or if it will always be slower because it has to re-differentiate the provided function every time it's called (my suspicion). I'm asking for tips to fix my use of GradientTape, or for confirmation that it will always be fundamentally slower than tf.gradients by orders of magnitude.
Related Questions:
Repeated use of GradientTape for multiple Jacobian calculations - same scenario, unanswered
Does `GradientTape` need to re-differentiate each evaluation of a derivative? - same scenario, unanswered
using one GradientTape with global context - loosely related, having trouble applying that solution to my scenario
Fully contained minimum example to compare GradientTape and tf.gradients():
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
# from tensorflow.python.ops.parallel_for.gradients import jacobian, batch_jacobian
import timeit

class FunctionCaller(object):
    def __init__(self, func, nX, dtype=tf.float64, useSessions=True):
        if useSessions:
            disable_eager_execution()
        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None
        if not useSessions:
            return

        #
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        #
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])

        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])

        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)
        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)

        # record these tensors so we can use them later with session.run()
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        else:
            return self._useGradientTape(x_i)

    # THIS FUNCTION IS SUPER INEFFICIENT.
    def _useGradientTape(self, x_i):
        with tf.GradientTape(persistent=True) as g:
            xTensor = tf.Variable(x_i, dtype=self.dtype)  # is this my problem??? i recreate x every time?
            y = tf.reshape(self.func(xTensor), [-1, self.nX])
        jac_x_at_i = g.batch_jacobian(y, xTensor)
        # del g
        return jac_x_at_i.numpy()

    def __del__(self):
        if self.sess is not None:
            self.sess.close()

def main():
    @tf.function
    def Xdot(x_i):
        x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)

    nT = 20
    nX = 3
    # create some trash data
    x_i = np.arange(nT * nX).reshape([-1, nX])
    nTrials = 100

    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nX, useSessions=False)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_eager = caller_eager.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed / nTrials))

    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nX, useSessions=True)
    start_time = timeit.default_timer()
    caller_sessions.jac(x_i)  # call it once to do its graph building stuff?
    for _ in range(nTrials):
        jac_session = caller_sessions.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("session code took {} sec: {} sec/trial".format(elapsed, elapsed / nTrials))

    residual = np.max(np.abs(jac_eager - jac_session))
    print('residual between eager and session trials is {}'.format(residual))

if __name__ == "__main__":
    main()
EDIT - SOLUTION:
xdurch0 pointed out below that I should wrap _useGradientTape() in a @tf.function, something I had been unsuccessful with before for other reasons. Once I did that, I had to move xTensor's definition outside the @tf.function wrapper by making it a member variable and updating it with assign().
With all this done, I find that GradientTape (for this simple example) is now on the same order of magnitude as tf.gradients. When running enough trials (~1E5), it's twice as fast as tf.gradients. Awesome!
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
import timeit

class FunctionCaller(object):
    def __init__(self, func, nT, nX, dtype=tf.float64, useSessions=True):
        if useSessions:
            disable_eager_execution()
        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None
        if not useSessions:
            # you should be able to create without an initial value, but tf is demanding one
            # despite what the docs say. bug?
            # tf.Variable(initial_value=None, shape=[None, nX], validate_shape=False, dtype=self.dtype)
            self.xTensor = tf.Variable([[0] * nX] * nT, dtype=self.dtype)  # x needs to be properly sized once
            return

        #
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        #
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])

        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])

        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)
        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)

        # record these tensors so we can use them later with session.run()
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        else:
            return self._useGradientTape(x_i).numpy()

    @tf.function  # THIS IS CRUCIAL
    def _useGradientTape(self, x_i):
        with tf.GradientTape(persistent=True) as g:
            self.xTensor.assign(x_i)  # you need to create the variable once outside the graph
            y = tf.reshape(self.func(self.xTensor), [-1, self.nX])
        jac_x_at_i = g.batch_jacobian(y, self.xTensor)
        # del g
        return jac_x_at_i

    def __del__(self):
        if self.sess is not None:
            self.sess.close()

def main():
    @tf.function
    def Xdot(x_i):
        x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)

    nT = 20
    nX = 3
    # create some trash data
    x_i = np.random.random([nT, nX])
    nTrials = 1000  # i find that for nTrials<=1E3 eager is slower; it's faster for >=1E4, and TWICE as fast for >=1E5

    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nT, nX, useSessions=False)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_eager = caller_eager.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed / nTrials))

    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nT, nX, useSessions=True)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_session = caller_sessions.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("session code took {} sec: {} sec/trial".format(elapsed, elapsed / nTrials))

    residual = np.max(np.abs(jac_eager - jac_session))
    print('residual between eager and session trials is {}'.format(residual))

if __name__ == "__main__":
    main()

tensorboard with numpy array

Can someone give an example of how to use TensorBoard to visualize a numpy array value?
There is a related question here, but I don't really get it:
Tensorboard logging non-tensor (numpy) information (AUC)
For example, if I have
for i in range(100):
    foo = np.random.rand(3, 2)
How can I keep track of the distribution of foo using TensorBoard across the 100 iterations? Can someone give a code example?
Thanks.
For simple values (scalars), you can use this recipe:
summary_writer = tf.train.SummaryWriter(FLAGS.logdir)
summary = tf.Summary()
summary.value.add(tag=tagname, simple_value=value)
summary_writer.add_summary(summary, global_step)
summary_writer.flush()
As for using an array, perhaps you can add the 6 values in sequence, i.e.
for value in foo.flatten():  # iterate over the 6 scalars rather than the rows
    summary.value.add(tag=tagname, simple_value=value)
Another (simplest) way is just using placeholders. First, you can make a placeholder for your numpy array shape.
# Some place holders for summary
summary_reward = tf.placeholder(tf.float32, shape=(), name="reward")
tf.summary.scalar("reward", summary_reward)
Then, just call session.run on the merged summary with the feed_dict:
# Summary
summ = tf.summary.merge_all()
...
s = sess.run(summ, feed_dict={summary_reward: reward})
writer.add_summary(s, i)
If you install this package via pip install tensorboard-pytorch, it becomes as straightforward as it can get:
import numpy as np
from tensorboardX import SummaryWriter

writer = SummaryWriter()
for i in range(50):
    writer.add_histogram("moving_gauss", np.random.normal(i, i, 1000), i, bins="auto")
writer.close()
This will generate the corresponding histogram data in the runs directory.
Found a way to work around it: create a variable, assign the value of the numpy array to the variable, and use TensorBoard to track the variable.
mysummary_writer = tf.train.SummaryWriter("./tmp/test/")

a = tf.Variable(tf.zeros([3, 2]), name="a")
sum1 = tf.histogram_summary("nparray1", a)
summary_op = tf.merge_all_summaries()

sess = tf.Session()
sess.run(tf.initialize_all_variables())

for ii in range(10):
    foo = np.random.rand(3, 2)
    assign_op = a.assign(foo)
    summary, _ = sess.run([summary_op, assign_op])
    mysummary_writer.add_summary(tf.Summary.FromString(summary), global_step=ii)
    mysummary_writer.flush()
sess = tf.Session()
writer = tf.summary.FileWriter('tensorboard_test')
var = tf.Variable(0.0, trainable=False, name='loss')
sess.run(var.initializer)
summary_op = tf.summary.scalar('scalar1', var)

for i, value in enumerate(array):  # enumerate so the global step `i` is defined
    sess.run(var.assign(value))
    summary = sess.run(summary_op)
    writer.add_summary(summary, i)
It works, but it is slow.
You could define a function like this (taken from gyglim's gist):
def add_histogram(writer, tag, values, step, bins=1000):
    """
    Logs the histogram of a list/vector of values.
    From: https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
    """
    # Create histogram using numpy
    counts, bin_edges = np.histogram(values, bins=bins)

    # Fill fields of histogram proto
    hist = tf.HistogramProto()
    hist.min = float(np.min(values))
    hist.max = float(np.max(values))
    hist.num = int(np.prod(values.shape))
    hist.sum = float(np.sum(values))
    hist.sum_squares = float(np.sum(values ** 2))

    # Requires equal number as bins, where the first goes from -DBL_MAX to bin_edges[1]
    # See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto#L30
    # Therefore we drop the start of the first bin
    bin_edges = bin_edges[1:]

    # Add bin edges and counts
    for edge in bin_edges:
        hist.bucket_limit.append(edge)
    for c in counts:
        hist.bucket.append(c)

    # Create and write Summary
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
    writer.add_summary(summary, step)
And then add to the summary writer like this:
add_histogram(summary_writer, "Histogram_Name", your_numpy_array, step)
You can plot the vector with matplotlib, convert the plot to a numpy array along the lines of https://stackoverflow.com/a/35362787/10873169, and then add it to TensorBoard as an image:
import numpy as np
from matplotlib.backends.backend_agg import FigureCanvasAgg
from matplotlib.figure import Figure
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/example")
for step in range(1, 10):
    # Time-dependent vector we want to plot
    example_vector = np.sin(np.arange(100) / step)

    # Plot it in matplotlib first. Default DPI doesn't look good in Tensorboard
    fig = Figure(figsize=(5, 2), dpi=200)
    canvas = FigureCanvasAgg(fig)
    fig.gca().plot(example_vector)
    canvas.draw()

    # Get the image as a string of bytes
    image_as_string = np.frombuffer(canvas.tostring_rgb(), dtype='uint8')

    # Need to reshape to (height, width, channels)
    target_shape = canvas.get_width_height()[::-1] + (3,)
    reshaped_image = image_as_string.reshape(target_shape)

    # Write to Tensorboard logs
    writer.add_image("example_vector", reshaped_image,
                     dataformats="HWC", global_step=step)
writer.close()