I am trying to compute the Jacobian of a user-defined function many, many times in a loop. I am able to do this with TF 2's GradientTape as well as with the older session-based tf.gradients() method. The problem is that GradientTape is terribly slow (about 100x slower than tf.gradients()). It has features I'd like to use (batch_jacobian, Hessian support, etc.), but if it's 100x slower then I can't use it.
The Question:
It's not clear to me whether I'm simply misusing GradientTape, or whether it will always be slower because it has to re-differentiate the provided function every time it's called (my suspicion). I'm asking for tips to fix my use of GradientTape, or a confirmation that it will always be fundamentally slower than tf.gradients() by orders of magnitude.
Related Questions:
Repeated use of GradientTape for multiple Jacobian calculations - same scenario, unanswered
Does `GradientTape` need to re-differentiate each evaluation of a derivative? - same scenario, unanswered
using one GradientTape with global context - loosely related; I am having trouble applying that solution to my scenario
Fully contained minimum example to compare GradientTape and tf.gradients():
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
# from tensorflow.python.ops.parallel_for.gradients import jacobian, batch_jacobian
import timeit
class FunctionCaller(object):
    """Evaluate the batch Jacobian of ``func`` with a TF1 session or a GradientTape.

    With ``useSessions=True`` the Jacobian graph is built once in ``__init__``
    with ``tf.gradients`` and re-evaluated by ``jac`` through ``Session.run``.
    With ``useSessions=False`` every ``jac`` call differentiates ``func`` again
    under a fresh ``tf.GradientTape``.

    Args:
        func: callable mapping a ``[batch, nX]`` tensor to something that can
            be reshaped to ``[-1, nX]``.
        nX: number of columns of the input (and of the reshaped output).
        dtype: TensorFlow dtype used for the placeholder / variable.
        useSessions: select the session-based (True) or eager (False) path.
    """

    def __init__(self, func, nX, dtype=tf.float64, useSessions=True):
        if useSessions:
            # Placeholders and tf.gradients() require graph mode.
            disable_eager_execution()
        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None
        if not useSessions:
            # Eager mode has nothing to pre-build; jac() uses a GradientTape.
            return
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])
        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)
        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)
        # record these tensors so we can use them later with Session.run
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        """Return the Jacobian of ``func`` evaluated at ``x_i``."""
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        return self._useGradientTape(x_i)

    def _useGradientTape(self, x_i):
        """Eager path: record ``func`` on a tape and return the batch Jacobian as numpy."""
        with tf.GradientTape(persistent=True) as g:
            # A new variable is created on every call in this version.
            xTensor = tf.Variable(x_i, dtype=self.dtype)
            y = tf.reshape(self.func(xTensor), [-1, self.nX])
        # Taken outside the recording context so the Jacobian ops are not taped.
        jac_x_at_i = g.batch_jacobian(y, xTensor)
        return jac_x_at_i.numpy()

    def __del__(self):
        # getattr guards against __init__ having failed before self.sess was set.
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()
def main():
    """Time the eager and session-based Jacobians of a toy function and compare them."""

    def Xdot(x_i):
        # Only the third column enters the outputs.
        _x_0, _x_1, x_2 = tf.split(x_i, 3, axis=1)
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)

    nT = 20
    nX = 3
    # create some trash data
    x_i = np.arange(nT*nX).reshape([-1, nX])
    nTrials = 100

    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nX, useSessions=False)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_eager = caller_eager.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))

    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nX, useSessions=True)
    # Warm-up call kept out of the timed region, so that exactly nTrials
    # evaluations are divided by nTrials below.
    caller_sessions.jac(x_i)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_session = caller_sessions.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))

    residual = np.max(np.abs(jac_eager - jac_session))
    print('residual between eager and session trials is {}'.format(residual))


if __name__ == "__main__":
    main()
xdurch0 pointed out below that I should wrap _useGradientTape() in a @tf.function decorator - something I was unsuccessful with before for other reasons. Once I did that, I had to move xTensor's definition outside the @tf.function-wrapped method by making it a member variable and updating it with assign().
With all this done, I find that GradientTape (for this simple example) is now on the same order of magnitude as tf.gradients. When running enough trials (~1E5), it's twice as fast as tf.gradients. Awesome!
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
import timeit
class FunctionCaller(object):
    """Evaluate the batch Jacobian of ``func`` with a TF1 session or a traced GradientTape.

    With ``useSessions=True`` the Jacobian graph is built once with
    ``tf.gradients`` and evaluated through ``Session.run``. With
    ``useSessions=False`` the GradientTape computation is wrapped in
    ``tf.function`` and reads its input from a variable created once here.

    Args:
        func: callable mapping a ``[batch, nX]`` tensor to something that can
            be reshaped to ``[-1, nX]``.
        nT: number of rows used to size the eager-mode input variable.
        nX: number of columns of the input (and of the reshaped output).
        dtype: TensorFlow dtype used for the placeholder / variable.
        useSessions: select the session-based (True) or eager (False) path.
    """

    def __init__(self, func, nT, nX, dtype=tf.float64, useSessions=True):
        if useSessions:
            # Placeholders and tf.gradients() require graph mode.
            disable_eager_execution()
        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None
        if not useSessions:
            # you should be able to create without an initial value, but tf is demanding one
            # despite what the docs say. bug?
            # tf.Variable(initial_value=None, shape=[None, nX], validate_shape=False, dtype=self.dtype)
            self.xTensor = tf.Variable([[0]*nX]*nT, dtype=self.dtype)  # x needs to be properly sized once
            return
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])
        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)
        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)
        # record these tensors so we can use them later with Session.run
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        """Return the Jacobian of ``func`` evaluated at ``x_i`` as a numpy array."""
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        return self._useGradientTape(x_i).numpy()

    @tf.function  # THIS IS CRUCIAL
    def _useGradientTape(self, x_i):
        """Traced eager path: load ``x_i`` into the variable and return the batch Jacobian tensor."""
        with tf.GradientTape(persistent=True) as g:
            self.xTensor.assign(x_i)  # you need to create the variable once outside the graph
            y = tf.reshape(self.func(self.xTensor), [-1, self.nX])
        jac_x_at_i = g.batch_jacobian(y, self.xTensor)
        return jac_x_at_i

    def __del__(self):
        # getattr guards against __init__ having failed before self.sess was set.
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()
def main():
    """Time the tf.function-wrapped GradientTape path against the session path."""

    def Xdot(x_i):
        # Only the third column enters the outputs.
        _x_0, _x_1, x_2 = tf.split(x_i, 3, axis=1)
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)

    nT = 20
    nX = 3
    # create some trash data
    x_i = np.random.random([nT, nX])
    nTrials = 1000  # i find that nTrials<=1E3, eager is slower, it's faster for >=1E4, it's TWICE as fast for >=1E5

    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nT, nX, useSessions=False)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_eager = caller_eager.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))

    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nT, nX, useSessions=True)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_session = caller_sessions.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))

    residual = np.max(np.abs(jac_eager - jac_session))
    print('residual between eager and session trials is {}'.format(residual))


if __name__ == "__main__":
    main()
Below is a code I wrote for Hyperparameter tuning of XGboost using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, auc
from pprint import pprint
from xgboost import XGBClassifier
import time
# instantiate XGBoost model
# n_jobs is the documented parallelism parameter of XGBClassifier
# ("nthreads" is not one of its named parameters).
clf = XGBClassifier(missing=np.nan, n_jobs=-1)

# Define scoring metrics
scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
}

# Small search space used to smoke-test the random search.
param_grid_dummy = {
    "n_estimators": [25, 250],
    "max_depth": [3, 5],
    # Was "[0.0005, 0,005]": the comma produced the three values 0.0005, 0 and 5.
    "learning_rate": [0.0005, 0.005],
}
def random_search_wrapper(refit_score='precision_score'):
    """
    fits a RandomizedSearchCV classifier using refit_score for optimization
    prints classifier performance metrics

    Args:
        refit_score: key of ``scorers`` used to pick the model that is refit.

    Returns:
        The fitted RandomizedSearchCV object.
    """
    rf_random = RandomizedSearchCV(estimator=clf, param_distributions=param_grid_dummy,
                                   n_iter=3, scoring=scorers, refit=refit_score, cv=3,
                                   return_train_score=True, n_jobs=-1)
    # NOTE(review): the training-frame name was lost from this listing;
    # X_train_df is assumed by analogy with X_test_df below — confirm.
    rf_random.fit(X_train_df, Y_train)

    # make the predictions
    Y_pred = rf_random.predict(X_test_df)

    print('Best params for {}'.format(refit_score))
    pprint(rf_random.best_params_)

    # confusion matrix on test data
    print('\nConfusion matrix of Random Forest optimized for {} on the test data: '.format(refit_score))
    print(pd.DataFrame(confusion_matrix(Y_test, Y_pred),
                       columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return rf_random
# Run the random search, refitting on the precision scorer
# (refit_score='precision_score'), and time the whole call.
start = time.time()
rf_random_cl = random_search_wrapper(refit_score='precision_score')
# Report the elapsed wall-clock time in minutes.
end = time.time()
print((end - start)/60, "minutes")
I get a weird warning.
/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/ DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
if diff:
Can someone please help me understand what I am doing wrong here?
When I do a simple clf.fit(X_train_df, Y_train), it works perfectly fine.
This is an issue with the sklearn version: a few versions earlier than 0.20.1 emit this warning.
Code is correct.
i would like to ask a question about a problem that i have for the last couple days.
First of all i am a beginner in machine learning and this is my first time using the XGBoost algorithm so excuse me for any mistakes I have done.
I trained my model to predict whether a log file is malicious or not. After i save and reload my model on a different session i use the predict function which seems to be working normally ( with a few deviations in probabilities but that is another topic, I know I, have seen it in another topic )
The problem is this: Sometimes when i try to predict a "small" csv file after load it seems to be broken predicting only the Zero label, even for indexes that are categorized correct previously.
For example, i load a dataset containing 20.000 values , the predict() is working. I keep only the first 5 of these values using pandas drop, again its working. If i save the 5 values on a different csv and reload it its not working. The same error happens if i just remove by hand all indexes (19.995) and save file only with 5 remaining.
I would bet it is a size of file problem but when i drop the indexes on the dataframe through pandas it seems to be working
Also the number 5 ( of indexes ) is for example purpose the same happens if I delete a large portion of the dataset.
I first came up with this problem after trying to verify by hand some completely new logs, which seem to be classified correctly if thrown into the big csv file but not in a new file on their own.
Here is my load and predict code
import os
import pandas as pd
from pandas.compat import StringIO
from datetime import datetime
from langid.langid import LanguageIdentifier, model
import langid
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from ggplot import ggplot, aes, geom_line
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import pickle
# Load the two CSV files whose predictions are compared further down.
df = pd.read_csv('big_test.csv')
df3 = pd.read_csv('small_test.csv')
#This one is necessary for the loaded_model
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks columns out of a DataFrame-like input.

    The class must be defined under this name before the pickled pipeline is
    loaded, and should match the definition used at training time.
    """

    def __init__(self, column_list):
        # Spelled __init__ (not init) so that it really is the constructor and
        # agrees with the training-time definition of this class.
        self.column_list = column_list

    def fit(self, x, y=None):
        """Nothing to learn; return self."""
        return self

    def transform(self, x):
        """Return one column's values, or the selected columns as row dicts."""
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        return x[self.column_list].to_dict(orient='records')
# Restore the persisted model from 'finalized_model.sav'.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
loaded_model = joblib.load('finalized_model.sav')
# Predict on each frame with the same loaded model.
result = loaded_model.predict(df)
# NOTE(review): df2 is never assigned in the visible code; presumably it is the
# first five rows of df (per the surrounding description) — confirm.
result2 = loaded_model.predict(df2)
result3 = loaded_model.predict(df3)
The results i get are these:
[1 0 1 ... 0 0 0]
[1 0 1 0 1]
[0 0 0 0 0]
I can provide any code even from training or my dataset if necessary.
*EDIT: I use a pipeline for my data. I tried to reproduce the error after using xgb to fit the iris data and i could not. Maybe there is something wrong with my pipeline? the code is below :
# Load the full training data.
df = pd.read_csv('big_test.csv')
# Split Dataset
# Feature columns fed to the model; 'Scan' is the target column used below.
attributes = ['uri','code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ]
# First split: hold out 20% as the test set, stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(df[attributes], df['Scan'], test_size=0.2,
stratify=df['Scan'], random_state=0)
# Second split: carve 20% of the remaining training data off as a dev set.
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2,
stratify=y_train, random_state=0)
# print('Train:', len(y_train), 'Dev:', len(y_dev), 'Test:', len(y_test))
# set up graph function
def plot_precision_recall_curve(y_true, y_pred_scores):
    """Build a ggplot line chart of precision against recall.

    The curve points come from ``precision_recall_curve``; its thresholds
    are not used.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_pred_scores)
    axes = aes(x='recall', y='precision')
    curve_frame = pd.DataFrame({"precision": precision, "recall": recall})
    chart = ggplot(axes, data=curve_frame)
    return chart + geom_line()
# XGBClassifier
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts the configured columns from its input."""

    def __init__(self, column_list):
        self.column_list = column_list

    def fit(self, x, y=None):
        """No fitting is needed; return the selector itself."""
        return self

    def transform(self, x):
        """Return the raw values for a single column, else a list of row dicts."""
        wanted = self.column_list
        if len(wanted) != 1:
            return x[wanted].to_dict(orient='records')
        return x[wanted[0]].values
# Character 1- and 2-grams of the URI, dropping n-grams seen in fewer than 10 rows.
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10)
# Encodes the remaining columns from per-row dicts.
dict_vectorizer = DictVectorizer()
xgb = XGBClassifier(seed=0)

# Two feature branches (text and categorical) are concatenated and fed to XGBoost.
pipeline = Pipeline([
    ("feature_union", FeatureUnion([
        ('text_features', Pipeline([
            ('selector', ColumnSelector(['uri'])),
            ('count_vectorizer', count_vectorizer),
        ])),
        ('categorical_features', Pipeline([
            ('selector', ColumnSelector(['code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ])),
            ('dict_vectorizer', dict_vectorizer),
        ])),
    ])),
    ('xgb', xgb),
])

# Fit on the training split, then persist the whole fitted pipeline.
pipeline.fit(x_train, y_train)
filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)
That's due to the different dtypes in the big and the small file.
When you do:
df = pd.read_csv('big_test.csv')
The dtypes are these:
# Output
uri object
code object # <== Observe this
r_size object # <== Observe this
Scan int64
Now when you do:
df3 = pd.read_csv('small_test.csv')
the dtypes are changed:
# Output
uri object
code int64 # <== Now this has changed
r_size int64 # <== Now this has changed
Scan int64
You see, pandas will try to determine the dtypes of the columns by itself. When you load the big_test.csv, there are some values in code and r_size column which are of string types, due to this whole column dtype is changed to string, which is not done in small_test.csv.
Now due to this change, the dictVectorizer encodes the data in a different way than before and the features are changed, and hence the results are also changed.
If you do this:
df3[['code', 'r_size']] = df3[['code', 'r_size']].astype(str)
and then call the predict(), the results are same again.
I'd like to train tens of small neural networks in parallel on the CPU
in Keras with Tensorflow backend.
By default Tensorflow splits the batches over the cores when training a single nn but my average core utilization is only around 50%.
So it seems like a good idea to assign the complete training of a neural net to a core so less data has to be moved around.
I can't seem to find how I can specify these actions.
Also note the neural nets have a different architecture so combining everything into a single graph will lead to sparser matrices and slower
There are some key points to making this work:
Use processes, not threads. Threads will result in asynchronous execution, but not parallel so only one CPU core would be used.
For practical purposes building, compiling and fitting a neural net should happen in the same process.
For each process a separate tensorflow graph and session need to be initialized.
After training the nets, you will likely want to serialize them for later use. It's important to use Keras' own save function (model.save), not regular pickling.
extend the python Process class:
from keras.layers import Dense
from keras.models import Sequential
from multiprocessing import Process, Queue
import tensorflow as tf
from train_val_set import TrainValSet
# Process subclass that owns a list of Keras models and runs inside its own
# TensorFlow graph/session.
# NOTE(review): this listing has lost its indentation, and several method bodies
# below appear to be missing statements; the flagged gaps cannot be filled in
# from the visible code.
class NNProcess(Process):
# Stores the process id, the number of nets to handle and the queue handed in
# by the parent; the model list starts empty.
def __init__(self, process_id: int, nr_nets: int, ret_queue: Queue):
super(NNProcess, self).__init__()
self.process_id = process_id
self.neural_nets = []
self.train_val_set = None
self.nr_nets = nr_nets
self.ret_queue = ret_queue
# Attach the training/validation data after construction.
def set_train_val(self, train_val_set: TrainValSet):
self.train_val_set = train_val_set
# Session config: one intra-op and one inter-op thread, one CPU device, no GPU,
# with soft placement disabled.
def get_session_config(self):
num_cores = 1
num_CPU = 1
num_GPU = 0
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
inter_op_parallelism_threads=num_cores, allow_soft_placement=False,
device_count={'CPU': num_CPU, 'GPU': num_GPU})
return config
# Entry point executed in the child process: opens a session on a fresh graph.
def run(self):
print("process " + str(self.process_id) + " starting...")
with tf.Session(graph=tf.Graph(), config=self.get_session_config()) as session:
# NOTE(review): no calls to init_nets/compile/fit_nets are visible here, and
# file_name is computed but never used (nothing is saved or put on ret_queue
# in the visible code) — presumably dropped from the listing; confirm.
for i in range(0, self.nr_nets):
file_name = self.neural_nets[i].name + "_" + str(i) + ".pickle"
print("process " + str(self.process_id) + " finished.")
# NOTE(review): the loop body (presumably a per-model compile call) is missing.
def compile(self):
for neural_net in self.neural_nets:
# Builds nr_nets two-layer Sequential models (64 relu units on 100 inputs,
# then a 10-way softmax).
# NOTE(review): the models are never appended to self.neural_nets in the
# visible code — confirm.
def init_nets(self):
for i in range(0, self.nr_nets):
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=100))
model.add(Dense(units=10, activation='softmax'))
# NOTE(review): the loop body (presumably a per-model fit call) is missing.
def fit_nets(self, train_val_set: TrainValSet):
for i in range(0, self.nr_nets):
Helper class:
from pandas import DataFrame
class TrainValSet:
    """Feature/target arrays for one training frame and one validation frame."""

    def __init__(self, df_train: DataFrame, df_val: DataFrame):
        train_pair = self.get_x_y(df_train)
        val_pair = self.get_x_y(df_val)
        self.x_train, self.y_train = train_pair
        self.x_val, self.y_val = val_pair

    def get_x_y(self, df: DataFrame):
        """Return (all columns except the last, the last column) as arrays."""
        features = df.iloc[:, :-1].values
        target = df.iloc[:, -1].values
        return features, target
main file:
import pandas as pd
from multiprocessing import Manager
import tensorflow as tf
from keras import backend as K
from train_val_set import TrainValSet
from nn_process import NNProcess
def load_train_val_test_datasets(dataset_dir: str, dataset_name: str):
    """Read the header-less train, val and test CSVs of one dataset.

    Files are looked up as
    ``<dataset_dir><dataset_name>/<dataset_name>_{train,val,test}.csv``.
    ``dataset_dir`` is concatenated as-is, so it must already end with a
    path separator.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of DataFrames.
    """
    prefix = dataset_dir + dataset_name + "/" + dataset_name
    df_train, df_val, df_test = (
        pd.read_csv(prefix + suffix, header=None)
        for suffix in ("_train.csv", "_val.csv", "_test.csv")
    )
    return df_train, df_val, df_test
# config for prediction and evaluation only
def get_session_config(num_cores):
    """Return a ConfigProto using ``num_cores`` threads on one CPU device and no GPU.

    Both the intra-op and inter-op thread pools are sized to ``num_cores``;
    soft placement is enabled.
    """
    device_limits = {'CPU': 1, 'GPU': 0}
    return tf.ConfigProto(
        intra_op_parallelism_threads=num_cores,
        inter_op_parallelism_threads=num_cores,
        allow_soft_placement=True,
        device_count=device_limits,
    )
def train_test(nr_nets: int, nr_processes: int):
    """Train ``nr_nets`` networks spread over ``nr_processes`` worker processes.

    Each worker is an NNProcess given the shared training/validation data and
    an equal share of the nets. All workers are started and then joined before
    a session for prediction/evaluation is created in the parent.

    Args:
        nr_nets: total number of networks to train.
        nr_processes: number of worker processes to spawn.
    """
    # NOTE(review): load_train_val_test_datasets is declared with
    # (dataset_dir, dataset_name) but only one argument is passed here; the
    # dataset directory is not visible in this listing — supply it before running.
    df_train, df_val, df_test = load_train_val_test_datasets('MNIST')
    train_val_set = TrainValSet(df_train, df_val)
    # Integer share per worker; any remainder of nr_nets is not assigned.
    nets_per_proc = int(nr_nets/nr_processes)

    nn_queue = Manager().Queue()
    processes = []
    for i in range(0, nr_processes):
        nn_process = NNProcess(i, nets_per_proc, nn_queue)
        nn_process.set_train_val(train_val_set)
        processes.append(nn_process)

    # Start every worker before joining any, so they run in parallel.
    for nn_process in processes:
        nn_process.start()
    for nn_process in processes:
        nn_process.join()

    tf_session = tf.Session(config=get_session_config(4))
    # ...
    # load neural nets from files
    # do predictions
I am trying to speed up the conversion of select tfrecords to a series of python dictionaries. Here's what I have. Initially the CPU utilization spikes, but then goes to almost zero, suggesting my code is not working correctly.
My goal is to have 3 dictionaries saved and pickled. There are 14,000+ tfrecord files (2 gigs appx). At the current rate, it will take about 84 hours to run on a single process.
Are there any problems with my use of Manager dicts?
import glob
import tensorflow as tf
import cPickle
import numpy as np
from tqdm import tqdm
import collections
from multiprocessing import Process, Manager, Pool
def get_multihot_encoding(example_label):
    """Return a length-10 multi-hot list for the labels in ``example_label``.

    A label contributes only if it is one of the values of the module-level
    ``lookup``; its slot index is read from ``lookup_inverted``.
    """
    encoding = np.zeros(10)
    for candidate in example_label:
        if candidate not in lookup.values():
            continue
        encoding[lookup_inverted[candidate]] = 1
    return list(encoding)
# Set-up MultiProcessing
# Manager-backed dicts are proxies, so every worker writes into the same
# three dictionaries held by the manager process.
manager = Manager()
audio_embeddings_dict = manager.dict()
audio_labels_dict = manager.dict()
audio_multihot_dict = manager.dict()
# NOTE(review): this session is created at import time, before the worker pool
# exists; sharing a TensorFlow session with pool workers may be what stalls
# the run — confirm.
sess = tf.Session()
# The iterable which gets passed to the function
all_tfrecord_filenames = glob.glob('/Users/jeff/features/audioset_v1_embeddings/unbal_train/*.tfrecord')
# Worker function: reads every example of one tfrecord file and, for examples
# whose labels intersect label_filters, writes entries into the three shared
# dictionaries keyed by video id.
# NOTE(review): this listing has lost its indentation, so the nesting of the
# statements below is not recoverable from the text alone.
def process_tfrecord(tfrecord):
for idx, example in enumerate(tf.python_io.tf_record_iterator(tfrecord)):
tf_example = tf.train.Example.FromString(example)
vid_id = tf_example.features.feature['video_id'].bytes_list.value[0].decode(encoding='UTF-8')
example_label = list(np.asarray(tf_example.features.feature['labels'].int64_list.value))
# Non zero intersect of 2 sets is True - only create dict entries if this is true!
if set(example_label) & label_filters:
print(set(example_label) & label_filters, " Is the intersection of the two")
tf_seq_example = tf.train.SequenceExample.FromString(example)
n_frames = len(tf_seq_example.feature_lists.feature_list['audio_embedding'].feature)
audio_frame = []
# NOTE(review): the body of this loop (presumably appending each decoded frame
# to audio_frame) is missing from the visible code.
for i in range(n_frames):
audio_embeddings_dict[vid_id] = audio_frame
audio_labels_dict[vid_id] = example_label
audio_multihot_dict[vid_id] = get_multihot_encoding(example_label)
#print(get_multihot_encoding(example_label), "Is the encoded label")
# idx counts examples within this one file, so every worker writes to the
# same idx-based file names.
if idx % 100 == 0:
print ("Saving dictionary at loop: {}".format(idx))
# NOTE(review): the file objects opened below are never closed explicitly.
cPickle.dump(audio_embeddings_dict, open('audio_embeddings_dict_unbal_train_multi_{}.pkl'.format(idx), 'wb'))
cPickle.dump(audio_multihot_dict, open('audio_multihot_dict_bal_untrain_multi_{}.pkl'.format(idx), 'wb'))
# NOTE(review): this dumps audio_multihot_dict into the labels file;
# audio_labels_dict looks like the intended object — confirm.
cPickle.dump(audio_multihot_dict, open('audio_labels_unbal_dict_multi_{}.pkl'.format(idx), 'wb'))
# Fan the tfrecord files out over 50 worker processes; map() blocks until every
# file has been processed.
pool = Pool(50)
result = pool.map(process_tfrecord, all_tfrecord_filenames)
# Release the workers once all the work has been handed back.
pool.close()
pool.join()