MXNet multi-iterators (combining a .rec iterator with an NDArray iterator)

How do I create a combined iterator in MXNet? For example, given a record (.rec) iterator, if I want to change the labels corresponding to each image, there are two options:
a) Create a new .rec iterator with the same data (images) and the new labels.
b) Create a multi-iterator from the original .rec iterator and an NDArray iterator, such that the multi-iterator reads the data (images) from the original .rec iterator and the labels from the NDArray iterator.
Option (a) is tedious. Any suggestions on how to create such a multi-iterator?

class MultiIter(mx.io.DataIter):
    def __init__(self, iter_list):
        self.iters = iter_list
        self.batch_size = 1000  # should match the batch size of the wrapped iterators
    def next(self):
        batches = [i.next() for i in self.iters]
        return mx.io.DataBatch(data=[t for t in batches[0].data] + [t for t in batches[1].data],
                               label=[t for t in batches[0].label] + [t for t in batches[1].label],
                               pad=0)
    def reset(self):
        for i in self.iters:
            i.reset()
    @property
    def provide_data(self):
        return [t for t in self.iters[0].provide_data] + [t for t in self.iters[1].provide_data]
    @property
    def provide_label(self):
        return [t for t in self.iters[0].provide_label] + [t for t in self.iters[1].provide_label]

train = MultiIter([train1, train2])
Here train1 and train2 can be any two DataIters. In particular, train1 can be a .rec iterator and train2 an NDArray iterator. The additional argument pad=0 is required when calling the predict method with the combined iterator if either train1 or train2 is an NDArray iterator.
MultiIter returns a list of data and a list of labels combined from the two iterators. If you only need the data from the first iterator and the labels from the second, the code below will work.
class MultiIter(mx.io.DataIter):
    def __init__(self, iter_list):
        self.iters = iter_list
        self.batch_size = 1000
    def next(self):
        batches = [i.next() for i in self.iters]
        return mx.io.DataBatch(data=[t for t in batches[0].data],
                               label=[t for t in batches[1].label],
                               pad=0)
    def reset(self):
        for i in self.iters:
            i.reset()
    @property
    def provide_data(self):
        return [t for t in self.iters[0].provide_data]
    @property
    def provide_label(self):
        return [t for t in self.iters[1].provide_label]

train = MultiIter([train1, train2])
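For context, a hedged usage sketch follows; the .rec path, label file, shapes, and the net symbol are illustrative assumptions rather than details from the original post:

import mxnet as mx
import numpy as np

# .rec iterator providing the images (illustrative path and shape)
train1 = mx.io.ImageRecordIter(path_imgrec="train_data.rec", data_shape=(3, 224, 224),
                               batch_size=1000, label_width=1)
# NDArray iterator providing the replacement labels, in the same order as the .rec file
new_labels = np.load("new_labels.npy")
train2 = mx.io.NDArrayIter(data=np.zeros((len(new_labels), 1)),  # dummy data, only the labels are used
                           label=new_labels, batch_size=1000)

train = MultiIter([train1, train2])
mod = mx.mod.Module(symbol=net,  # `net` is assumed to be defined elsewhere
                    data_names=[d[0] for d in train.provide_data],
                    label_names=[l[0] for l in train.provide_label])
mod.fit(train, num_epoch=10)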

Related

Slicing of a DataFrame Subclass

I'm trying to add units to a pandas DataFrame and its contained Series, but if I slice, __init__ does not seem to be called and I lose all the setup.
class QSeries(pd.Series):
    _metadata = ["unit", "descriptor"]
    @property
    def _constructor(self):
        return QSeries
    @property
    def _constructor_expanddim(self):
        return QDataFrame

class QDataFrame(pd.DataFrame):
    # normal properties
    _metadata = ["units"]
    def __init__(self, *args, units=None, **kwargs):
        super(QDataFrame, self).__init__(*args, **kwargs)
        if units is None:
            self.units = {}
        else:
            self.units = {key: value for key, value in units.items() if key in self.columns}
        for col in self.columns:
            if col in self.units:
                self[col].unit = self.units[col]
    @property
    def _constructor(self):
        return QDataFrame
    @property
    def _constructor_sliced(self):
        return QSeries
https://gist.github.com/wkerzendorf/52459080be83c7c382bac11ef9ac3195
test = QDataFrame(index=np.arange(10), columns=['x', 'y', 'z'], data=1., units={'x':u.erg, 'y':u.s, 'z':u.cm})
test.x.unit
output -> erg
test2 = test[['x', 'y']]
test2.x.unit
output -> AttributeError
How do I fix this?
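No answer is recorded for this question here. As a hedged sketch only (untested against this exact example), the pandas subclassing docs point to __finalize__ as the hook for propagating _metadata through operations that bypass __init__, so one direction is to override it on the subclass, roughly:

import pandas as pd

class QDataFrame(pd.DataFrame):
    # ... same _metadata / _constructor / _constructor_sliced as above ...

    def __finalize__(self, other, method=None, **kwargs):
        # copy the attributes listed in _metadata (e.g. `units`) from the
        # source object onto the newly constructed result of a slice
        for name in self._metadata:
            object.__setattr__(self, name, getattr(other, name, None))
        return self

A similar override on QSeries would be needed if the per-column unit attribute should survive slicing as well.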

keras.utils.Sequence with multiple files

I understand how to use keras.utils.Sequence with one data file. You subclass the keras.utils.Sequence class and implement its interface: __len__ and __getitem__.
For example:
def __len__(self):
    "Denotes the number of batches per epoch"
    return int(np.ceil(self.no_examples / float(self.batch_size)))

def __getitem__(self, idx):
    # build the batch with idx and self.batch_size
But what if your data is spread across multiple files? For example:
train_part1.csv
train_part2.csv
train_partn.csv
How can you iterate through all batches with only one pointer idx?
You can set up a mapping of (range, file_path):
def __init__(self, file_paths, batch_size):
    self.batch_size = batch_size
    self._mapping = dict()
    count = 0
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            size = len(f.readlines())
        self._mapping[(count, count + size)] = file_path
        count += size
    self.no_examples = count

def _find_file_path(self, idx):
    for (start, end), file_path in self._mapping.items():
        if start <= idx < end:
            in_file_idx = idx - start
            return (in_file_idx, file_path)

def __len__(self):
    "Denotes the number of batches per epoch"
    return int(np.ceil(self.no_examples / float(self.batch_size)))

@functools.lru_cache(maxsize=128)  # memoize for file caching
def _read_file_data(self, file_path):
    with open(file_path, 'r') as f:
        return list(f.readlines())

def __getitem__(self, idx):
    in_file_idx, file_path = self._find_file_path(idx)
    lines = self._read_file_data(file_path)
    return lines[in_file_idx]
Further optimization:
track how much memory is consumed and evict memoized file contents if your files are too large to fit in memory;
implement a more efficient _find_file_path if you have a large number of files; the current implementation is O(n).
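As a usage sketch only (the class name CSVSequence and the model are assumptions, not part of the original answer), the methods above would sit inside a keras.utils.Sequence subclass and be consumed roughly like this:

from tensorflow import keras

class CSVSequence(keras.utils.Sequence):
    # __init__, _find_file_path, __len__, _read_file_data and __getitem__
    # exactly as defined above
    ...

seq = CSVSequence(["train_part1.csv", "train_part2.csv", "train_partn.csv"], batch_size=32)
model.fit(seq, epochs=10)  # `model` is an assumed, already-compiled Keras model
                           # (older standalone Keras uses model.fit_generator(seq) instead)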

how to iterate over tensorflow tensor_list referencing both index and individual tensor

I tried the following code using tf.map_fn, but still get the error: "Tensor objects are only iterable when eager execution is enabled. To iterate over this tensor use tf.map_fn."
def get_tensor_A(i):
    return mask[i]  # another tensor returned

def get_tensor_B(x):
    return tf.nn.softmax(x, 1)

def final_rt(i, inp):
    return get_tensor_A(i) * get_tensor_B(inp)

tensor_2_list = [tf.map_fn(lambda x: final_rt(x[0], x[1]), (idx, inp))
                 for idx, inp in enumerate(tensor_1_list)]
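No answer is recorded for this question here, but as a hedged sketch (assuming TF 1.x graph mode and that the elements of tensor_1_list share a shape and dtype), the usual pattern is to stack the list into a single tensor and let tf.map_fn iterate over it together with an index tensor, instead of mixing Python enumerate with graph tensors:

import tensorflow as tf

stacked = tf.stack(tensor_1_list)            # shape [n, ...]
indices = tf.range(tf.shape(stacked)[0])     # 0, 1, ..., n-1 as a tensor

def final_rt(args):
    i, inp = args
    return get_tensor_A(i) * get_tensor_B(inp)  # mask[i] accepts a scalar tensor index

# dtype must be given because the output dtype differs from the (int32, float32) elems
tensor_2 = tf.map_fn(final_rt, (indices, stacked), dtype=tf.float32)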

Dequeueing from RandomShuffleQueue does not reduce size

In order to train a model I have encapsulated my model in a class.
I use a tf.RandomShuffleQueue to enqueue a list of filenames.
However, when I dequeue elements they do get dequeued, but the size of the queue does not decrease.
More specific questions follow, and then the code snippet:
1. If I only have 5 images, for example, but the steps range up to 100, would this result in addfilenames being called repeatedly and automatically? It does not give me any error on dequeuing, so I am thinking that it is getting called automatically.
2. Why is the size of the tf.RandomShuffleQueue not changing? It remains constant.
import os
import time
import functools
import tensorflow as tf
from Read_labelclsloc import readlabel

def ReadTrain(traindir):
    # Returns a list of training images, their labels and a dictionary.
    # The dictionary maps label names to integer numbers.
    return trainimgs, trainlbls, classdict

def ReadVal(valdir, classdict):
    # Reads the validation image labels.
    # Returns a dictionary with filenames as keys and
    # corresponding labels as values.
    return valdict

def lazy_property(function):
    # Just a decorator to make sure that on repeated calls to
    # member functions, ops don't get created repeatedly.
    # Acknowledgements : https://danijar.com/structuring-your-tensorflow-models/
    attribute = '_cache_' + function.__name__
    @property
    @functools.wraps(function)
    def decorator(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return decorator

class ModelInitial:
    def __init__(self, traindir, valdir):
        self.graph
        self.traindir = traindir
        self.valdir = valdir
        self.traininginfo()
        self.epoch = 0

    def traininginfo(self):
        self.trainimgs, self.trainlbls, self.classdict = ReadTrain(self.traindir)
        self.valdict = ReadVal(self.valdir, self.classdict)
        with self.graph.as_default():
            self.trainimgs_tensor = tf.constant(self.trainimgs)
            self.trainlbls_tensor = tf.constant(self.trainlbls, dtype=tf.uint16)
            self.trainimgs_dict = {}
            self.trainimgs_dict["ImageFile"] = self.trainimgs_tensor
        return None

    @lazy_property
    def graph(self):
        g = tf.Graph()
        with g.as_default():
            # Layer definitions go here
            return g

    @lazy_property
    def addfilenames(self):
        # This is the function where filenames are pushed to a RandomShuffleQueue
        filename_queue = tf.RandomShuffleQueue(capacity=len(self.trainimgs), min_after_dequeue=0,
                                               dtypes=[tf.string], names=["ImageFile"],
                                               seed=0, name="filename_queue")
        sz_op = filename_queue.size()
        dq_op = filename_queue.dequeue()
        enq_op = filename_queue.enqueue_many(self.trainimgs_dict)
        return filename_queue, enq_op, sz_op, dq_op

    def Train(self):
        # The function for training.
        # I have not written the training part yet.
        # Still struggling with preprocessing.
        with self.graph.as_default():
            filename_q, filename_enqueue_op, sz_op, dq_op = self.addfilenames
            qr = tf.train.QueueRunner(filename_q, [filename_enqueue_op])
            filename_dequeue_op = filename_q.dequeue()
            init_op = tf.global_variables_initializer()
            sess = tf.Session(graph=self.graph)
            sess.run(init_op)
            coord = tf.train.Coordinator()
            enq_threads = qr.create_threads(sess, coord=coord, start=True)
            counter = 0
            for step in range(100):
                print(sess.run(dq_op["ImageFile"]))
                print("Epoch = %d " % (self.epoch))
                print("size = %d" % (sess.run(sz_op)))
                counter += 1
            names = [n.name for n in self.graph.as_graph_def().node]
            coord.request_stop()
            coord.join(enq_threads)
            print("Counter = %d" % (counter))
            return None

if __name__ == "__main__":
    modeltrain = ModelInitial(<Path to training images>,
                              <Path to validation images>)
    a = modeltrain.graph
    print(a)
    modeltrain.Train()
    print("Success")
The mystery is caused by the tf.train.QueueRunner that you created for the queue, which causes it to be filled in the background.
1. The following lines cause a background "queue runner" thread to be created:
qr = tf.train.QueueRunner(filename_q, [filename_enqueue_op])
# ...
enq_threads = qr.create_threads(sess, coord=coord, start=True)
This thread calls filename_enqueue_op in a loop, which causes the queue to be filled up as you remove elements from it.
2. The background thread from step 1 will almost always have a pending enqueue operation (filename_enqueue_op) on the queue. This means that after you dequeue a filename, the pending enqueue will run and fill the queue back up to capacity. (Technically there is a race condition here, and you could see a size of capacity - 1, but this is quite unlikely.)
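To see the size actually drop, here is a minimal sketch (TF 1.x API, filenames are illustrative) that enqueues once by hand instead of using a QueueRunner; with nothing refilling the queue, each dequeue reduces the size:

import tensorflow as tf

q = tf.RandomShuffleQueue(capacity=5, min_after_dequeue=0, dtypes=[tf.string])
enq = q.enqueue_many([["a.jpg", "b.jpg", "c.jpg", "d.jpg", "e.jpg"]])
dq = q.dequeue()
sz = q.size()
with tf.Session() as sess:
    sess.run(enq)                             # fill the queue once, by hand
    for _ in range(3):
        print(sess.run(dq), sess.run(sz))     # size prints 4, 3, 2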

Using TensorArrays in the context of a while_loop to accumulate values

Below I have an implementation of a Tensorflow RNN cell, designed to emulate Alex Graves' ACT algorithm from this paper: http://arxiv.org/abs/1603.08983.
At a single timestep in the sequence, called via rnn.rnn (with a static sequence_length parameter, so the rnn is unrolled dynamically; I am using a fixed batch size of 20), we recursively call ACTStep, producing outputs of size (1, 200), where the hidden dimension of the RNN cell is 200 and we have a batch size of 1.
Using the while loop in Tensorflow, we iterate until the accumulated halting probability is high enough. All of this works reasonably smoothly, but I am having problems accumulating states, probabilities and outputs within the while loop, which we need to do in order to create weighted combinations of these as the final cell output/state.
I have tried using a simple list, as below, but this fails when the graph is compiled because the outputs are not in the same frame (is it possible to use the "switch" function in control_flow_ops to forward the tensors to the point at which they are required, i.e. the add_n call just before we return the values?). I have also tried using the TensorArray structure, but I find it difficult to use as it seems to destroy shape information, and replacing the shape manually hasn't worked. I also haven't been able to find much documentation on TensorArrays, presumably because they are, I imagine, mainly for internal TF use.
Any advice on how it might be possible to correctly accumulate the variables produced by ACTStep would be much appreciated.
class ACTCell(RNNCell):
    """An RNN cell implementing Graves' Adaptive Computation Time algorithm"""

    def __init__(self, num_units, cell, epsilon, max_computation):
        self.one_minus_eps = tf.constant(1.0 - epsilon)
        self._num_units = num_units
        self.cell = cell
        self.N = tf.constant(max_computation)

    @property
    def input_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    @property
    def state_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        with vs.variable_scope(scope or type(self).__name__):
            # define within-cell constants/counters used to control the while loop
            prob = tf.get_variable("prob", [], tf.float32, tf.constant_initializer(0.0))
            counter = tf.get_variable("counter", [], tf.float32, tf.constant_initializer(0.0))
            tf.assign(prob, 0.0)
            tf.assign(counter, 0.0)
            # the predicate for stopping the while loop. Tensorflow demands that we have
            # all of the variables used in the while loop in the predicate.
            pred = lambda prob, counter, state, input, acc_state, acc_output, acc_probs: \
                tf.logical_and(tf.less(prob, self.one_minus_eps), tf.less(counter, self.N))
            acc_probs = []
            acc_outputs = []
            acc_states = []
            _, iterations, _, _, acc_states, acc_output, acc_probs = \
                control_flow_ops.while_loop(pred,
                                            self.ACTStep,
                                            [prob, counter, state, input, acc_states, acc_outputs, acc_probs])
            # TODO: fix last part of this, need to use the remainder.
            # TODO: find a way to accumulate the regulariser
            # here we take a weighted combination of the states and outputs
            # to use as the actual output and state which is passed to the next timestep.
            next_state = tf.add_n([tf.mul(x, y) for x, y in zip(acc_probs, acc_states)])
            output = tf.add_n([tf.mul(x, y) for x, y in zip(acc_probs, acc_outputs)])
        return output, next_state

    def ACTStep(self, prob, counter, state, input, acc_states, acc_outputs, acc_probs):
        output, new_state = rnn.rnn(self.cell, [input], state, scope=type(self.cell).__name__)
        prob_w = tf.get_variable("prob_w", [self.cell.input_size, 1])
        prob_b = tf.get_variable("prob_b", [1])
        p = tf.nn.sigmoid(tf.matmul(prob_w, new_state) + prob_b)
        acc_states.append(new_state)
        acc_outputs.append(output)
        acc_probs.append(p)
        return [tf.add(prob, p), tf.add(counter, 1.0), new_state, input, acc_states, acc_outputs, acc_probs]
I'm going to preface this response by saying that it is NOT a complete solution, but rather some commentary on how to improve your cell.
To start off, in your ACTStep function you call rnn.rnn for one timestep (as defined by [input]). If you're doing a single timestep, it is probably more efficient to simply call self.cell directly. You'll see the same mechanism used in the tensorflow rnncell wrappers.
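For a single step, something along these lines should be enough (a sketch; RNNCell instances are callable with (inputs, state)):

output, new_state = self.cell(input, state)  # one step, no unrolling machinery needed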
You mentioned that you have tried using TensorArrays. Did you pack and unpack the tensorarrays appropriately? Here is a repo where you'll find under model.py the tensorarrays are packed and unpacked properly.
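For reference, a minimal sketch (not your model; the sizes and the halting condition are illustrative) of accumulating one tensor per iteration with a TensorArray inside a while loop, then packing the result afterwards (in the TF version contemporary with this question, stack() was named pack()):

import tensorflow as tf

def cond(i, ta):
    return i < 5                                      # stand-in halting condition

def body(i, ta):
    value = tf.fill([200], tf.cast(i, tf.float32))    # stand-in for one ACTStep output
    return i + 1, ta.write(i, value)                  # write this iteration's tensor

ta = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
_, ta_final = tf.while_loop(cond, body, [tf.constant(0), ta])
accumulated = ta_final.stack()                        # shape [num_iterations, 200]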
You also asked if there is a function in control_flow_ops that will require all the tensors to be accumulated. I think you are looking for tf.control_dependencies.
You can list all of your output tensor operations in control_dependencies, and that will require tensorflow to compute all of those tensors up to that point.
Also, it looks like your counter variable is trainable. Are you sure you want this to be the case? If you're adding one to your counter, that probably won't yield the correct result. On the other hand, you could have purposely kept it trainable to differentiate it at the end for the ponder cost function.
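If the counter is not meant to be learned (this is an assumption about your intent, not something from your post), one small change would be to mark it non-trainable:

counter = tf.get_variable("counter", [], tf.float32,
                          tf.constant_initializer(0.0), trainable=False)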
Also, I believe the remainder computation should be in your script:
remainder = 1.0 - tf.add_n(acc_probs[:-1])
# note the [:-1] slice, as you do not want to include the last probability
Here is my version of your code, edited:
class ACTCell(RNNCell):
    """An RNN cell implementing Graves' Adaptive Computation Time algorithm
    Notes: https://www.evernote.com/shard/s189/sh/fd165646-b630-48b7-844c-86ad2f07fcda/c9ab960af967ef847097f21d94b0bff7
    """

    def __init__(self, num_units, cell, max_computation=5.0, epsilon=0.01):
        self.one_minus_eps = tf.constant(1.0 - epsilon)  # epsilon is 0.01 as found in the paper
        self._num_units = num_units
        self.cell = cell
        self.N = tf.constant(max_computation)

    @property
    def input_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    @property
    def state_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        with vs.variable_scope(scope or type(self).__name__):
            # define within-cell constants/counters used to control the while loop
            prob = tf.constant(0.0, shape=[batch_size])
            counter = tf.constant(0.0, shape=[batch_size])
            # the predicate for stopping the while loop. Tensorflow demands that we have
            # all of the variables used in the while loop in the predicate.
            pred = lambda prob, counter, state, input, acc_states, acc_output, acc_probs: \
                tf.logical_and(tf.less(prob, self.one_minus_eps), tf.less(counter, self.N))
            acc_probs, acc_outputs, acc_states = [], [], []
            _, iterations, _, _, acc_states, acc_output, acc_probs = \
                control_flow_ops.while_loop(
                    pred,
                    self.ACTStep,  # looks like he purposely makes the while loop here
                    [prob, counter, state, input, acc_states, acc_outputs, acc_probs])
            '''mean-field updates for states and outputs'''
            next_state = tf.add_n([x * y for x, y in zip(acc_probs, acc_states)])
            output = tf.add_n([x * y for x, y in zip(acc_probs, acc_outputs)])
            # take the last probability off to avoid a negative ponder cost;
            # the remaining problem is that we need the sum of all the remainders
            remainder = 1.0 - tf.add_n(acc_probs[:-1])
            tf.add_to_collection("ACT_remainder", remainder)  # if this doesn't work, keep a list indexed by timestep instead
            tf.add_to_collection("ACT_iterations", iterations)
        return output, next_state

    def ACTStep(self, prob, counter, state, input, acc_states, acc_outputs, acc_probs):
        '''run the rnn once'''
        output, new_state = rnn.rnn(self.cell, [input], state, scope=type(self.cell).__name__)
        prob_w = tf.get_variable("prob_w", [self.cell.input_size, 1])
        prob_b = tf.get_variable("prob_b", [1])
        halting_probability = tf.nn.sigmoid(tf.matmul(prob_w, new_state) + prob_b)
        acc_states.append(new_state)
        acc_outputs.append(output)
        acc_probs.append(halting_probability)
        return [prob + halting_probability, counter + 1.0, new_state, input, acc_states, acc_outputs, acc_probs]

    def PonderCostFunction(self, time_penalty=0.01):
        '''
        note: ponder is completely different than probability, and ponder = rho
        the ponder cost function prevents the rnn from cycling endlessly on each timestep when not much computation is needed
        '''
        n_iterations = tf.get_collection_ref("ACT_iterations")
        remainder = tf.get_collection_ref("ACT_remainder")
        return tf.reduce_sum(n_iterations + remainder)  # completely different from probability
This is a complicated paper to implement that I have been working on myself. I wouldn't mind collaborating with you to get it done in Tensorflow. If you're interested, please add me at LeavesBreathe on Skype and we can go from there.