I am trying to create a tf.data.Dataset from a generator I wrote, following this great answer: Split .tfrecords file into many .tfrecords files
Generator Code
def get_examples_generator(num_variants, vcf_reader):
    def generator():
        counter = 0
        for vcf_read in vcf_reader:
            is_vcf_ok = ...  # checking whether this "vcf" example is ok
            if is_vcf_ok and counter < num_variants:
                counter += 1
                # features extraction ...
                # we create an example
                example = make_example(img=img, label=label)  # returns a SerializedExample
                yield example
    return generator
TFRecordWriter Usage Code
def write_sharded_tfrecords(filename, path, vcf_reader,
                            num_variants, shard_len):
    assert Path(path).exists(), "path does not exist"
    generator = get_examples_generator(num_variants=num_variants,
                                       vcf_reader=vcf_reader)
    dataset = tf.data.Dataset.from_generator(generator,
                                             output_types=tf.string,
                                             output_shapes=())
    num_shards = int(np.ceil(num_variants / shard_len))
    formatter = lambda shard_idx: f'{path}/{filename}-{shard_idx:05d}-of-' \
                                  f'{num_shards:05d}.tfrecord'
    # inspired by https://stackoverflow.com/questions/54519309/split-tfrecords-file-into-many-tfrecords-files
    for i in range(num_shards):
        shard_path = formatter(i)
        writer = tf.data.experimental.TFRecordWriter(shard_path)
        shard = dataset.shard(num_shards, index=i)
        writer.write(shard)
This is supposed to be a straightforward use of the TFRecord writer. However, it does not write any files at all. Does anyone understand why this doesn't work?
In my functions, I create the writer with tf.io.TFRecordWriter. Try changing your writer and see if it works:
writer = tf.io.TFRecordWriter
...
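For illustration, here is a fuller sketch of that approach, assuming TF 2.x eager execution and the dataset of serialized examples from the question (num_shards, formatter, and dataset are the question's own names):

import tensorflow as tf

for i in range(num_shards):
    shard = dataset.shard(num_shards, index=i)
    # tf.io.TFRecordWriter is a plain Python writer, usable as a context manager
    with tf.io.TFRecordWriter(formatter(i)) as writer:
        for serialized_example in shard:
            # each element is a scalar string tensor; .numpy() yields the bytes
            writer.write(serialized_example.numpy())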
As a further reference, this answer helped me:
https://stackoverflow.com/a/60283571
I am trying to profile GPflow using the timeline and visualize it with Chrome tracing. But the trace does not seem to show the optimization process (only model construction and prediction). I define a custom config:
custom_config = gpflow.settings.get_settings()
custom_config.profiling.output_file_name = 'gpflow_timeline'
custom_config.profiling.dump_timeline = True
And try to make a simple prediction after optimization:
with gpflow.settings.temp_settings(custom_config), gpflow.session_manager.get_session().as_default():
    k = gpflow.kernels.RBF()
    m = gpflow.models.GPR(X_train, y_train, kern=k)
    run_adam(m, lr=0.1, iterations=100, callback=__PrintAction(m, 'GPR with Adam'))
    mean, var = m.predict_y(X_test)
where the print callback and the Adam optimizer loop are defined as:
class __PrintAction(Action):
    def __init__(self, model, text):
        self.model = model
        self.text = text

    def run(self, ctx):
        likelihood = ctx.session.run(self.model.likelihood_tensor)
        print('{}: iteration {} likelihood {:.4f}'.format(self.text, ctx.iteration, likelihood))
def run_adam(model, lr, iterations, callback=None):
    adam = gpflow.train.AdamOptimizer(lr).make_optimize_action(model)
    actions = [adam] if callback is None else [adam, callback]
    loop = Loop(actions, stop=iterations)()
    model.anchor(model.enquire_session())
Is it somehow possible to also show the optimization trace on the timeline?
Extension to @tadejk's answer:
You can modify gpflowrc in the GPflow/gpflow project folder, or create it in the same folder where you run the code, and tune your profiling parameters there.
[logging]
# possible levels: CRITICAL, ERROR, WARNING, INFO, DEBUG, NOTSET
level = WARNING
[verbosity]
tf_compile_verb = False
[dtypes]
float_type = float64
int_type = int32
[numerics]
jitter_level = 1e-6
# quadrature can be set to: allow, warn, error
ekern_quadrature = warn
[profiling]
dump_timeline = False
dump_tensorboard = False
output_file_name = timeline
output_directory = ./
each_time = False
[session]
intra_op_parallelism_threads = 0
inter_op_parallelism_threads = 0
Not 100% sure, but merging everything into one JSON file might be a bad idea. A single trace file is produced per session.run, so merging everything into one can mess things up.
I have set:
custom_config.profiling.each_time = True
to get the trace files after each run. I then merged the traces using jq:
jq -s '{traceEvents: map(.traceEvents[])}' gpflow_timeline_* >> gpflow_timeline_all.json
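For reference, the same merge can be done with a small Python script (a sketch, assuming each trace file is Chrome-tracing JSON with a top-level traceEvents list):

import glob
import json

events = []
for path in sorted(glob.glob('gpflow_timeline_*')):
    if path.endswith('_all.json'):
        continue  # skip a previously merged output file
    with open(path) as f:
        events.extend(json.load(f)['traceEvents'])

with open('gpflow_timeline_all.json', 'w') as f:
    json.dump({'traceEvents': events}, f)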
I was trying to read a tflite model and pull all the parameters of the layers out.
My steps:
I generated the flatbuffers model representation by running (build flatc first):
flatc --python tensorflow/tensorflow/lite/schema/schema.fbs
The result is a tflite/ folder that contains the layer description files (*.py) and some utility files.
I successfully loaded the model:
# in case of an ImportError: set PYTHONPATH to point to the folder where tflite/ is
from tflite.Model import Model

def read_tflite_model(file):
    buf = open(file, "rb").read()
    buf = bytearray(buf)
    model = Model.GetRootAsModel(buf, 0)
    return model
I partly pulled the model and node parameters out, but got stuck iterating over the nodes:
Model part:
def print_model_info(model):
    version = model.Version()
    print("Model version:", version)
    description = model.Description().decode('utf-8')
    print("Description:", description)
    subgraph_len = model.SubgraphsLength()
    print("Subgraph length:", subgraph_len)
Nodes part:
def print_nodes_info(model):
    # what does this 0 mean? should it always be zero?
    subgraph = model.Subgraphs(0)
    operators_len = subgraph.OperatorsLength()
    print('Operators length:', operators_len)

    from collections import deque
    nodes = deque(subgraph.InputsAsNumpy())

    STEP_N = 0
    MAX_STEPS = operators_len
    print("Nodes info:")
    while len(nodes) != 0 and STEP_N <= MAX_STEPS:
        print("MAX_STEPS={} STEP_N={}".format(MAX_STEPS, STEP_N))
        print("-" * 60)

        node_id = nodes.pop()
        print("Node id:", node_id)

        tensor = subgraph.Tensors(node_id)
        print("Node name:", tensor.Name().decode('utf-8'))
        print("Node shape:", tensor.ShapeAsNumpy())

        # which type is it? what does it mean?
        type_of_tensor = tensor.Type()
        print("Tensor type:", type_of_tensor)

        quantization = tensor.Quantization()
        min = quantization.MinAsNumpy()
        max = quantization.MaxAsNumpy()
        scale = quantization.ScaleAsNumpy()
        zero_point = quantization.ZeroPointAsNumpy()
        print("Quantization: ({}, {}), s={}, z={}".format(min, max, scale, zero_point))

        # I do not understand it again. what is j, that I set to 0 here?
        operator = subgraph.Operators(0)
        for i in operator.OutputsAsNumpy():
            nodes.appendleft(i)

        STEP_N += 1
    print("-" * 60)
Please point me to documentation or some example of using this API.
My problems are:
I cannot find documentation on this API.
Iterating over Tensor objects seems impossible, since a Tensor has no Inputs or Outputs methods. Also, in subgraph.Operators(j=0) I do not understand what j means. Because of that, my loop goes through two nodes: the input (once) and the next one over and over again.
Iterating over Operator objects is certainly possible:
Here we iterate over them all, but I cannot figure out how to map an Operator to its Tensors (see the sketch after this question).
def print_in_out_info_of_all_operators(model):
    # what does this 0 mean? should it always be zero?
    subgraph = model.Subgraphs(0)
    for i in range(subgraph.OperatorsLength()):
        operator = subgraph.Operators(i)
        print('Outputs', operator.OutputsAsNumpy())
        print('Inputs', operator.InputsAsNumpy())
I do not understand how to pull parameters out of an Operator object. The BuiltinOptions method gives me a Table object, and I do not know what to map it onto.
subgraph = model.Subgraphs(0)
What does this 0 mean? Should it always be zero? Obviously not, but what is it? The ID of the subgraph? If so, I'm happy. If not, please try to explain it.
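For what it's worth, here is a minimal sketch of the mapping as I understand the generated schema: an Operator's Inputs/Outputs hold integer indices into the subgraph's tensor table, and a model may contain several subgraphs (index 0 is simply the first one). This is an assumption based on the generated flatbuffers API above, not official documentation:

def print_operator_tensors(model):
    # a model may contain several subgraphs; iterate them all
    for s in range(model.SubgraphsLength()):
        subgraph = model.Subgraphs(s)
        for i in range(subgraph.OperatorsLength()):
            operator = subgraph.Operators(i)
            # Inputs/Outputs are indices into subgraph.Tensors(...)
            for tensor_idx in operator.InputsAsNumpy():
                if tensor_idx == -1:
                    continue  # -1 marks an optional, absent input
                print('input:', subgraph.Tensors(tensor_idx).Name().decode('utf-8'))
            for tensor_idx in operator.OutputsAsNumpy():
                print('output:', subgraph.Tensors(tensor_idx).Name().decode('utf-8'))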
I am trying to print the value of a TensorFlow variable that is defined inside an object. To better illustrate my issue, I am currently trying to run the monodepth library. It has two main files, dataloader and main. Basically, the dataloader iterates over a text file of image filenames:
class MonodepthDataloader(object):
    """monodepth dataloader"""

    def __init__(self, data_path, filenames_file, params, dataset, mode):
        self.data_path = data_path
        self.params = params
        self.dataset = dataset
        self.mode = mode
        self.left_image_batch = None
        self.right_image_batch = None

        input_queue = tf.train.string_input_producer([filenames_file], shuffle=False)
        line_reader = tf.TextLineReader()
        _, line = line_reader.read(input_queue)
        split_line = tf.string_split([line]).values
        ...
        left_image_path = tf.string_join([self.data_path, split_line[0]])
        left_image_o = self.read_image(left_image_path)
I am trying to print out left_image_path to verify that it is being generated correctly. However, this is inside an object instantiated by monodepth_main; that is, monodepth_main calls the dataloader with the following lines:
dataloader = MonodepthDataloader(args.data_path, args.filenames_file, params, args.dataset, args.mode)
left = dataloader.left_image_batch
As a result, I can't just use sess.run(x). I have also tried tf.Print(line, [line]), but nothing shows up.
How do I print out the value of a tensorflow variable inside an object? Specifically left_image_path?
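For context, in graph mode tf.Print is a pass-through op: the message is printed only when the tensor it returns is evaluated, so the returned tensor has to be used downstream. A minimal sketch of that pattern (names follow the question; this is an illustration, not monodepth's actual code):

line = tf.Print(line, [line], message="line: ")  # keep using the RETURNED tensor
split_line = tf.string_split([line]).values
# the message appears when anything built from split_line is run in a
# session with queue runners started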
I am using the Dataset API list_files in order to get a list of files in a source directory and target directory, something like:
source_path = '/tmp/data/source/*.ext1'
target_path = '/tmp/data/target/*.ext2'
source_dataset = tf.data.Dataset.list_files(source_path)
target_dataset = tf.data.Dataset.list_files(target_path)
dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
Source and target dir contents have the same sequential filenames, but different extensions (e.g., source 0001.ext1 <-> target 0001.ext2).
But since list_files is not ordered in any way, the zipped dataset contains mismatches between the source and the target.
How can I solve this within the new dataset API?
The default behavior of this method is to return filenames in a non-deterministic random shuffled order. Pass a seed or shuffle=False to get results in a deterministic order.
source_dataset = tf.data.Dataset.list_files(source_path, shuffle=False)
or
val = 5
source_dataset = tf.data.Dataset.list_files(source_path, seed=val)
target_dataset = tf.data.Dataset.list_files(target_path, seed=val)
I had the same issue and I solved it by sorting the file paths first.
My files are named like in OP's case:
input image -> corresponding output
data/mband/01.tif -> data/gt_mband/01.tif
data/mband/02.tif -> data/gt_mband/02.tif
The code looks like this:
from pathlib import Path
import tensorflow as tf
DATA_PATH = Path("data")
# Sort the PATHS
img_paths = sorted(map(str, (DATA_PATH / 'mband').glob('*.tif')))
mask_paths = sorted(map(str, (DATA_PATH / 'gt_mband').glob('*.tif')))
# These are tensors of PATHS
# Paths are strings, so order will be preserved
img_paths = tf.data.Dataset.from_tensor_slices(img_paths)
mask_paths = tf.data.Dataset.from_tensor_slices(mask_paths)
# Load the actual images
def parse_image(image_path: 'some_tensor'):
    # Load the image somehow...
    return image_as_tensor

imgs = img_paths.map(parse_image)
masks = mask_paths.map(parse_mask)  # parse_mask would be defined analogously
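If it helps, here is one possible body for parse_image; this is a sketch assuming PNG or JPEG inputs (tf.image has no TIFF decoder, so .tif files like the ones above would need tensorflow_io or a tf.py_function instead):

def parse_image(image_path):
    data = tf.io.read_file(image_path)          # raw bytes
    image = tf.io.decode_png(data, channels=3)  # assumption: PNG input
    return tf.image.convert_image_dtype(image, tf.float32)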
My goal is as follows:
1) Use tf.train.string_input_producer and tf.TextLineReader to read lines from files.
2) Convert the resulting tensors containing the files' lines into ordinary strings using eval, to do preprocessing before batching (TensorFlow's limited string operations are insufficient for my purposes).
3) Convert these preprocessed strings back to tensors (presumably using tf.constant?).
4) Use tf.train.batch on the resulting tensors.
The following code is a simplified version of what I'm working on.
The "After batch." print statement gets executed, but the REPL then hangs on the final print with batch.eval.
From what I've read, I have a feeling this is because
threads = tf.train.start_queue_runners(coord = coord, sess = sess)
needs to be run after calling tf.train.batch. But if I do this, then the REPL will of course hang on the first eval
evalue = value.eval(session = sess)
needed to do the preprocessing.
What is the best way to convert back and forth between tensors and their values inbetween queues? (I'm really hoping I can do this without preprocessing my data files beforehand.)
import tensorflow as tf
import os

def process(string):
    return string.upper()

def main():
    sess = tf.Session()

    filenames = tf.constant(["test_data/" + f for f in os.listdir("./test_data")])
    filename_queue = tf.train.string_input_producer(filenames)
    file_reader = tf.TextLineReader()
    key, value = file_reader.read(filename_queue)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    evalue = value.eval(session=sess)
    proc_value = process(evalue)
    tensor_value = tf.constant(proc_value)

    batch = tf.train.batch([tensor_value], batch_size=2, capacity=2)

    print("After batch.")
    print(batch.eval(session=sess))
We discussed a slightly different approach, which I think achieves what you need here:
Converting TensorFlow tutorial to work with my own data
Not sure what file formats you are reading, but the above example reads CSVs row-by-row and packs them into randomized batches.
If you are reading from a CSV then, in a nutshell, instead of returning value from file_reader.read(filename_queue) immediately, you could try to do some pre-processing first and return THAT instead, something like this:
rDefaults = [['a'] for row in range(ROW_LENGTH)]

_, value = reader.read(filename_queue)
whole_row = tf.decode_csv(value, record_defaults=rDefaults)
cell1 = tf.slice(whole_row, [0], [1])  # one specific cell that contains a string
cell2 = tf.slice(whole_row, [1], [1])  # another cell that contains a string
# do some processing on cell1 and cell2
return cell1, cell2
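The preprocessed tensors can then go into tf.train.batch in place of the raw value. A sketch, assuming the snippet above is wrapped in a read_row(filename_queue) helper (a name made up here for illustration):

cell1, cell2 = read_row(filename_queue)  # hypothetical wrapper around the snippet above
batch1, batch2 = tf.train.batch([cell1, cell2], batch_size=2, capacity=2)
# start queue runners only after the whole pipeline (including batch) is built
threads = tf.train.start_queue_runners(coord=coord, sess=sess)
print(sess.run([batch1, batch2]))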