How to load pickle files by tensorflow's tf.data API - tensorflow

I have my data in multiple pickle files stored on disk. I want to use tensorflow's tf.data.Dataset to load my data into training pipeline. My code goes:
def _parse_file(path):
image, label = *load pickle file*
return image, label
paths = glob.glob('*.pkl')
print(len(paths))
dataset = tf.data.Dataset.from_tensor_slices(paths)
dataset = dataset.map(_parse_file)
iterator = dataset.make_one_shot_iterator()
Problem is I don't know how to implement the _parse_file fuction. The argument to this function, path, is of tensor type. I tried
def _parse_file(path):
with tf.Session() as s:
p = s.run(path)
image, label = pickle.load(open(p, 'rb'))
return image, label
and got error message:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'arg0' with dtype string
[[Node: arg0 = Placeholder[dtype=DT_STRING, shape=<unknown>, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
After some search on the Internet I still have no idea how to do it. I will be grateful to anyone providing me a hint.

I have solved this myself. I should use tf.py_func as in this doc.

This is how I solved this issue. I didn't use the tf.py_func; check out function "load_encoding()" below, which is what's doing the pickle reading. The FACELIB_DIR contains directories of pickled vggface2 encodings, each directory named for the person of those face encodings.
import tensorflow as tf
import pickle
import os
FACELIB_DIR='/var/noggin/FaceEncodings'
# Get list of all classes & build a quick int-lookup dictionary
labelNames = sorted([x for x in os.listdir(FACELIB_DIR) if os.path.isdir(os.path.join(FACELIB_DIR,x)) and not x.startswith('.')])
labelStrToInt = dict([(x,i) for i,x in enumerate(labelNames)])
# Function load_encoding - Loads Encoding data from enc2048 file in filepath
# This reads an encoding from disk, and through the file path gets the label oneHot value, returns both
def load_encoding(file_path):
with open(os.path.join(FACELIB_DIR,file_path),'rb') as fin:
A,_ = pickle.loads(fin.read()) # encodings, source_image_name
label_str = tf.strings.split(file_path, os.path.sep)[-2]
return (A, labelStrToInt[label_str])
# Build the dataset of every enc2048 file in our data library
encpaths = []
for D in sorted([x for x in os.listdir(FACELIB_DIR) if os.path.isdir(os.path.join(FACELIB_DIR,x)) and not x.startswith('.')]):
# All the encoding files
encfiles = sorted(filter((lambda x: x.endswith('.enc2048')), os.listdir(os.path.join(FACELIB_DIR, D))))
encpaths += [os.path.join(D,x) for x in encfiles]
dataset = tf.data.Dataset.from_tensor_slices(encpaths)
# Shuffle and speed improvements on the dataset
BATCH_SIZE = 64
from tensorflow.data import AUTOTUNE
dataset = (dataset
.shuffle(1024)
.cache()
.repeat()
.batch(BATCH_SIZE)
.prefetch(AUTOTUNE)
)
# Benchmark our tf.data pipeline
import time
datasetGen = iter(dataset)
NUM_STEPS = 10000
start_time = time.time()
for i in range(0, NUM_STEPS):
X = next(datasetGen)
totalTime = time.time() - start_time
print('==> tf.data generated {} tensors in {:.2f} seconds'.format(BATCH_SIZE * NUM_STEPS, totalTime))

tf.py_func
This function is used to solved that problem and also as menstion in doc.

Related

Why ImportExampleGen reads TFRecords as SparseTensor instead of Tensor?

I'm converting a CSV file into a TFRecords file like this:
File: ./dataset/csv/file.csv
feature_1, feture_2, output
1, 1, 1
2, 2, 2
3, 3, 3
import tensorflow as tf
import csv
import os
print(tf.__version__)
def create_csv_iterator(csv_file_path, skip_header):
with tf.io.gfile.GFile(csv_file_path) as csv_file:
reader = csv.reader(csv_file)
if skip_header: # Skip the header
next(reader)
for row in reader:
yield row
def _int64_feature(value):
"""Returns an int64_list from a bool / enum / int / uint."""
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def create_example(row):
"""
Returns a tensorflow.Example Protocol Buffer object.
"""
features = {}
for feature_index, feature_name in enumerate(["feature_1", "feture_2", "output"]):
feature_value = row[feature_index]
features[feature_name] = _int64_feature(int(feature_value))
return tf.train.Example(features=tf.train.Features(feature=features))
def create_tfrecords_file(input_csv_file):
"""
Creates a TFRecords file for the given input data
"""
output_tfrecord_file = input_csv_file.replace("csv", "tfrecords")
writer = tf.io.TFRecordWriter(output_tfrecord_file)
print("Creating TFRecords file at", output_tfrecord_file, "...")
for i, row in enumerate(create_csv_iterator(input_csv_file, skip_header=True)):
if len(row) == 0:
continue
example = create_example(row)
content = example.SerializeToString()
writer.write(content)
writer.close()
print("Finish Writing", output_tfrecord_file)
create_tfrecords_file("./dataset/csv/file.csv")
Then I'll read the generated TFRecords files using ImportExampleGen class:
import os
import absl
import tensorflow_model_analysis as tfma
tf.get_logger().propagate = False
from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip
context = InteractiveContext()
example_gen = tfx.components.ImportExampleGen(input_base="./dataset/tfrecords")
context.run(example_gen, enable_cache=True)
statistics_gen = tfx.components.StatisticsGen(
examples=example_gen.outputs['examples'])
context.run(statistics_gen, enable_cache=True)
schema_gen = tfx.components.SchemaGen(
statistics=statistics_gen.outputs['statistics'],
infer_feature_shape=False)
context.run(schema_gen, enable_cache=True)
File: ./transform.py
def preprocessing_fn(inputs):
"""tf.transform's callback function for preprocessing inputs.
Args:
inputs: map from feature keys to raw not-yet-transformed features.
Returns:
Map from string feature key to transformed feature operations.
"""
print(inputs)
return inputs
transform = tfx.components.Transform(
examples=example_gen.outputs['examples'],
schema=schema_gen.outputs['schema'],
module_file=os.path.abspath("./transform.py"))
context.run(transform, enable_cache=True)
In the preprocessing_fn function shows that inputs is a SparseTensor objects. My question is why? As far as I can tell, my dataset's samples are dense and they should be Tensor instead. Am I doing something wrong?
For anyone else who might be struggling with the same issue, I found the culprit. It's the SchemaGen class. This is how I was instantiating its object:
schema_gen = tfx.components.SchemaGen(
statistics=statistics_gen.outputs['statistics'],
infer_feature_shape=False)
I don't know what's the use case for asking SchemaGen class not to infer the shape of the features but the tutorial I was following had it set to False and I had just copied and pasted the same thing. Comparing with some other tutorials, I realized that it could be the reason why I was getting SparseTensor.
So, if you let SchemaGen infer the shape of your features or you load a hand crafted schema in which you've set the shapes yourself, you'll be getting a Tensor in your preprocessing_fn. But if the shapes are not set, the features will be instances of SparseTensor.
For the sake of completeness, this is the fixed snippet:
schema_gen = tfx.components.SchemaGen(
statistics=statistics_gen.outputs['statistics'],
infer_feature_shape=True)

Receiving the same (not random) augmentations of image dataset

dataset = tf.data.Dataset.range(1, 6)
def aug(y):
x = np.random.uniform(0,1)
if x > 0.5:
y = 100
return y
dataset = dataset.map(aug)
print(list(dataset))
Run this code, then all the elements in the dataset are as they were, or all equal to 100. How do I make it so each element is individually transformed?
My more specific question below is basically asking this
I create my segmentation training set by:
dataset = tf.data.Dataset.from_tensor_slices((image_paths, mask_paths))
I then apply my augmentation function to the dataset:
def augment(image_path, mask_path)):
//use tf.io.read_file and tf.io.decode_jpeg to convert paths to tensors
x = np.random.choice([0,1])
if x == 1:
image = tf.image.flip_up_down(image)
mask = tf.image.flip_up_down(mask)
return image, mask
training_dataset = dataset.map(augment)
BATCH_SIZE=2
training_dataset = training_dataset.shuffle(100, reshuffle_each_iteration=True)
training_dataset = training_dataset.batch(BATCH_SIZE)
training_dataset = training_dataset.repeat()
training_dataset = training_dataset.prefetch(-1)
However when I visualise my training dataset, all the images have same flip applied- the are all either flipped upside down or not flipped. Where as I'm expecting them to have different flips- some upside down and some not.
Why is this happening?
You need to use tensorflow operations (not numpy or normal python) because tf.data.Dataset.map() executes the mapped function as a graph. When converting a function to a graph, numpy and base python are converted to constants. The augmentation function is only running np.random.uniform(0,1) once and storing it as a constant.
Note that irrespective of the context in which map_func is defined (eager vs. graph), tf.data traces the function and executes it as a graph.
The source for the above is here.
One solution is to use tensorflow operations. I have included an example below. Note that the y value in the if has to be cast to the same dtype as the input.
dataset = tf.data.Dataset.range(1, 6)
def aug(y):
x = tf.random.uniform([], 0, 1)
if x > 0.5:
y = tf.cast(100, y.dtype)
return y
dataset = dataset.map(aug)
print(list(dataset))
You can use a uniform random function or other probability distribution
tf.random.uniform(
shape, minval=0, maxval=None, dtype=tf.dtypes.float32, seed=None, name=None
)
even you can use prebuild method in TensorFlow or Keras for fliping
tf.keras.layers.experimental.preprocessing.RandomFlip(
mode=HORIZONTAL_AND_VERTICAL, seed=None, name=None, **kwargs
)

A bytes-like object is required, not 'Tensor' when calling map on string tensors in eager mode

I am trying to use TF.dataset.map to port over this old code because I get a deprecation warning.
Old code which reads a set of custom protos from a TFRecord file:
record_iterator = tf.python_io.tf_record_iterator(path=filename)
for record in record_iterator:
example = MyProto()
example.ParseFromString(record)
I am trying to use eager mode and map, but I get this error.
def parse_proto(string):
proto_object = MyProto()
proto_object.ParseFromString(string)
dataset = tf.data.TFRecordDataset(dataset_paths)
parsed_protos = raw_tf_dataset.map(parse_proto)
This code works:
for raw_record in raw_tf_dataset:
proto_object = MyProto()
proto_object.ParseFromString(raw_record.numpy())
But the map gives me an error:
TypeError: a bytes-like object is required, not 'Tensor'
What is the right way to take use the argument the function results of the map and treat them like a string?
You need to extract string form the tensor and use in the map function. Below are the steps to be implemented in the code to achieve this.
You have to decorate the map function with tf.py_function(get_path, [x], [tf.float32]). You can find more about tf.py_function here. In tf.py_function, first argument is the name of map function, second argument is the element to be passed to map function and final argument is the return type.
You can get your string part by using bytes.decode(file_path.numpy()) in map function.
So modify your program as below,
parsed_protos = raw_tf_dataset.map(parse_proto)
to
parsed_protos = raw_tf_dataset.map(lambda x: tf.py_function(parse_proto, [x], [function return type]))
Also modify parse_proto as below,
def parse_proto(string):
proto_object = MyProto()
proto_object.ParseFromString(string)
to
def parse_proto(string):
proto_object = MyProto()
proto_object.ParseFromString(bytes.decode(string.numpy()))
In the below simple program, we are using tf.data.Dataset.list_files to read path of the image. Next in the map function we are reading the image using load_img and later doing the tf.image.central_crop function to crop central part of the image.
Code -
%tensorflow_version 2.x
import tensorflow as tf
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array, array_to_img
from matplotlib import pyplot as plt
import numpy as np
def load_file_and_process(path):
image = load_img(bytes.decode(path.numpy()), target_size=(224, 224))
image = img_to_array(image)
image = tf.image.central_crop(image, np.random.uniform(0.50, 1.00))
return image
train_dataset = tf.data.Dataset.list_files('/content/bird.jpg')
train_dataset = train_dataset.map(lambda x: tf.py_function(load_file_and_process, [x], [tf.float32]))
for f in train_dataset:
for l in f:
image = np.array(array_to_img(l))
plt.imshow(image)
Output -
Hope this answers your question. Happy Learning.

Caffe always returns one label

I have trained a model with caffe tools under bin and now I am trying to do testing using python script, I read in an image and preprocess it myself (as I did for my training dataset) and I load the pretrained weights to the net, but I am almost always (99.99% of the time) receiving the same result -0- for every test image. I did consider that my model might be overfitting but after training a few models, I have come to realize the labels I get from predictions are most likely the cause. I have also increased dropout and took random crops to overcome overfitting and I have about 60K for training. The dataset is also roughly balanced. I get between 77 to 87 accuracy during evaluation step of training (depending on how I process data, what architecture I use etc)
Excuse my super hacky code, I have been distant to caffe testing for some time so I suspect the problem is how I pass the input data to the network, but I can't put my finger on it:
import h5py, os
import sys
sys.path.append("/home/X/Desktop/caffe-caffe-0.16/python")
from caffe.io import oversample
from caffe.io import resize_image
import caffe
from random import randint
import numpy as np
import cv2
import matplotlib.pyplot as plt
from collections import Counter as Cnt
meanImg = cv2.imread('/home/caffe/data/Ch/Final_meanImg.png')
model_def = '/home/X/Desktop/caffe-caffe-0.16/models/bvlc_googlenet/deploy.prototxt'
model_weights = '/media/X/DATA/SDet/Google__iter_140000.caffemodel'
# load the model
#caffe.set_mode_gpu()
#caffe.set_device(0)
net = caffe.Net(model_def, # defines the structure of the model
model_weights, # contains the trained weights
caffe.TEST) # use test mode (e.g., don't perform dropout)
with open( '/home/caffe/examples/sdet/SDet/test_random.txt', 'r' ) as T, open('/media/X/DATA/SDet/results/testResults.txt','w') as testResultsFile:
readImgCounter = 0
runningCorrect = 0
runningAcc = 0.0
#testResultsFile.write('filename'+' '+'prediction'+' '+'GT')
lines = T.readlines()
for i,l in enumerate(lines):
sp = l.split(' ')
video = sp[0].split('_')[0]
impath = '/home/caffe/data/Ch/images/'+video+'/'+sp[0] +'.jpg'
img = cv2.imread(impath)
resized_img = resize_image(img, (255,255))
oversampledImages = oversample([resized_img], (224,224)) #5 crops x 2 mirror flips = return 10 images
transposed_img = np.zeros( (10, 3, 224, 224), dtype='f4' )
tp = np.zeros( (1, 3, 224, 224), dtype='f4' )
predictedLabels = []
for j in range(0,oversampledImages.shape[0]-1):
transposed_img[j] = oversampledImages[j].transpose((2,0,1))
tp[0] = transposed_img[j]
net.blobs['data'].data[0] = tp
pred = net.forward(data=tp)
predictedLabels.append(pred['prob'].argmax())
print(predictedLabels)
prediction,num_most_common = Cnt(predictedLabels).most_common(1)[0]
print(prediction)
readImgCounter = readImgCounter + 1
if (prediction == int(sp[1])):
runningCorrect = runningCorrect + 1
runningAcc = runningCorrect / readImgCounter
print('runningAcc:')
print(runningAcc)
print('-----------')
print('runningCorrect:')
print(runningCorrect)
print('-----------')
print('totalImgRead:')
print(readImgCounter)
print('-----------')
testResultsFile.write(sp[0]+' '+str(prediction)+' '+sp[1])
testResultsFile.write('\n')
I have fixed this problem eventually. I am not 100% sure what worked but it was most likely changing the bias to 0 while learning.

tensorboard with numpy array

Can someone give a example on how to use tensorboard visualize numpy array value?
There is a related question here, I don't really get it.
Tensorboard logging non-tensor (numpy) information (AUC)
For example,
If I have
for i in range(100):
foo = np.random.rand(3,2)
How can I keep tracking the distribution of foo using tensorboard for 100 iterations? Can someone give a code example?
Thanks.
For simple values (scalar), you can use this recipe
summary_writer = tf.train.SummaryWriter(FLAGS.logdir)
summary = tf.Summary()
summary.value.add(tag=tagname, simple_value=value)
summary_writer.add_summary(summary, global_step)
summary_writer.flush()
As far as using array, perhaps you can add 6 values in a sequence, ie
for value in foo:
summary.value.add(tag=tagname, simple_value=value)
Another (simplest) way is just using placeholders. First, you can make a placeholder for your numpy array shape.
# Some place holders for summary
summary_reward = tf.placeholder(tf.float32, shape=(), name="reward")
tf.summary.scalar("reward", summary_reward)
Then, just call session.run the merged summary with the feed_dict.
# Summary
summ = tf.summary.merge_all()
...
s = sess.run(summ, feed_dict={summary_reward: reward})
writer.add_summary(s, i)
if you install this package via pip install tensorboard-pytorch it becomes as straightforward as it can get:
import numpy as np
from tensorboardX import SummaryWriter
writer = SummaryWriter()
for i in range(50):
writer.add_histogram("moving_gauss", np.random.normal(i, i, 1000), i, bins="auto")
writer.close()
Will generate the corresponding histogram data in the runs directory:
Found a way to work around, create a variable and assign the value of numpy array to the variable, use tensorboard to track the variable
mysummary_writer = tf.train.SummaryWriter("./tmp/test/")
a = tf.Variable(tf.zeros([3,2]), name="a")
sum1 = tf.histogram_summary("nparray1", a)
summary_op = tf.merge_all_summaries()
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for ii in range(10):
foo = np.random.rand(3, 2)
assign_op = a.assign(foo)
summary, _ = sess.run([summary_op, assign_op])
mysummary_writer.add_summary(tf.Summary.FromString(summary), global_step=ii)
mysummary_writer.flush()
sess = tf.Session()
writer = tf.summary.FileWriter('tensorboard_test')
var = tf.Variable(0.0,trainable=False,name='loss')
sess.run(var.initializer)
summary_op = tf.summary.scalar('scalar1',var)
for value in array:
sess.run(var.assign(value))
summary = sess.run(summary_op)
writer.add_summary(summary,i)
It works, but slow.
You could define a function like this (taken from gyglim's gist):
def add_histogram(writer, tag, values, step, bins=1000):
"""
Logs the histogram of a list/vector of values.
From: https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
"""
# Create histogram using numpy
counts, bin_edges = np.histogram(values, bins=bins)
# Fill fields of histogram proto
hist = tf.HistogramProto()
hist.min = float(np.min(values))
hist.max = float(np.max(values))
hist.num = int(np.prod(values.shape))
hist.sum = float(np.sum(values))
hist.sum_squares = float(np.sum(values ** 2))
# Requires equal number as bins, where the first goes from -DBL_MAX to bin_edges[1]
# See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto#L30
# Therefore we drop the start of the first bin
bin_edges = bin_edges[1:]
# Add bin edges and counts
for edge in bin_edges:
hist.bucket_limit.append(edge)
for c in counts:
hist.bucket.append(c)
# Create and write Summary
summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
writer.add_summary(summary, step)
And then add to the summary writer like this:
add_histogram(summary_writer, "Histogram_Name", your_numpy_array, step)
You can plot the vector with matplotlib, convert the plot to numpy array along the lines of
https://stackoverflow.com/a/35362787/10873169, and then add it to Tensorboard as image
import numpy as np
from matplotlib.backends.backend_agg import FigureCanvasAgg
from matplotlib.figure import Figure
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/example")
for step in range(1, 10):
# Time-dependent vector we want to plot
example_vector = np.sin(np.arange(100) / step)
# Plot it in matplotlib first. Default DPI doesn't look good in Tensorboard
fig = Figure(figsize=(5, 2), dpi=200)
canvas = FigureCanvasAgg(fig)
fig.gca().plot(example_vector)
canvas.draw()
# Get the image as a string of bytes
image_as_string = np.frombuffer(canvas.tostring_rgb(), dtype='uint8')
# Need to reshape to (height, width, channels)
target_shape = canvas.get_width_height()[::-1] + (3,)
reshaped_image = image_as_string.reshape(target_shape)
# Write to Tensorboard logs
writer.add_image("example_vector", reshaped_image,
dataformats="HWC", global_step=step)
writer.close()