How to prepare my own data for tensorflow?

How to prepare my own data for tensorflow? - tensorflow

I installed TensorFlow on Ubuntu 14.04. I completed the MNIST For ML Beginners tutorial and understood it.
Now I am trying to use my own data. My training data is T[1000][10]. Labels are L[2], 1 or 0.
How can I access my data the way the tutorial accesses mnist.train.images?

In input_data.py, these two functions do the main job.
1. Download
def maybe_download(filename, work_directory):
    """Download the data from Yann's website, unless it's already here.

    Args:
        filename: Name of the file; appended to ``SOURCE_URL`` to form the
            download URL and to ``work_directory`` to form the local path.
        work_directory: Local directory that holds the file. It is created,
            including any missing parent directories, when absent.

    Returns:
        The local path of the file.
    """
    if not os.path.exists(work_directory):
        # makedirs (rather than mkdir) so a nested work_directory such as
        # "data/mnist" works even when "data" does not exist yet.
        os.makedirs(work_directory)
    filepath = os.path.join(work_directory, filename)
    if not os.path.exists(filepath):
        # Only hit the network when there is no local copy.
        filepath, _ = urlretrieve(SOURCE_URL + filename, filepath)
        statinfo = os.stat(filepath)
        print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
    return filepath
2. Image to nparray
def extract_images(filename):
    """Extract the images into a 4D uint8 numpy array [index, y, x, depth].

    Args:
        filename: Path of a gzip-compressed MNIST image file.

    Returns:
        A uint8 array reshaped to ``(num_images, rows, cols, 1)``.

    Raises:
        ValueError: If the magic number is not 2051, or if the file contains
            fewer pixel bytes than its header announces.
    """
    print('Extracting', filename)
    with gzip.open(filename) as bytestream:
        # Header: magic number, image count, rows, cols (each read via _read32).
        magic = _read32(bytestream)
        if magic != 2051:
            raise ValueError(
                'Invalid magic number %d in MNIST image file: %s' %
                (magic, filename))
        num_images = _read32(bytestream)
        rows = _read32(bytestream)
        cols = _read32(bytestream)
        expected = rows * cols * num_images
        buf = bytestream.read(expected)
    # Fail with a clear message on a truncated file instead of letting the
    # reshape below raise an opaque size-mismatch error.
    if len(buf) != expected:
        raise ValueError(
            'Truncated MNIST image file %s: expected %d bytes, got %d' %
            (filename, expected, len(buf)))
    data = numpy.frombuffer(buf, dtype=numpy.uint8)
    # One byte per pixel; the trailing axis is a single (grayscale) channel.
    return data.reshape(num_images, rows, cols, 1)
Based on your dataset and location, you can call:
local_file = maybe_download(TRAIN_IMAGES, train_dir)
train_images = extract_images(local_file)
See the full source code at https://github.com/nlintz/TensorFlow-Tutorials/blob/master/input_data.py.

Related

How to use TensorFlow lite on a raspberry pi 4 without keras?

Basically I want to convert this code snippet to code that opens a tflite model and does not use keras. I can not install keras on my raspberry pi 4 as it needs Tensorflow 2+.
model = keras.models.load_model( saved_model_path )
image_url = tf.keras.utils.get_file('Court', origin='https://squashvideo.site/share/court3.jpg' )
img = tf.keras.preprocessing.image.load_img(image_url, target_size=( 224, 224 ) )
os.remove(image_url) # Remove the cached file
img_array = tf.keras.preprocessing.image.img_to_array(img)
prediction_scores = model.predict(np.expand_dims(img_array, axis=0)/255)
score = tf.nn.softmax(prediction_scores[0])
print(
"This image most likely belongs to {} with a {:.2f} percent confidence."
.format(class_names[np.argmax(score)], 100 * np.max(score))
)
Here's what I have tried which gives the error below:
from PIL import Image
def classify_image(interpreter, image, top_k=1):
    """Run ``interpreter`` on ``image`` and return the best ``(index, score)`` pair.

    Args:
        interpreter: Object exposing ``get_input_details``, ``tensor``,
            ``invoke``, ``get_output_details`` and ``get_tensor``.
        image: Image data assigned into the first element of the input tensor.
        top_k: Partition position passed to ``np.argpartition``; only the
            first of the ``top_k`` candidates is returned.

    Returns:
        A tuple ``(i, output[i])`` for the first entry of the partitioned
        ordering.
    """
    tensor_index = interpreter.get_input_details()[0]['index']
    # NOTE: input_tensor is an array view obtained from interpreter.tensor()
    # and it is still referenced when invoke() runs. The RuntimeError quoted
    # with this snippet complains about exactly such a held reference; the
    # logic is intentionally left as posted so the traceback still matches.
    input_tensor = interpreter.tensor(tensor_index)()[0]
    input_tensor[:, :] = image
    interpreter.invoke()
    output_details = interpreter.get_output_details()[0]
    output = np.squeeze(interpreter.get_tensor(output_details['index']))
    # Rescale the raw output with the tensor's quantization parameters.
    scale, zero_point = output_details['quantization']
    output = scale * (output - zero_point)
    # Negate so that the largest scores are partitioned to the front.
    ordered = np.argpartition(-output, top_k)
    return [(i, output[i]) for i in ordered[:top_k]][0]
interpreter = Interpreter('/var/www/html/share/AI/court.tflite')
interpreter.allocate_tensors()
_, height, width, _ = interpreter.get_input_details()[0]['shape']
print("Image Shape (", width, ",", height, ")")
data_folder = "/var/www/html/share/"
image = Image.open(data_folder + "court1.jpg").convert('RGB').resize((width, height))
label_id, prob = classify_image(interpreter, image)
Running gives the error:
squash#court1:/var/www/html/share/AI $ python3 test.py
Image Shape ( 224 , 224 )
Traceback (most recent call last):
File "test.py", line 44, in <module>
label_id, prob = classify_image(interpreter, image)
File "test.py", line 22, in classify_image
interpreter.invoke()
File "/home/squash/.local/lib/python3.7/site-packages/tflite_runtime/interpreter.py", line 539, in invoke
self._ensure_safe()
File "/home/squash/.local/lib/python3.7/site-packages/tflite_runtime/interpreter.py", line 287, in _ensure_safe
data access.""")
RuntimeError: There is at least 1 reference to internal data
in the interpreter in the form of a numpy array or slice. Be sure to
only hold the function returned from tensor() if you are using raw
data access.

The error is in the way you are feeding data to the tflite Interpreter here:
input_tensor = interpreter.tensor(tensor_index)()[0]
input_tensor[:, :] = image
The Image.open function returns an Image object. You need to convert it into binary data before feeding it to a tensor. And you should use:
interpreter.set_tensor(0, image_data)
to set the data instead of above assignment.

Think I fixed it by doing this:
img = Image.open( image_url ).convert('RGB').resize((224, 224))
img_array = np.array ( img, dtype=np.float32 )
probs_lite = lite_model( np.expand_dims(img_array, axis=0)/255 )[0]
print ( probs_lite )
print (np.argmax(probs_lite))
score = tf.nn.softmax(probs_lite)
print(
"This image most likely belongs to {} with a {:.2f} percent confidence."
.format(class_names[np.argmax(score)], 100 * np.max(score))
)

TensorFlow-Keras generator: Turn off auto-sharding or switch auto_shard_policy to DATA

While training my model I ran into the issue described in the post "Tensorflow - Keras: Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset". My question now is: does the solution mentioned by @Graham501617 work with generators as well? Here is some dummy code for what I use so far:
class BatchGenerator(Sequence):
    """Dummy batch generator built on ``Sequence``.

    Placeholder code: ``...`` marks parts elided in the original post.
    ``__getitem__`` ignores the requested index and returns whatever
    ``get_one_batch(self.some_args)`` produces.
    """

    def __init__(self, some_args):
        ...

    def __len__(self):
        """Return the number of batches in the sequence."""
        num_batches_in_sequence = ...
        # The computed length has to be returned; without this the method
        # implicitly returns None.
        return num_batches_in_sequence

    def __getitem__(self, _):
        """Return one ``(data, labels)`` batch; the index argument is unused."""
        data, labels = get_one_batch(self.some_args)
        return data, labels
In the main script I do something like:
train_generator = BatchGenerator(some_args)
valid_generator = BatchGenerator(some_args)
cross_device_ops = tf.distribute.HierarchicalCopyAllReduce(num_packs=2)
strategy = tf.distribute.MirroredStrategy(cross_device_ops=cross_device_ops)
with strategy.scope():
model = some_model
model.compile(some_args)
history = model.fit(
x=train_generator,
validation_data=valid_generator,
...
)
I would probably have to modify the __getitem__ function somehow, wouldn't I?
I appreciate your support!

You'd have to wrap your generator into a single function...
The example below assumes your data is stored as NumPy arrays (.npy), that each file already holds exactly one mini-batch, that the files are named 0_x.npy, 1_x.npy, 2_x.npy, etc. (with matching 0_y.npy, 1_y.npy, ... label files), and that both data and label arrays are float64.
from pathlib import Path
import tensorflow as tf
import numpy as np
# Your new generator as a function rather than an object you need to instantiate
def getNextBatch(stop, data_dir):
    """Yield ``(x, y)`` batches loaded from ``.npy`` files, cycling forever.

    Batch ``i`` is read from ``<data_dir>/<i>_x.npy`` and
    ``<data_dir>/<i>_y.npy`` for ``i`` in ``0 .. stop - 1``; after the last
    batch the generator starts again from batch 0.

    Args:
        stop: Number of batches available in ``data_dir``.
        data_dir: Directory path as ASCII-encoded ``bytes`` (it is decoded
            before use).

    Yields:
        Tuples ``(x, y)`` of the arrays loaded for each batch.
    """
    # With no batches there is nothing to cycle over; end the generator
    # instead of spinning forever without ever yielding.
    if stop <= 0:
        return
    data_dir = Path(data_dir.decode('ascii'))
    while True:
        for i in range(stop):
            x = np.load(str(data_dir / "{}_x.npy".format(i)))
            y = np.load(str(data_dir / "{}_y.npy".format(i)))
            yield x, y
# Make a dataset given the directory and strategy
def makeDataset(generator_func, dir, strategy=None):
    """Build a ``tf.data`` dataset from a generator with auto-sharding turned off.

    Args:
        generator_func: Generator function called with ``(data_size, dir)``.
        dir: Directory containing the batch files.
        strategy: Optional distribution strategy; when given, the dataset is
            wrapped with ``strategy.experimental_distribute_dataset``.

    Returns:
        The dataset (distributed when ``strategy`` is not ``None``).
    """
    # Get amount of batches: half the number of regular files in the
    # directory. Path is used here because the snippet imports Path but
    # never imports os.
    data_size = sum(1 for entry in Path(dir).iterdir() if entry.is_file()) // 2
    # Make a dataset from the generator. MAKE SURE TO SPECIFY THE DATA TYPE!!!
    ds = tf.data.Dataset.from_generator(
        generator_func,
        args=[data_size, dir],
        output_types=(tf.float64, tf.float64))
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    ds = ds.with_options(options)
    # Optional: Make it a distributed dataset if you're using a strategy
    if strategy is not None:
        ds = strategy.experimental_distribute_dataset(ds)
    return ds
training_ds = makeDataset(getNextBatch, str(Path(data_dir + "/training")), None)
validation_ds = makeDataset(getNextBatch, str(Path(data_dir + "/validation")), None)
model.fit(training_ds,
epochs=epochs,
callbacks=callbacks,
validation_data=validation_ds)
You might need to pass the amount of steps per epoch in your fit() call, in which case you can use the generator you've already made.

How to save large float into TFRecord format? float_list/float32 seems to truncate the values

We write processed data into TFRecords and we are noticing data loss when it is read back from the TFRecords. A reproducible example is below. The strange thing is that it doesn't just drop the decimals but seems to round values up or down at random. Since TFRecord features only allow float32, int64 and string, we are not sure what other options to try.
We are writing these values
[20191221.1, 20191222.1, 20191223.1, 20191224.1, 20191225.1, 20191226.1, 20191227.1, 20191228.1, 20191229.1, 20191230.1]
But reading from tfrecords returns these values
tf.Tensor(
[20191222. 20191222. 20191224. 20191224. 20191226. 20191226. 20191228.
20191228. 20191230. 20191230.], shape=(10,), dtype=float32)
Reproducible Code
import tensorflow as tf
def write_date_tfrecord():
    """Write one example holding 10 dummy float values to ``data.tf_record``."""
    # writes 10 dummy values to replicate the issue
    data = [20191221.1 + x for x in range(0, 10)]
    print("Writing data - ", data)
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'data': tf.train.Feature(float_list=tf.train.FloatList(value=data))
            }
        ))
    # Use the writer as a context manager so the file is flushed and closed;
    # the original never closed it.
    with tf.io.TFRecordWriter("data.tf_record") as writer:
        writer.write(example.SerializeToString())
def parse_function(serialized_example):
    """Parse one serialized example and return its ``'data'`` feature.

    The feature is declared as a float32 ``FixedLenSequenceFeature`` with
    ``allow_missing=True``.
    """
    feature_spec = {
        'data': tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
    }
    parsed = tf.io.parse_single_example(
        serialized=serialized_example, features=feature_spec)
    return parsed['data']
def dataset_generator():
    """Return a dataset over ``data.tf_record`` with ``parse_function`` mapped on."""
    records = tf.data.TFRecordDataset("data.tf_record")
    return records.map(
        parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
if __name__ == '__main__':
    # Write the dummy record, then read it back and print what was stored.
    write_date_tfrecord()
    generator = dataset_generator()
    for data in generator:
        print(data)

This solved my issue. I had the same problem when writing audio files as floating-point matrices using FloatList, but when I stored the data in TFRecords using BytesList and decoded it when reading it back, the issue was resolved. Note that decoding with tf.float32 will not solve the issue; we need to decode with tf.float64.
def _bytes_feature2(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a one-element ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
def serialize_example(sound):
    """Serialize ``sound`` into a ``tf.train.Example`` byte string.

    The bytes from ``sound.tobytes()`` are stored under the ``'snd'`` key.
    """
    snd_feature = _bytes_feature2(sound.tobytes())
    features = tf.train.Features(feature={'snd': snd_feature})
    return tf.train.Example(features=features).SerializeToString()
def write_tfrecords(rf, snd):
    """Write each element of ``snd`` as one record to the TFRecord file ``rf``."""
    with tf.io.TFRecordWriter(rf) as writer:
        for idx in range(len(snd)):
            writer.write(serialize_example(snd[idx]))
# writing records
write_tfrecords(os.getcwd()+'\\tfrec\\'+'train.tfrecords',train)
# loading records
raw_dataset = tf.data.TFRecordDataset(os.getcwd()+'\\tfrec\\'+'train.tfrecords')
def parse_record(record):
    """Parse one serialized example into a dict with a scalar string under ``'snd'``."""
    feature_spec = {'snd': tf.io.FixedLenFeature([], tf.string)}
    return tf.io.parse_single_example(record, feature_spec)
def decode_record(record):
    """Decode the raw bytes stored under ``record['snd']`` as float64 values."""
    return tf.io.decode_raw(record['snd'], out_type=tf.float64)
for record in raw_dataset:
parsed_record = parse_record(record)
decoded_record = decode_record(parsed_record)
aud = decoded_record
print(aud.numpy()[0:10])
print(train[0][0:10])
output:
[ 417.69951205 -231.58708746 -10.05624011 -146.10342256 -66.60317323
-159.91550792 -3.93602823 29.94517981 106.22196629 65.53008959]
[ 417.69951205 -231.58708746 -10.05624011 -146.10342256 -66.60317323
-159.91550792 -3.93602823 29.94517981 106.22196629 65.53008959]

how to determine which JPEG picture is corrupt in Tensorflow decode_jpeg

When I decode JPEG pictures, I get the message "corrupt jpeg data premature end of data segment". How can I determine which picture is corrupt?
This is my code:
import os
import tensorflow as tf
directory = 'D:\\tfrecord\\read'
directories = []
class_names = []
photo_filenames = []
# Collect the full path of every entry in the directory.
for filename in os.listdir(directory):
    photo_filenames.append(os.path.join(directory, filename))

with tf.Session() as sess:
    print("session")
    sess.run(tf.global_variables_initializer())
    # Decode each file in turn; print the file name and the error whenever
    # running the decode op raises.
    for filename in photo_filenames:
        filecontents = tf.read_file(filename)
        image = tf.image.decode_jpeg(filecontents, channels = 3)
        try:
            sess.run(image)
        except Exception as e:
            print(filename)
            print (e)

insert
image = tf.Print(image,[filename])
after
image = tf.image.decode_jpeg(filecontents, channels = 3)
the filename will show right after 'corrupt jpeg data...'

DataFlow appears to be stuck. - on reading images from Google Storage

I have a DataFlow job, which first reads in 2 text files, located in Google Cloud Storage. The text files contain the paths to images also located in Google Cloud Storage.
After some inspection, I can confirm that reading the text files is successful, but the DataFlow job is stuck at reading the images. The same code runs perfectly locally, which makes me think that maybe the image paths are incorrect, but they are not.
Here's my job ID: 2018-01-10_12_16_56-8294573519126715750
Any advice would be appreciated. Also any pointers on how to solve / debug this problem would be really useful as I don't even know where to start.
Thanks
Pipeline Definition
def configure_pipeline(pipeline, args):
    """Attach the read and image-extraction steps to ``pipeline``.

    Reads data points from ``args.input_path`` and image paths from
    ``args.input_imgs``, then applies ``ExtractDataDoFn`` to every data
    point with the image paths passed in as an iterable side input.
    """
    data_point_source = beam.io.ReadFromText(
        args.input_path, strip_trailing_newlines=True)
    image_path_source = beam.io.ReadFromText(
        args.input_imgs, strip_trailing_newlines=True)

    img_paths = pipeline | 'Read image paths' >> image_path_source
    train_points = pipeline | 'Read data point' >> data_point_source

    extract = beam.ParDo(ExtractDataDoFn(), beam.pvalue.AsIter(img_paths))
    _ = train_points | "Read image" >> extract
Read Images - Most of the code is simply parsing the image paths from the text file and some data to index the image.
class ExtractDataDoFn(beam.DoFn):
    """Reads the two images named by an input line and yields their crops.

    Each input line has the form ``'<pivot>':'<example>':<label>``. ``<pivot>``
    and ``<example>`` are comma-separated records whose field 0 is a frame
    number and whose fields 2-5 are ``x, y, w, h`` of a bounding box.
    """

    def start_bundle(self, context=None):
        # Each frame has its own path to its image
        self.frame_number_to_name = {}

    def process(self, element, img_paths):
        """Yield ``(pivot_feature, example_feature, label)`` for one line.

        Args:
            element: The input line, or an object exposing it as ``.element``.
            img_paths: Iterable of image paths, used to fill the
                frame-number-to-path map the first time it is empty.

        Yields:
            A tuple of two float64 arrays (the bounding-box crops resized
            to 224x224) and the integer label. Nothing is yielded when
            either image cannot be read; the error is logged instead.
        """
        # Accept either a raw line or a wrapper carrying it as `.element`.
        # The original unwrapped into `line` but then kept using `element`.
        line = getattr(element, 'element', element)

        if not self.frame_number_to_name:
            for path in img_paths:
                if len(path) > 4:
                    # NOTE(review): presumably a 6-digit frame number followed
                    # by a 4-character extension such as ".jpg" - confirm.
                    frame_number = int(path[-10:-4])
                    self.frame_number_to_name[frame_number] = path

        line_tokens = line.split(':')
        pivot_example = line_tokens[0].strip('\'')
        example = line_tokens[1].strip('\'')
        label = int(line_tokens[2])

        # Get image paths
        pivot_path = self.frame_number_to_name[int(pivot_example.split(',')[0])]
        example_path = self.frame_number_to_name[int(example.split(',')[0])]

        # Read images; skip this element if either one fails to load.
        pivot_img = self._read_rgb_image(pivot_path, pivot_example)
        if pivot_img is None:
            return
        example_img = self._read_rgb_image(example_path, example)
        if example_img is None:
            return

        pivot_feature = self._crop_and_resize(pivot_example, np.array(pivot_img))
        example_feature = self._crop_and_resize(example, np.array(example_img))
        yield pivot_feature, example_feature, label

    @staticmethod
    def _open_file_read_binary(uri):
        """Open ``uri`` in binary mode, falling back to text mode on InvalidArgumentError."""
        try:
            return file_io.FileIO(uri, mode='rb')
        except errors.InvalidArgumentError:
            return file_io.FileIO(uri, mode='r')

    def _read_rgb_image(self, path, description):
        """Return the image at ``path`` converted to RGB, or ``None`` after logging a failure."""
        try:
            with self._open_file_read_binary(path) as f:
                image_bytes = f.read()
            return Image.open(io.BytesIO(image_bytes)).convert('RGB')
        except Exception as e:  # pylint: disable=broad-except
            logging.exception('Error processing image %s: %s', description, str(e))
            return None

    @staticmethod
    def _crop_and_resize(line, img):
        """Crop the bounding box described by ``line`` from ``img`` and resize it to 224x224 float64."""
        fields = line.split(',')
        x, y = int(fields[2]), int(fields[3])
        w, h = int(fields[4]), int(fields[5])
        # Get raw content of bounding box
        bb = img[y:y + h, x:x + w, :]
        # Resize, then convert back to numpy
        resized = Image.fromarray(bb).resize((224, 224))
        return np.array(resized, np.float64)

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to prepare my own data for tensorflow? - tensorflow

I installed TensorFlow on Ubuntu 14.04. I completed the MNIST For ML Beginners tutorial and understood it. Now I am trying to use my own data. My training data is T[1000][10]. Labels are L[2], 1 or 0. How can I access my data the way the tutorial accesses mnist.train.images?

Related

How to use TensorFlow lite on a raspberry pi 4 without keras?

TensorFlow-Keras generator: Turn off auto-sharding or switch auto_shard_policy to DATA

How to save large float into TFRecord format? float_list/float32 seems to truncate the values

how to determine which JPEG picture is corrupt in Tensorflow decode_jpeg

DataFlow appears to be stuck. - on reading images from Google Storage

Categories

Resources