Cannot access data when using tf.data with CSV - tensorflow

def get_dataset_from_csv(csv, bs=BATCH_SIZE, val=False, debug=True):
    # Initialize dataset from the CSV rows
    dataset = tf.data.Dataset.from_tensor_slices(dict(csv))
    # Decode mapping
    dataset = dataset.map(decode_image, num_parallel_calls=AUTO)

def decode_image(rows):
    image = cv2.imread(rows['file_path'])  # cannot access rows['file_path']
    target = rows['target']
    return {"image": image}, target
When I run the code above, the following error comes out:
Can't convert object of type 'Tensor' to 'str' for 'filename'
The type of rows['file_path'] is <class 'tensorflow.python.framework.ops.Tensor'>, so I can't get its value during dataset.map. How can I get the value of rows['file_path'] inside the map function?
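A minimal sketch of the usual fix, assuming csv is a pandas DataFrame with 'file_path' and 'target' columns as in the code above: inside map the rows are symbolic tensors, so the file has to be read with TensorFlow ops (or wrapped in tf.py_function) rather than with cv2.imread:
import tensorflow as tf

def decode_image(rows):
    # tf.io.read_file / tf.io.decode_image accept a tf.string tensor directly
    image = tf.io.decode_image(tf.io.read_file(rows['file_path']), channels=3)
    return {"image": image}, rows['target']

def get_dataset_from_csv(csv, bs=BATCH_SIZE, val=False, debug=True):
    dataset = tf.data.Dataset.from_tensor_slices(dict(csv))
    dataset = dataset.map(decode_image, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(bs)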

Tensorflow 2 custom dataset Sequence

I have a dataset in a Python dictionary. The structure is as follows:
data.data['0']['input'],data.data['0']['target'],data.data['0']['length']
Both input and target are arrays of size (n,) and length is an int.
I have created a class derived from tf.keras.utils.Sequence and defined __getitem__ like this:
def __getitem__(self, idx):
    idx = str(idx)
    return {
        'input': np.asarray(self.data[idx]['input']),
        'target': np.asarray(self.data[idx]['target']),
        'length': self.data[idx]['length']
    }
How can I iterate over such a dataset using tf.data.Dataset? I get this error if I try to use from_tensor_slices:
ValueError: Attempt to convert a value with an unsupported type (<class 'dict'>) to a Tensor.
I think you should convert the dictionary to a tensor as proposed here: convert a dictionary to a tensor,
or change the dictionary to a text file or to TFRecords. Hope this helps!
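Alternatively, a minimal sketch using tf.data.Dataset.from_generator, which sidesteps the nested-dict limitation of from_tensor_slices; it assumes data.data maps string indices to dicts with 'input', 'target' (1-D arrays) and 'length' (int) as described above, and that output_signature is available (TF 2.4+):
import numpy as np
import tensorflow as tf

def gen():
    for key in sorted(data.data, key=int):
        row = data.data[key]
        yield (np.asarray(row['input'], dtype=np.float32),
               np.asarray(row['target'], dtype=np.float32),
               row['length'])

dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.float32),  # input
        tf.TensorSpec(shape=(None,), dtype=tf.float32),  # target
        tf.TensorSpec(shape=(), dtype=tf.int32),         # length
    ))

for inp, tgt, length in dataset.take(1):
    print(inp.shape, tgt.shape, length.numpy())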

!gcloud dataproc jobs submit pyspark - ERROR AttributeError: 'str' object has no attribute 'batch'

How can I pass a dataset-type object as input to a Dataproc job?
My code is below:
%%writefile spark_job.py
import sys
import time
import pyspark
import argparse
import pickle

#def time_configs_rdd(test_set, batch_sizes, batch_numbers, repetitions):
def time_configs_rdd(argv):
    print(argv)
    parser = argparse.ArgumentParser()  # get a parser object
    parser.add_argument('--out_bucket', metavar='out_bucket', required=True,
                        help='The bucket URL for the result.')  # add a required argument
    parser.add_argument('--out_file', metavar='out_file', required=True,
                        help='The filename for the result.')  # add a required argument
    parser.add_argument('--batch_size', metavar='batch_size', required=True,
                        help='The batch sizes to time.')  # add a required argument
    parser.add_argument('--batch_number', metavar='batch_number', required=True,
                        help='The numbers of batches to take.')  # add a required argument
    parser.add_argument('--repetitions', metavar='repetitions', required=True,
                        help='The number of repetitions per configuration.')  # add a required argument
    parser.add_argument('--test_set', metavar='test_set', required=True,
                        help='The dataset to time.')  # add a required argument
    args = parser.parse_args(argv)  # read the values
    # the value provided with --out_bucket is now in args.out_bucket
    time_configs_results = []
    for s in args.batch_size:
        for n in args.batch_number:
            dataset = args.test_set.batch(s).take(n)  # <-- fails here: args.test_set is a string
            for r in args.repetitions:
                tt0 = time.time()
                for i in enumerate(dataset):
                    totaltime = str(time.time() - tt0)
                    batchtime = totaltime
                    #imgpersec = s*n/totaltime
                time_configs_results.append((s, n, r, float(batchtime)))
                #time_configs_results.append((s, n, r, batchtime, imgpersec))
    time_configs_results_rdd = sc.parallelize(time_configs_results)  # create an RDD with all results for each parameter
    time_configs_results_rdd_avg = time_configs_results_rdd.map(lambda x: (x, x[0]*x[1]/x[3]))  # RDD with the average reading speeds (RDD.map)
    #mapping = time_configs_results_rdd_avg.collect()
    #print(mapping)
    return time_configs_results_rdd_avg

if 'google.colab' not in sys.modules:  # Don't use system arguments when run in Colab
    time_configs_rdd(sys.argv[1:])
elif __name__ == "__main__":  # but define them manually
    time_configs_rdd(["--out_bucket", BUCKET, "--out_file", "time_configs_rdd_out.pkl",
                      "--batch_size", batch_size, "--batch_number", batch_number,
                      "--test_set", test_set])
and the code to execute it:
FILENAME = 'file_RDD_OUT.pkl'
batch_size = [1]
batch_number = [1]
repetitions = [1]
#test_set = 1 will give string error
test_set = dataset2  # file <ParallelMapDataset shapes: ((192, 192, None), ()), types: (tf.float32, tf.string)> cannot be inserted
!gcloud dataproc jobs submit pyspark --cluster $CLUSTER --region $REGION \
    ./spark_job.py \
    -- --out_bucket $BUCKET --out_file $FILENAME --batch_size $batch_size --batch_number $batch_number --repetitions $repetitions --test_set $test_set
Unfortunately it keeps failing with the error
AttributeError: 'str' object has no attribute 'batch'
ERROR: (gcloud.dataproc.jobs.submit.pyspark) Job [c2048c422f334b08a628af5a1aa492eb] failed with error:
Job failed with message [AttributeError: 'str' object has no attribute 'batch'].
The problem is with test_set: how should I convert dataset2 (a ParallelMapDataset) so it can be read by the job?
So you are trying to parse a string from the command-line argument into a ParallelMapDataset. You want to use the type param in your add_argument calls.
From https://docs.python.org/3/library/argparse.html#type and I quote:
By default, ArgumentParser objects read command-line arguments in as simple strings. However, quite often the command-line string should instead be interpreted as another type, like a float or int. The type keyword argument of add_argument() allows any necessary type-checking and type conversions to be performed.
and
type= can take any callable that takes a single string argument and returns the converted value
So you probably want something like:
def parse_parallel_map_dataset(string):
    # your logic to parse the string into your desired data structure
    ...

parser.add_argument('--test_set', metavar='test_set', required=True,
                    type=parse_parallel_map_dataset)
Or better yet, read your test_set from a file and pass the file name as an argument.
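A minimal sketch of that last suggestion, under the assumption that TensorFlow is installed on the Dataproc workers and that --test_set is passed as a file pattern (for example a GCS glob such as gs://my-bucket/images/*.jpg) rather than as the dataset object itself; the dataset is then rebuilt inside the job, so .batch() is called on a real Dataset:
import argparse
import tensorflow as tf

parser = argparse.ArgumentParser()
parser.add_argument('--test_set', required=True,
                    help='File pattern for the test images, e.g. a GCS glob.')
parser.add_argument('--batch_size', type=int, default=1)
args = parser.parse_args()

def load_image(path):
    # decode and resize one image; the shape matches the (192, 192) example above
    img = tf.io.decode_jpeg(tf.io.read_file(path), channels=3)
    return tf.image.resize(img, [192, 192]), path

files = tf.data.Dataset.list_files(args.test_set)
test_set = files.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
dataset = test_set.batch(args.batch_size)  # works: test_set is a tf.data.Dataset here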

How to map a dataset of filenames to a dataset of file contents

For example, I have a TensorFlow dataset where each element is a tf.string Tensor representing the filename of an image file. Now I want to map this filename dataset to a dataset of image content Tensors.
I wrote code like this, but it doesn't work because the map function can't execute eagerly (it raises an error saying the Tensor type has no attribute named numpy).
def parseline(line):
    filename = line.numpy()
    image = some_library.open_image(filename).to_numpy()
    return image

dataset = dataset.map(parseline)
Basically, it can be done the following way:
path = 'path_to_images'
files = [os.path.join(path, i) for i in os.listdir(path)]  # If you need to create a list of filenames, because tf functions require tensors

def parse_image(filename):
    file = tf.io.read_file(filename)  # this will work only with filename as a tensor
    image = tf.image.decode_image(file)
    return image

dataset = tf.data.Dataset.from_tensor_slices(files)
dataset = dataset.map(parse_image).batch(1)
If you're in eager mode, just iterate over the dataset:
for i in dataset:
    print(i)
If not, you'll need an iterator:
iterator = dataset.make_one_shot_iterator()
with tf.Session() as sess:
    sess.run(iterator.get_next())

preprocess images with tf.data.experimental.make_csv_dataset or with read_csv option

I am adding this summary of my issue to make it easier to understand:
I want to do exactly what is done in the following TensorFlow example:
https://www.tensorflow.org/guide/datasets
# Reads an image from a file, decodes it into a dense tensor, and resizes it
# to a fixed shape.
def _parse_function(filename, label):
    image_string = tf.read_file(filename)
    image_decoded = tf.image.decode_jpeg(image_string)
    image_resized = tf.image.resize_images(image_decoded, [28, 28])
    return image_resized, label

# A vector of filenames.
filenames = tf.constant(["/var/data/image1.jpg", "/var/data/image2.jpg", ...])

# `labels[i]` is the label for the image in `filenames[i]`.
labels = tf.constant([0, 37, ...])

dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(_parse_function)
The only difference is that I read the data from a CSV that has many more features, and then I call the map method:
dataset = tf.data.experimental.make_csv_dataset(file_pattern=CSV_PATH_TRAIN,
                                                batch_size=2,
                                                header=True,
                                                label_name='label').map(_parse_function)
What does my _parse_function need to look like? How do I access the image path feature, update it to an image representation, and return the modified numeric image matrix without changing any of the other features?
thanks,
eilalan
================== Here are my code attempts: ==================
My code reads a CSV with feature columns and a label. One of the features is an image path; the others are strings.
The image path needs to be processed into a numeric image matrix.
I have tried the following options. In both, tf.read_file fails with an input dimension error.
My question is how to pass one image at a time into the map methods.
def read_image_png_option_1(image_path, depth=3, scale=False):
    """Reads the image from image_path (tf.string tensor) [jpg image].
    Cast the result to float32 and if scale=True scale it in [-1,1]
    using scale_image. Otherwise the values are in [0,1].
    Return:
        the decoded jpeg image, cast to float32
    """
    image = tf.image.convert_image_dtype(
        tf.image.decode_png(tf.read_file(image_path), channels=depth),
        dtype=tf.float32)
    if scale:
        image = scale_image(image)
    return image
def read_image_png_option_2(features, depth=3, scale=False):
    """Reads the image from features['image'] (tf.string tensor) [jpg image].
    Cast the result to float32 and if scale=True scale it in [-1,1]
    using scale_image. Otherwise the values are in [0,1].
    Return:
        the features dict with the decoded jpeg image, cast to float32
    """
    image = tf.image.convert_image_dtype(
        tf.image.decode_png(tf.read_file(features['image']), channels=depth),
        dtype=tf.float32)
    if scale:
        image = scale_image(image)
    features['image'] = image
    return features
def make_input_fn(fileName, batch_size=8, perform_shuffle=True):
    """An input function for training"""
    def _input_fn():
        def decode_csv(line):
            print('line is ', line)
            filename_col, label_col, gender_col, ethinicity = tf.decode_csv(
                line,
                [[""]] * amount_of_columns_csv,
                field_delim=",",
                na_value='NA',
                select_cols=None)
            image_col = read_image_png_option_1(filename_col)
            d = dict(zip(['image', 'label', 'gender', 'ethinicity'],
                         [image_col, label_col, gender_col, ethinicity])), label
            return d

        ## OPTION 1:
        # filenames could be more than one
        # dataset = tf.data.TextLineDataset(filenames=fileName).skip(1).batch(batch_size).map(decode_csv)

        ## OPTION 2:
        dataset = tf.data.experimental.make_csv_dataset(file_pattern=CSV_PATH_TRAIN,
                                                        batch_size=2,
                                                        header=True,
                                                        label_name='label').map(read_image_png_option_2)
        # select_columns=[0,1]) #[tf.string,tf.string,tf.string,tf.string])
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)
        return dataset
    return _input_fn()

train_input_fn = lambda: make_input_fn(CSV_PATH_TRAIN)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=50)
eval_input_fn = lambda: make_input_fn(CSV_PATH_VAL)
eval_spec = tf.estimator.EvalSpec(eval_input_fn)

feature_columns = [tf.feature_column.numeric_column("image", shape=(224, 224)),  # here I need a python method to transform
                   tf.feature_column.categorical_column_with_vocabulary_list("gender", ["ww", "ee"]),
                   tf.feature_column.categorical_column_with_vocabulary_list("ethinicity", ["xx", "yy"])]

estimator = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                       hidden_units=[1024, 512, 256],
                                       warm_start_from=ws)
tf.estimator.train_and_evaluate(estimator, train_spec=train_spec, eval_spec=eval_spec)
Error for option 2:
ValueError: Shape must be rank 0 but is rank 1 for 'ReadFile' (op: 'ReadFile') with input shapes: [2].
Error for option 1:
ValueError: Shape must be rank 0 but is rank 1 for 'ReadFile' (op: 'ReadFile') with input shapes: [?].
Any help is appreciated.
Thanks
First you need to read the CSV file into a dataset.
Then for each row in your CSV you can call your parse function.
def getInput(fileList):
    # returns a dataset containing a list of filenames
    files = tf.data.Dataset.from_tensor_slices(fileList)

    # Returns a dataset containing rows taken from all the files in the file list.
    # The dataset is filled dynamically and not all entries are read at once
    dataset = files.interleave(tf.data.TextLineDataset)

    # call the parse function for each row
    # the returned dataset will contain whatever the parse function returns for each row
    # we want the image path to be converted to a decoded image in the parse function
    dataset = dataset.map(_parse_function, num_parallel_calls=8)

    # return an iterator for the dataset which will be used to get elements.
    return dataset.make_one_shot_iterator().get_next()
The parse function will be passed only one parameter that will be a single row from the CSV file. You need to decode the CSV and do further processing on each value.
Let's say you have 3 columns in your CSV each being a string.
def _parse_function(value):
    columns_default = [[""], [""], [""]]
    # this will be a tensor of the columns in the row
    columns = tf.decode_csv(value, record_defaults=columns_default,
                            field_delim=',')
    col_names = ["label", "imagepath", "c3"]
    features = dict(zip(col_names, columns))
    for f, tensor in features.items():
        # process imagepath to a decoded image
        if f == "imagepath":
            image_string = tf.read_file(tensor)
            image_decoded = tf.image.decode_jpeg(image_string)
            image_resized = tf.image.resize_images(image_decoded, [28, 28])
            features[f] = image_resized
    labels = tf.equal(features.pop('label'), "1")
    labels = tf.expand_dims(labels, 0)
    return features, labels
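A hypothetical usage of the two helpers above as an Estimator input function (the file name "train.csv" and the estimator object are placeholders, not part of the answer): getInput already returns the (features, labels) pair from the one-shot iterator, which is the format Estimator.train expects.
def input_fn():
    # getInput builds the dataset and returns (features, labels) tensors
    return getInput(["train.csv"])

estimator.train(input_fn=input_fn, max_steps=50)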
Edit:
Explanation for comment:
A Dataset object simply contains a list of elements. The elements can be tensors, tuples of tensors, etc. A Tensor object can contain anything: it could represent a single feature, a single record, or a batch of records. Further, the dataset API provides handy methods to manipulate the elements within.
If you are using the dataset with another API like Estimator, it expects the dataset elements to be in a specific format, which is what we need to return from our input function. See, for example:
https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator#train
I have edited my code block above to describe what the dataset object will contain at each step.
From what I understand, you have an image path as one of the fields in your CSV, and you want to convert that path into an actual decoded image which you will use as one of the features.
Since the image is going to be just one of the features, you should not try to create a dataset using the image files alone. The Dataset object will include all your features at once.
So doing this would be incorrect:
files = tf.data.Dataset.from_tensor_slices(ds['imagepath'])
dataset = files.interleave(tf.data.TextLineDataset)
If you are using the make_csv_dataset() function to read your CSV, it will convert each row of your CSV into one record, where one record contains all the features, the same as the columns of the CSV.
So each element in the returned dataset will contain all your features.
Here your image path will be one of the features; now you want to transform that image path into a decoded image.
I suppose you can do it by applying a parse function to the elements of the dataset using the map() function, but it will be slightly tricky as all your features are already packed together; a sketch follows below.
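A minimal sketch of such a parse function for the original question, assuming the CSV columns are 'image' (a file path), 'gender', 'ethinicity' and the label column 'label' as in the code above, and assuming TF 2.x where Dataset.unbatch() is available (in older 1.x releases, dataset.apply(tf.data.experimental.unbatch()) plays the same role). The rank error comes from make_csv_dataset returning batched rows, so one option is to unbatch first, decode one path at a time, then re-batch:
import tensorflow as tf

def decode_row(features, label):
    # features['image'] is a scalar tf.string path after unbatch()
    image = tf.io.decode_png(tf.io.read_file(features['image']), channels=3)
    features['image'] = tf.image.convert_image_dtype(image, tf.float32)
    return features, label

dataset = tf.data.experimental.make_csv_dataset(file_pattern=CSV_PATH_TRAIN,
                                                batch_size=2,
                                                header=True,
                                                label_name='label')
dataset = dataset.unbatch()  # scalar rows, so tf.io.read_file gets a rank-0 string
dataset = dataset.map(decode_row, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.batch(2)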

How to perform string find and replace on Tensorflow String Tensor?

I am currently using the TensorFlow dataset API to perform some augmentations to images at a specified path. The filename itself contains information that states whether to augment the file or not. So what I want to do is read the files from the dataset and, for each file, search the filename; if I find a specific substring, set a bool flag and replace the substring with "".
The error I get is:
AttributeError: 'Tensor' object has no attribute 'find'
I can't perform a "find" on the tensor with dtype string entries because find is not a part of the Tensor, so I am trying to figure out how I can go about performing the above action. I have shared some code below that I think demonstrates what I am trying to do. Performance is important, so I would prefer to do this the correct way if anyone sees that I am going about doing this via the Dataset API incorrectly.
def preproc_img(filenames):
    def parse_fn(filename):
        augment_inst = False
        if cfg.SPLIT_INTO_INST:
            # *****************************************************
            # *** THIS IS WHERE THE LOGIC IS CURRENTLY BREAKING ***
            # *****************************************************
            if filename.find('_data_augmentation') != -1:
                augment_inst = True
                filename = filename.replace('_data_augmentation', '')
        image_string = tf.read_file(filename)
        img = tf.image.decode_image(image_string, channels=3)
        return dict(zip([filename], [img]))

    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.map(parse_fn)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def perform_train():
    if __name__ == '__main__':
        filenames = helper.get_image_paths()
        next_batch = preproc_img(filenames)
        with tf.Session() as sess:
            with sess.graph.as_default():
                sess.run(tf.local_variables_initializer())
                sess.run(tf.global_variables_initializer())
                dat = sess.run(next_batch)
                # I would now go about calling any of my tf op code below
You can use tf.regex_replace for replacing text in a tf.string tensor.
filename = tf.regex_replace(filename, "_data_augmentation", "")
For TF 2.0
filename = tf.strings.regex_replace(filename, "_data_augmentation", "")
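The bool-flag part of the question can be handled the same way; a minimal sketch (TF 2.x, assuming filename is a scalar tf.string tensor as inside the map function above):
import tensorflow as tf

def parse_fn(filename):
    # detect the marker with a regex match, then strip it before reading the file
    augment_inst = tf.strings.regex_full_match(filename, ".*_data_augmentation.*")
    filename = tf.strings.regex_replace(filename, "_data_augmentation", "")
    img = tf.image.decode_image(tf.io.read_file(filename), channels=3)
    return {"filename": filename, "image": img, "augment": augment_inst}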