Why does ImportExampleGen read TFRecords as SparseTensor instead of Tensor? - tensorflow

I'm converting a CSV file into a TFRecords file like this:
File: ./dataset/csv/file.csv
feature_1, feture_2, output
1, 1, 1
2, 2, 2
3, 3, 3
import tensorflow as tf
import csv
import os
print(tf.__version__)
def create_csv_iterator(csv_file_path, skip_header):
    """Yield the rows of a CSV file one at a time.

    Args:
        csv_file_path: Path of the CSV file, opened through
            ``tf.io.gfile.GFile``.
        skip_header: When truthy, the first row is discarded.

    Yields:
        Each remaining row as the list of strings produced by ``csv.reader``.
    """
    with tf.io.gfile.GFile(csv_file_path) as csv_file:
        reader = csv.reader(csv_file)
        if skip_header:
            # The default keeps an empty file from raising StopIteration
            # inside the generator, which Python 3.7+ turns into RuntimeError.
            next(reader, None)
        yield from reader
def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in an int64_list Feature."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def create_example(row):
    """Build a tensorflow.Example protocol buffer from one CSV row.

    The first three columns of ``row`` are converted to ints and stored
    under the names "feature_1", "feture_2" and "output", in that order.
    """
    column_names = ["feature_1", "feture_2", "output"]
    feature_map = {}
    for position, name in enumerate(column_names):
        feature_map[name] = _int64_feature(int(row[position]))
    feature_set = tf.train.Features(feature=feature_map)
    return tf.train.Example(features=feature_set)
def create_tfrecords_file(input_csv_file):
    """Create a TFRecords file for the given input CSV file.

    The output path is the input path with every "csv" substring replaced
    by "tfrecords", so "./dataset/csv/file.csv" is written to
    "./dataset/tfrecords/file.tfrecords".

    Args:
        input_csv_file: Path of the CSV file to convert. Its header row is
            skipped and empty rows are ignored.
    """
    output_tfrecord_file = input_csv_file.replace("csv", "tfrecords")
    print("Creating TFRecords file at", output_tfrecord_file, "...")
    # The context manager closes the writer even when a row fails to convert,
    # so the output file is never left open.
    with tf.io.TFRecordWriter(output_tfrecord_file) as writer:
        for row in create_csv_iterator(input_csv_file, skip_header=True):
            if not row:
                continue
            writer.write(create_example(row).SerializeToString())
    print("Finish Writing", output_tfrecord_file)


create_tfrecords_file("./dataset/csv/file.csv")
Then I'll read the generated TFRecords files using ImportExampleGen class:
import os
import absl
import tensorflow_model_analysis as tfma
tf.get_logger().propagate = False
from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip
# Context in which each component below is run one at a time.
context = InteractiveContext()
# Import the TFRecords files found under ./dataset/tfrecords.
example_gen = tfx.components.ImportExampleGen(input_base="./dataset/tfrecords")
context.run(example_gen, enable_cache=True)
# Produce statistics from the imported examples.
statistics_gen = tfx.components.StatisticsGen(
examples=example_gen.outputs['examples'])
context.run(statistics_gen, enable_cache=True)
# Produce a schema from those statistics; feature-shape inference is switched off here.
schema_gen = tfx.components.SchemaGen(
statistics=statistics_gen.outputs['statistics'],
infer_feature_shape=False)
context.run(schema_gen, enable_cache=True)
File: ./transform.py
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
        inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
        Map from string feature key to transformed feature operations.
        Here the features are passed through unchanged.
    """
    # Debug aid: shows the type each feature arrives as.
    print(inputs)
    return inputs
# Run the Transform component on the imported examples, using the generated
# schema and the module file ./transform.py (where preprocessing_fn lives).
transform = tfx.components.Transform(
examples=example_gen.outputs['examples'],
schema=schema_gen.outputs['schema'],
module_file=os.path.abspath("./transform.py"))
context.run(transform, enable_cache=True)
Printing inputs inside preprocessing_fn shows that the features are SparseTensor objects. My question is: why? As far as I can tell, my dataset's samples are dense, so they should be Tensor objects instead. Am I doing something wrong?

For anyone else who might be struggling with the same issue, I found the culprit. It's the SchemaGen class. This is how I was instantiating its object:
schema_gen = tfx.components.SchemaGen(
statistics=statistics_gen.outputs['statistics'],
infer_feature_shape=False)
I don't know what's the use case for asking SchemaGen class not to infer the shape of the features but the tutorial I was following had it set to False and I had just copied and pasted the same thing. Comparing with some other tutorials, I realized that it could be the reason why I was getting SparseTensor.
So, if you let SchemaGen infer the shape of your features or you load a hand crafted schema in which you've set the shapes yourself, you'll be getting a Tensor in your preprocessing_fn. But if the shapes are not set, the features will be instances of SparseTensor.
For the sake of completeness, this is the fixed snippet:
# With shape inference enabled the schema carries the feature shapes, so
# preprocessing_fn receives Tensor inputs instead of SparseTensor.
schema_gen = tfx.components.SchemaGen(
statistics=statistics_gen.outputs['statistics'],
infer_feature_shape=True)

Related

SpaCy 3 -- ValueError: [E973] Unexpected type for NER data

I've been stressing out on this problem for so long and I can't seem to find a solution.
I want to train a NER model to recognise animal and species names.
I created a mock training set to test it out. However, I keep getting a ValueError: [E973] Unexpected type for NER data
I have tried other solutions on other posts on StackOverflow, including:
Double checking if my formatting and type of the training set was right
Using spacy.load('en_core_web_sm') instead of spacy.blank('en')
Installing spacy-lookups-data
All of these result in the same error.
import os
import spacy
from spacy.lang.en import English
from spacy.training.example import Example
import random
# Train a spaCy NER pipeline on `data`, a list of (text, {"entities": [...]})
# pairs, for `iterations` passes, and return the pipeline.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the statements below cannot be confirmed from the text alone.
def train_spacy(data, iterations = 30):
TRAIN_DATA = data
nlp = spacy.blank("en") #start with a blank model
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last = True)
# Register every label found in the training data (third item of each entity tuple).
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
optimizer = nlp.begin_training()
for itn in range(iterations):
print ("Starting iterations "+str(itn))
# Shuffles the caller's list in place, since TRAIN_DATA is the same object as `data`.
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
print(isinstance(annotations["entities"], (list,tuple))) #this prints True
# NOTE(review): `annotations` is already a {"entities": [...]} dict, so a dict
# is passed where the entity list is expected; the E973 traceback points at this line.
example = Example.from_dict(doc, {"entities":annotations})
nlp.update(
[example],
drop = 0.2,
sgd = optimizer,
losses = losses
)
print(losses)
return (nlp)
if __name__ == "__main__":
#mock training set
TRAIN_DATA=[('Dog is an animal',{'entities':[(0,3,'ANIMAL')]}),
('Cat is on the table',{'entities':[(0,3,'ANIMAL')]}),
('Rats are pets',{'entities':[(0,4,'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
The error message
File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 49, in <module>
nlp = train_spacy(TRAIN_DATA)
File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 35, in train_spacy
example = Example.from_dict(doc, {"entities":annotations})
File "spacy\training\example.pyx", line 118, in spacy.training.example.Example.from_dict
File "spacy\training\example.pyx", line 24, in spacy.training.example.annotations_to_doc
File "spacy\training\example.pyx", line 388, in spacy.training.example._add_entities_to_doc
ValueError: [E973] Unexpected type for NER data
I had the same problem when I migrated a code that I had from a 2.x version of spacy to a 3.x version since several things changed.
Also, in your case it looks like you have a mix of spaCy 2.x and 3.x syntax. The following version of your code, with a few changes, works for me using spaCy 3.2.1:
import random
import spacy
from spacy.training import Example
# Train the NER component of a loaded spaCy pipeline on `data`, a list of
# (text, {"entities": [...]}) pairs, for `iterations` passes; returns the pipeline.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the statements below cannot be confirmed from the text alone.
def train_spacy(data, iterations=30):
TRAIN_DATA = data
# nlp = spacy.blank("en") # start with a blank model
nlp = spacy.load("en_core_web_lg")
# Reuse the pipeline's existing "ner" component when it has one.
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last=True)
else:
ner = nlp.get_pipe("ner")
# Register every label found in the training data (third item of each entity tuple).
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
# with nlp.disable_pipes(*other_pipes):
losses = None
optimizer = nlp.create_optimizer()
for itn in range(iterations):
print("Starting iterations " + str(itn))
# Shuffles the caller's list in place, since TRAIN_DATA is the same object as `data`.
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
print(isinstance(annotations["entities"], (list, tuple))) # this prints True
# `annotations` already has the {"entities": [...]} form, so it is passed as is.
example = Example.from_dict(doc, annotations)
# NOTE(review): the result replaces `losses` on every call, so the value printed
# afterwards presumably reflects only the most recent update; confirm.
losses = nlp.update(
[example],
drop=0.2,
sgd=optimizer
)
print(losses)
return nlp
if __name__ == "__main__":
# mock training set
TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
Notice the following changes:
I changed your import of the Example class to from spacy.training import Example. I think you were importing the wrong class.
I'm using en_core_web_lg but with a blank model it should work too!
I commented out the code that disables the other pipeline components, because in spaCy 3.x the pipeline is more complex and I think you can't disable the whole pipeline for the NER task. However, feel free to read the official documentation and try it if some of the other components are not needed.
Optimizer now is initialized using nlp.create_optimizer() instead of nlp.begin_training()
Note that annotations are already a dictionary in the expected format so you don't need to wrap it in a new dictionary: Example.from_dict(doc, annotations) should do the job.
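Finally, the losses are now returned as the result of the model update, so you no longer have to read them from a dictionary passed as a parameter (the losses parameter is still accepted, and passing the same dictionary on every call accumulates the losses across calls).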
I hope this helps; please ask if you need more help.
Best regards!
EDIT:
I also want to suggest some changes in your training script to take more advantage of spacy utils:
Use the spacy.util.minibatch utility to create mini-batches from your training data.
Pass a whole mini-batch of examples to the update method instead of a mini-batch containing only one example.
Your code, including this improvement among other minor changes, would look as follows:
import random
import spacy
from spacy.training import Example
def train_spacy(data, iterations=30):
    """Train a spaCy NER component on ``data`` and return the pipeline.

    Args:
        data: Sequence of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``"entities"`` value is a list of
            ``(start, end, label)`` tuples.
        iterations: Number of passes over the training data.

    Returns:
        The updated ``nlp`` pipeline.
    """
    # Work on a copy so that shuffling does not reorder the caller's list.
    train_data = list(data)
    # nlp = spacy.blank("en")  # start with a blank model
    nlp = spacy.load("en_core_web_lg")
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")
    # Register every label found in the training data; an example without an
    # "entities" key simply contributes no labels.
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])
    # Init and configure optimizer
    optimizer = nlp.create_optimizer()
    optimizer.learn_rate = 0.001  # Change to the learning rate you prefer
    batch_size = 32  # Choose the batch size you prefer
    for itn in range(iterations):
        print("Starting iterations " + str(itn))
        random.shuffle(train_data)
        losses = {}
        # Batch the examples and iterate over them
        for batch in spacy.util.minibatch(train_data, size=batch_size):
            # Create an Example instance for each training example in the batch
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            # nlp.update returns the losses dict. Handing the running dict
            # back in makes spaCy add to it, so the value printed below covers
            # the whole iteration rather than only its last batch.
            losses = nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
        print(losses)
    return nlp
if __name__ == "__main__":
    # Mock training set: every text starts with a single ANIMAL entity.
    TRAIN_DATA = [
        ("Dog is an animal", {"entities": [(0, 3, "ANIMAL")]}),
        ("Cat is on the table", {"entities": [(0, 3, "ANIMAL")]}),
        ("Rats are pets", {"entities": [(0, 4, "ANIMAL")]}),
    ]
    nlp = train_spacy(TRAIN_DATA)

XGBoost + GridSearch: weird warning

Below is a code I wrote for Hyperparameter tuning of XGboost using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, auc
from pprint import pprint
from xgboost import XGBClassifier
import time
# instantiate XGBoost model
# n_jobs is the scikit-learn wrapper's parallelism parameter; "nthreads" is
# not a recognised XGBClassifier argument.
clf = XGBClassifier(missing=np.nan, n_jobs=-1)
# Define scoring metrics
scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
}
# Candidate values sampled by the randomized search
param_grid_dummy = {
    "n_estimators": [25, 250],
    "max_depth": [3, 5],
    # Previously written "0,005", which Python reads as two list items (0 and
    # 5), so the search was trying learning rates of 0 and 5.
    "learning_rate": [0.0005, 0.005],
}
def random_search_wrapper(refit_score='precision_score'):
    """Run a randomized hyper-parameter search and report the outcome.

    Fits a RandomizedSearchCV on the training data, refitting the best
    candidate according to ``refit_score``, then prints the best parameters
    and the confusion matrix obtained on the test data.

    Returns the fitted RandomizedSearchCV object.
    """
    search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_grid_dummy,
        n_iter=3,
        scoring=scorers,
        refit=refit_score,
        cv=3,
        return_train_score=True,
        n_jobs=-1,
    )
    search.fit(X_train_df, Y_train)

    # Predictions on the test set
    predictions = search.predict(X_test_df)

    print('Best params for {}'.format(refit_score))
    print(search.best_params_)

    # Confusion matrix on the test data
    print('\nConfusion matrix of Random Forest optimized for {} on the test data: '.format(refit_score))
    matrix = confusion_matrix(Y_test, predictions)
    report = pd.DataFrame(
        matrix,
        columns=['pred_neg', 'pred_pos'],
        index=['neg', 'pos'],
    )
    print(report)
    return search
# Optimize classifier for precision score (the refit metric passed below)
start = time.time()
rf_random_cl = random_search_wrapper(refit_score='precision_score')
# Print the elapsed wall-clock time in minutes
end = time.time()
print()
print((end - start)/60, "minutes")
I get a weird warning.
/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
if diff:
Can someone please help me understand what I am doing wrong here?
When I do a simple clf.fit(X_train_df, Y_train), it works perfectly fine.
This is an issue with the sklearn version: some versions < 0.20.1 emit this warning.
Code is correct.

A bytes-like object is required, not 'Tensor' when calling map on string tensors in eager mode

I am trying to use TF.dataset.map to port over this old code because I get a deprecation warning.
Old code which reads a set of custom protos from a TFRecord file:
# Old approach: iterate over the raw records of the TFRecord file and parse
# each one into a MyProto message.
record_iterator = tf.python_io.tf_record_iterator(path=filename)
for record in record_iterator:
example = MyProto()
example.ParseFromString(record)
I am trying to use eager mode and map, but I get this error.
# Parse one serialized record into a MyProto message.
# NOTE(review): when called through Dataset.map, `string` arrives as a Tensor
# rather than bytes (this is the TypeError quoted below), and nothing is returned.
def parse_proto(string):
proto_object = MyProto()
proto_object.ParseFromString(string)
# NOTE(review): the dataset is bound to `dataset` but mapped through
# `raw_tf_dataset` — presumably meant to be the same object; confirm.
dataset = tf.data.TFRecordDataset(dataset_paths)
parsed_protos = raw_tf_dataset.map(parse_proto)
This code works:
# Iterating the dataset directly gives elements with a .numpy() method; its
# result is what ParseFromString is given here.
for raw_record in raw_tf_dataset:
proto_object = MyProto()
proto_object.ParseFromString(raw_record.numpy())
But the map gives me an error:
TypeError: a bytes-like object is required, not 'Tensor'
What is the right way to take the argument that map passes to the function and treat it like a string?
You need to extract the string from the tensor and use it in the map function. Below are the steps to implement in the code to achieve this.
You have to wrap the map function with tf.py_function, e.g. tf.py_function(get_path, [x], [tf.float32]). You can find more about tf.py_function here. In tf.py_function, the first argument is the name of the map function, the second argument is the element to be passed to the map function, and the final argument is the return type.
You can get the string part by using bytes.decode(file_path.numpy()) in the map function.
So modify your program as below,
parsed_protos = raw_tf_dataset.map(parse_proto)
to
parsed_protos = raw_tf_dataset.map(lambda x: tf.py_function(parse_proto, [x], [function return type]))
Also modify parse_proto as below,
def parse_proto(string):
proto_object = MyProto()
proto_object.ParseFromString(string)
to
def parse_proto(string):
    """Parse one serialized record into a MyProto message.

    Args:
        string: Eager string tensor, as handed over by ``tf.py_function``.
    """
    proto_object = MyProto()
    # ParseFromString needs the raw serialized bytes. Decoding them to str
    # (as bytes.decode does) is wrong here: serialized protobuf data is
    # binary, not text, and a str is not a bytes-like object.
    proto_object.ParseFromString(string.numpy())
    # NOTE(review): tf.py_function expects return values matching the types
    # listed in its last argument — return the fields you need from
    # proto_object once that list is decided.
In the simple program below, we use tf.data.Dataset.list_files to read the path of the image. Next, in the map function, we read the image using load_img and then apply tf.image.central_crop to crop the central part of the image.
Code -
%tensorflow_version 2.x
import tensorflow as tf
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array, array_to_img
from matplotlib import pyplot as plt
import numpy as np
def load_file_and_process(path):
    """Load the image at ``path`` and return a random central crop of it.

    ``path`` must expose ``.numpy()`` returning the encoded file name. The
    image is loaded at 224x224 and the central fraction that is kept is
    drawn with ``np.random.uniform(0.50, 1.00)``.
    """
    file_name = path.numpy().decode()
    pixels = img_to_array(load_img(file_name, target_size=(224, 224)))
    crop_fraction = np.random.uniform(0.50, 1.00)
    return tf.image.central_crop(pixels, crop_fraction)
# Dataset of the file paths matching the given pattern.
train_dataset = tf.data.Dataset.list_files('/content/bird.jpg')
# tf.py_function runs the Python loader on each path; the last argument
# declares a single float32 output.
train_dataset = train_dataset.map(lambda x: tf.py_function(load_file_and_process, [x], [tf.float32]))
# Presumably each element `f` is the one-item list of outputs, hence the inner loop.
for f in train_dataset:
for l in f:
image = np.array(array_to_img(l))
plt.imshow(image)
Output -
Hope this answers your question. Happy Learning.

How to load pickle files by tensorflow's tf.data API

I have my data in multiple pickle files stored on disk. I want to use tensorflow's tf.data.Dataset to load my data into training pipeline. My code goes:
# Intended to load one pickle file and return its (image, label) pair.
def _parse_file(path):
# Placeholder from the question, not runnable Python.
image, label = *load pickle file*
return image, label
paths = glob.glob('*.pkl')
print(len(paths))
# One dataset element per pickle path; _parse_file is mapped over them.
dataset = tf.data.Dataset.from_tensor_slices(paths)
dataset = dataset.map(_parse_file)
iterator = dataset.make_one_shot_iterator()
The problem is that I don't know how to implement the _parse_file function. The argument to this function, path, is of tensor type. I tried
# Attempt: evaluate the path tensor in a new session, then unpickle the file.
# This is the version that produces the placeholder error quoted below.
def _parse_file(path):
with tf.Session() as s:
p = s.run(path)
image, label = pickle.load(open(p, 'rb'))
return image, label
and got error message:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'arg0' with dtype string
[[Node: arg0 = Placeholder[dtype=DT_STRING, shape=<unknown>, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
After some search on the Internet I still have no idea how to do it. I will be grateful to anyone providing me a hint.
I have solved this myself. I should use tf.py_func as in this doc.
This is how I solved this issue. I didn't use the tf.py_func; check out function "load_encoding()" below, which is what's doing the pickle reading. The FACELIB_DIR contains directories of pickled vggface2 encodings, each directory named for the person of those face encodings.
import tensorflow as tf
import pickle
import os
FACELIB_DIR = '/var/noggin/FaceEncodings'

# Class names are the sorted, non-hidden sub-directories of FACELIB_DIR; each
# name maps to its position in that sorted list.
labelNames = sorted(
    entry
    for entry in os.listdir(FACELIB_DIR)
    if os.path.isdir(os.path.join(FACELIB_DIR, entry)) and not entry.startswith('.')
)
labelStrToInt = {name: index for index, name in enumerate(labelNames)}
# Function load_encoding - Loads Encoding data from enc2048 file in filepath
# This reads an encoding from disk, and through the file path gets the integer label, returns both
def load_encoding(file_path):
    """Load one pickled encoding together with the integer label of its directory.

    Args:
        file_path: Path relative to FACELIB_DIR, "<label dir>/<file>". May be
            a str, bytes, or any object exposing ``.numpy()`` (for example an
            eager string tensor received through ``tf.py_function``).

    Returns:
        ``(encoding, label)`` where ``label`` is the value stored in
        ``labelStrToInt`` for the file's directory name.

    Raises:
        KeyError: If the directory name is not a key of ``labelStrToInt``.
    """
    if hasattr(file_path, "numpy"):
        file_path = file_path.numpy()
    if isinstance(file_path, bytes):
        file_path = file_path.decode("utf-8")
    # SECURITY: unpickling can execute arbitrary code; only load encoding
    # files that you created yourself.
    with open(os.path.join(FACELIB_DIR, file_path), 'rb') as fin:
        A, _ = pickle.load(fin)  # encodings, source_image_name
    # The label is the directory holding the file. Plain string handling is
    # used because tf.strings.split returns a tensor, which cannot be used to
    # look up the str keys of labelStrToInt.
    label_str = os.path.basename(os.path.dirname(file_path))
    return (A, labelStrToInt[label_str])
# Collect "<class dir>/<file>" for every .enc2048 file in our data library
class_dirs = sorted(
    name
    for name in os.listdir(FACELIB_DIR)
    if os.path.isdir(os.path.join(FACELIB_DIR, name)) and not name.startswith('.')
)
encpaths = []
for class_dir in class_dirs:
    # All the encoding files of this class, in sorted order
    file_names = sorted(
        file_name
        for file_name in os.listdir(os.path.join(FACELIB_DIR, class_dir))
        if file_name.endswith('.enc2048')
    )
    encpaths.extend(os.path.join(class_dir, file_name) for file_name in file_names)
dataset = tf.data.Dataset.from_tensor_slices(encpaths)
# Shuffle and speed improvements on the dataset
BATCH_SIZE = 64
from tensorflow.data import AUTOTUNE
# Cache first, then shuffle: caching the output of shuffle() would store the
# first pass's order and replay that same order on every repeat.
# NOTE(review): load_encoding is not applied anywhere in this snippet, so the
# elements are still path strings — presumably a map step is missing; confirm.
dataset = (
    dataset
    .cache()
    .shuffle(1024)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
# Benchmark our tf.data pipeline: time how long NUM_STEPS batches take
import time
NUM_STEPS = 10000
datasetGen = iter(dataset)
start_time = time.time()
for i in range(NUM_STEPS):
    X = next(datasetGen)
totalTime = time.time() - start_time
tensor_count = BATCH_SIZE * NUM_STEPS
print('==> tf.data generated {} tensors in {:.2f} seconds'.format(tensor_count, totalTime))
tf.py_func
This function can be used to solve that problem, as is also mentioned in the docs.

tensorboard with numpy array

Can someone give a example on how to use tensorboard visualize numpy array value?
There is a related question here, I don't really get it.
Tensorboard logging non-tensor (numpy) information (AUC)
For example,
If I have
for i in range(100):
foo = np.random.rand(3,2)
How can I keep tracking the distribution of foo using tensorboard for 100 iterations? Can someone give a code example?
Thanks.
For simple values (scalar), you can use this recipe
# Record a single scalar by filling a Summary proto directly and handing it
# to the writer.
summary_writer = tf.train.SummaryWriter(FLAGS.logdir)
summary = tf.Summary()
# `tagname` names the series; `value` is the number to record.
summary.value.add(tag=tagname, simple_value=value)
summary_writer.add_summary(summary, global_step)
summary_writer.flush()
As for arrays, perhaps you can add the 6 values one after another, i.e.
# foo has shape (3, 2): iterating it directly yields rows, not the six
# scalars that simple_value needs, so walk the flattened array instead.
for value in foo.ravel():
    summary.value.add(tag=tagname, simple_value=float(value))
Another (simplest) way is just using placeholders. First, you can make a placeholder for your numpy array shape.
# Some place holders for summary
summary_reward = tf.placeholder(tf.float32, shape=(), name="reward")
tf.summary.scalar("reward", summary_reward)
Then, just call session.run the merged summary with the feed_dict.
# Summary
summ = tf.summary.merge_all()
...
s = sess.run(summ, feed_dict={summary_reward: reward})
writer.add_summary(s, i)
if you install this package via pip install tensorboard-pytorch it becomes as straightforward as it can get:
import numpy as np
from tensorboardX import SummaryWriter
# Writer created with its default arguments.
writer = SummaryWriter()
for i in range(50):
# Log 1000 samples drawn with np.random.normal(mean=i, std=i) as the histogram for step i.
writer.add_histogram("moving_gauss", np.random.normal(i, i, 1000), i, bins="auto")
writer.close()
Will generate the corresponding histogram data in the runs directory:
I found a workaround: create a variable, assign the value of the numpy array to the variable, and use tensorboard to track the variable
mysummary_writer = tf.train.SummaryWriter("./tmp/test/")
# Variable with the same shape as the numpy array that is tracked.
a = tf.Variable(tf.zeros([3,2]), name="a")
sum1 = tf.histogram_summary("nparray1", a)
summary_op = tf.merge_all_summaries()
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for ii in range(10):
foo = np.random.rand(3, 2)
# NOTE(review): a new assign op is created on every iteration — presumably
# this keeps adding nodes to the graph; confirm.
assign_op = a.assign(foo)
# NOTE(review): the summary and the assignment are fetched in the same run call
# with no explicit ordering between them; verify the summary sees the new value.
summary, _ = sess.run([summary_op, assign_op])
mysummary_writer.add_summary(tf.Summary.FromString(summary), global_step=ii)
mysummary_writer.flush()
sess = tf.Session()
writer = tf.summary.FileWriter('tensorboard_test')
# Non-trainable variable that only exists to carry the value being logged.
var = tf.Variable(0.0, trainable=False, name='loss')
sess.run(var.initializer)
summary_op = tf.summary.scalar('scalar1', var)
# `i` was never defined in the original loop; enumerate supplies the step
# index so each value is written at its own global step.
for i, value in enumerate(array):
    sess.run(var.assign(value))
    summary = sess.run(summary_op)
    writer.add_summary(summary, i)
It works, but slow.
You could define a function like this (taken from gyglim's gist):
def add_histogram(writer, tag, values, step, bins=1000):
    """
    Logs the histogram of a list/vector of values.
    From: https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514

    Args:
        writer: Summary writer exposing ``add_summary(summary, step)``.
        tag: Name under which the histogram is recorded.
        values: Array-like of numbers (a list or a numpy array).
        step: Global step the histogram is recorded at.
        bins: Passed through to ``np.histogram``.
    """
    # Accept plain lists too: .size and ** below need an ndarray.
    values = np.asarray(values)

    # Create histogram using numpy
    counts, bin_edges = np.histogram(values, bins=bins)

    # Fill fields of histogram proto
    hist = tf.HistogramProto()
    hist.min = float(np.min(values))
    hist.max = float(np.max(values))
    hist.num = int(values.size)
    hist.sum = float(np.sum(values))
    hist.sum_squares = float(np.sum(values ** 2))

    # Requires equal number as bins, where the first goes from -DBL_MAX to bin_edges[1]
    # See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto#L30
    # Therefore we drop the start of the first bin
    hist.bucket_limit.extend(bin_edges[1:].tolist())
    hist.bucket.extend(counts.tolist())

    # Create and write Summary
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
    writer.add_summary(summary, step)
And then add to the summary writer like this:
add_histogram(summary_writer, "Histogram_Name", your_numpy_array, step)
You can plot the vector with matplotlib, convert the plot to numpy array along the lines of
https://stackoverflow.com/a/35362787/10873169, and then add it to Tensorboard as image
import numpy as np
from matplotlib.backends.backend_agg import FigureCanvasAgg
from matplotlib.figure import Figure
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/example")
for step in range(1, 10):
    # Time-dependent vector we want to visualise
    example_vector = np.sin(np.arange(100) / step)

    # Render it with matplotlib first. Default DPI doesn't look good in Tensorboard
    fig = Figure(figsize=(5, 2), dpi=200)
    canvas = FigureCanvasAgg(fig)
    axes = fig.gca()
    axes.plot(example_vector)
    canvas.draw()

    # Raw RGB bytes of the rendered canvas as a flat uint8 array
    flat_pixels = np.frombuffer(canvas.tostring_rgb(), dtype='uint8')
    # get_width_height() is (width, height); the image needs (height, width, channels)
    width, height = canvas.get_width_height()
    reshaped_image = flat_pixels.reshape((height, width, 3))

    # Write to Tensorboard logs
    writer.add_image(
        "example_vector",
        reshaped_image,
        dataformats="HWC",
        global_step=step,
    )
writer.close()