How do I train a pseudo-projective parser on spaCy? - spacy

I am trying to train a parser for custom semantics following the sample code from https://raw.githubusercontent.com/explosion/spaCy/master/examples/training/train_intent_parser.py
The idea is to get a non-projective parse so when I pass a text like: ROOT AAAA BBBB 12 21 12 becomes a child of AAAA and 21 becomes a child of BBBB. To test this I am training only this case and testing this same case but it doesn't seem to work, what I get as a response is:
[('ROOT', 'ROOT', 'ROOT'), ('AAAA', 'LETTERS', 'ROOT'), ('BBBB', 'LETTERS', 'ROOT'), ('12', 'NUMBERS', 'BBBB'), ('21', 'NUMBERS', 'BBBB')]
As you can see both numbers are dependent on BBBB when 12 should be dependent on AAAA.
The code I am using to train and test is:
import plac
import random
import spacy
from spacy.util import minibatch, compounding
TRAIN_DATA = list()
samples = 1000
for _ in range(samples):
sample = (
'ROOT AAAA BBBB 12 21',
{
'heads': [0, 0, 0, 1, 2],
'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
}
)
TRAIN_DATA.append(sample)
def test_model(nlp):
texts = ['ROOT AAAA BBBB 12 21']
docs = nlp.pipe(texts)
for doc in docs:
print(doc.text)
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
#plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
n_iter=("Number of training iterations", "option", "n", int),
)
# Just in case I am using the german model since it supports pseudo-projective parsing (https://explosion.ai/blog/german-model#word-order)
def main(model='de_core_news_sm', n_iter=15):
"""Load the model, set up the pipeline and train the parser."""
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# We'll use the built-in dependency parser class, but we want to create a
# fresh instance – just in case.
if "parser" in nlp.pipe_names:
nlp.remove_pipe("parser")
parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True)
for text, annotations in TRAIN_DATA:
for dep in annotations.get("deps", []):
parser.add_label(dep)
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model
test_model(nlp)
if __name__ == "__main__":
plac.call(main)
So, what am I doing wrong?
Thank you in advance for any help on this!

The problem is that the simple training example script isn't projectivitizing the training instances when initializing and training the model. The parsing algorithm itself can only handle projective parses, but if the parser component finds projectivized labels in its output, they're deprojectivitzed in a postprocessing step. You don't need to modify any parser settings (so starting with a German model makes no difference), just provide projectivized input in the right format.
The initial projectivization is handled automatically by the train CLI, which uses GoldCorpus.train_docs() to prepare the training examples for nlp.update() and sets make_projective=True when creating the GoldParses. In general, I'd recommend switching to the train CLI (which also requires switching to the internal JSON training format, which is admittedly a minor hassle), because the train CLI sets a lot of better defaults.
However, a toy example also works fine as long as you create projectivized training examples (with GoldParse(make_projective=True), add all the projectivized dependency labels to the parser, and train with Doc and the projectivized GoldParse input instead of the text/annotation input:
# tested with spaCy v2.2.4
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
TRAIN_DATA = [
(
'ROOT AAAA BBBB 12 21',
{
'heads': [0, 0, 0, 1, 2],
'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
}
)
]
samples = 200
def test_model(nlp):
texts = ["ROOT AAAA BBBB 12 21"]
for doc in nlp.pipe(texts):
print(doc.text)
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
spacy.displacy.serve(doc)
#plac.annotations(
n_iter=("Number of training iterations", "option", "n", int),
)
def main(n_iter=10):
"""Load the model, set up the pipeline and train the parser."""
nlp = spacy.blank("xx")
parser = nlp.create_pipe("parser")
nlp.add_pipe(parser)
docs_golds = []
for text, annotation in TRAIN_DATA:
doc = nlp.make_doc(text)
gold = GoldParse(doc, **annotation, make_projective=True)
# add the projectivized labels
for dep in gold.labels:
parser.add_label(dep)
docs_golds.append((doc, gold))
# duplicate the training instances
docs_golds = docs_golds * samples
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train parser
optimizer = nlp.begin_training(min_action_freq=1)
for itn in range(n_iter):
random.shuffle(docs_golds)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(docs_golds, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model
test_model(nlp)
if __name__ == "__main__":
plac.call(main)

Related

Tensor flow and tflearn Chatbot keeps on getting high probability even when user input is wrong

I coded a simple AI chatbot with TensorFlow and tflearn and it runs just fine but the issue is when the user inputs the wrong thing, the bot is supposed to say it doesnt understand if the prediction accuracy is less than 70%, but the bot always scores above that even if the user gives jibberish like "rjrigrejfr". The bot assumes theyre greeting them. The patterns its supposed to study in the json are "patterns": ["Hi", "How are you", "Wassup", "Hello", "Good day", "Waddup", "Yo"]. I can share the json file if needed its short. Anyway, this is the python code:
import numpy as np
import nltk
import tensorflow
import tflearn
import random
import json
import pickle
# Some extra configuration:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
nltk.download('punkt')
# Load the data from the json file into a variable.
with open("intents.json") as file:
data = json.load(file)
# If we already have saved data, we do not need to retrain the model and waste time (could develop into an issue in more complex programs. Save in pickle. )
try:
with open("data.pickle", "rb") as f: # rb stands for bytes.
words, labels, training, output = pickle.load(f)
# --- Pre-training data preparation ---
except:
words = []
docsx = [] # Stores patterns
docsy = [] # Stores intents
labels = [] # All the specific tag values such as greeting, contact, etc.
for intent in data["intents"]:
for pattern in intent["patterns"]:
w = nltk.word_tokenize(pattern) # nltk function that splits the sentences inside intent into words list.
words.extend(w) # Add the tokenized list to words list.
docsx.append(w)
docsy.append(intent["tag"]) # append the classification of the sentence
if intent["tag"] not in labels:
labels.append(intent["tag"])
words = [stemmer.stem(w.lower()) for w in words if w not in ".?!"] # Stemming the words to remove unnecessary elements leaving their root. Convert all to lowercase.
words = sorted(list(set(words))) # Set ensures no duplicate elements then we convert back to list and sort it.
labels = sorted(labels)
training = []
output = []
out_empty = [0 for i in range(len(labels))] # Gives a list of 0 ints based on # of tags. This is useful later in the program when binerizing.
# One hot encoding the intent categories. Need to one-hot code the data which improves the efficiency of the ML to "binerize" the data.
# In this case, we have a list of 0s and 1s if the word appears it is assigned a 1 else a 0.
for x, doc in enumerate(docsx):
bag = [] # Bag of words or the one-hot coded data for the ML.
docx_word_stemmed = [stemmer.stem(word) for word in doc] # Stemming the data in docx.
# Now adding and transforming data into the one-hot coded list/bag of words data.
for i in words:
if i in docx_word_stemmed: # Checking against stemmed words:
# Word exists
bag.append(1)
else:
bag.append(0)
output_row = out_empty[:] # Copying out_empty
# Going through the labels list using .index() and for the occurance of docx value in docy, assign binary 1.
output_row[labels.index(docsy[x])] = 1
training.append(bag)
output.append(output_row)
# Required to use numpy arrays for use in tflearn. It is also faster.
training = np.array(training)
output = np.array(output)
# Saving the data so we do not need to do the data configuration every time.
with open("data.pickle", "wb") as f:
pickle.dump((words, labels, training, output), f)
try:
model.load('model.tflearn')
except:
tensorflow.compat.v1.reset_default_graph()
net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(output[0]), activation='softmax')
net = tflearn.regression(net)
model = tflearn.DNN(net)
model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
model.save("model.tflearn")
def bagofwords(sentence, words):
bag = [0 for _ in range(len(words))] # blank bag of words.
# Tokenize s and then stem it.
sentence_words = nltk.word_tokenize(sentence)
sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
for string in sentence_words:
for i, word in enumerate(words):
if word == string:
bag[i] = 1
return np.array(bag)
def chat():
print("Hello there! I'm the SRO AI Virtual Assistant. How am I help you?")
# Figure out the error slime!
while True:
user_input = input("Type here:")
if user_input == "quit":
break
result = model.predict([bagofwords(user_input, words)])[0] #bagofwords func and predict function to give predictions on what the user is saying.
best_result = np.argmax(result) # We want to only use the best result.
tag = labels[best_result]
print(result[best_result])
# Open JSON file and pick a response.
if result[best_result] > 0.7:
for tg in data["intents"]:
if tg['tag'] == tag:
responses = tg['responses']
print(random.choice(responses))
else:
print("I don't quite understand")
chat()

Sampling for large class and augmentation for small classes in each batch

Let's say we have 2 classes one is small and the second is large.
I would like to use for data augmentation similar to ImageDataGenerator
for the small class, and sampling from each batch, in such a way, that, that each batch would be balanced. (Fro minor class- augmentation for major class- sampling).
Also, I would like to continue using image_dataset_from_directory (since the dataset doesn't fit into RAM).
What about
sample_from_datasets
function?
import tensorflow as tf
from tensorflow.python.data.experimental import sample_from_datasets
def augment(val):
# Example of augmentation function
return val - tf.random.uniform(shape=tf.shape(val), maxval=0.1)
big_dataset_size = 1000
small_dataset_size = 10
# Init some datasets
dataset_class_large_positive = tf.data.Dataset.from_tensor_slices(tf.range(100, 100 + big_dataset_size, dtype=tf.float32))
dataset_class_small_negative = tf.data.Dataset.from_tensor_slices(-tf.range(1, 1 + small_dataset_size, dtype=tf.float32))
# Upsample and augment small dataset
dataset_class_small_negative = dataset_class_small_negative \
.repeat(big_dataset_size // small_dataset_size) \
.map(augment)
dataset = sample_from_datasets(
datasets=[dataset_class_large_positive, dataset_class_small_negative],
weights=[0.5, 0.5]
)
dataset = dataset.shuffle(100)
dataset = dataset.batch(6)
iterator = dataset.as_numpy_iterator()
for i in range(5):
print(next(iterator))
# [109. -10.044552 136. 140. -1.0505208 -5.0829906]
# [122. 108. 141. -4.0211563 126. 116. ]
# [ -4.085523 111. -7.0003924 -7.027302 -8.0362625 -4.0226436]
# [ -9.039093 118. -1.0695585 110. 128. -5.0553837]
# [100. -2.004463 -9.032592 -8.041705 127. 149. ]
Set up the desired balance between the classes in the weights parameter of sample_from_datasets.
As it was noticed by
Yaoshiang,
the last batches are imbalanced and the datasets length are different. This can be avoided by
# Repeat infinitely both datasets and augment the small one
dataset_class_large_positive = dataset_class_large_positive.repeat()
dataset_class_small_negative = dataset_class_small_negative.repeat().map(augment)
instead of
# Upsample and augment small dataset
dataset_class_small_negative = dataset_class_small_negative \
.repeat(big_dataset_size // small_dataset_size) \
.map(augment)
This case, however, the dataset is infinite and the number of batches in epoch has to be further controlled.
You can use tf.data.Dataset.from_generator that allows more control on your data generation without loading all your data into RAM.
def generator():
i=0
while True :
if i%2 == 0:
elem = large_class_sample()
else :
elem =small_class_augmented()
yield elem
i=i+1
ds= tf.data.Dataset.from_generator(
generator,
output_signature=(
tf.TensorSpec(shape=yourElem_shape , dtype=yourElem_ype))
This generator will alterate samples between the two classes,and you can add more dataset operations(batch , shuffle..)
I didn't totally follow the problem. Would psuedo-code this work? Perhaps there are some operators on tf.data.Dataset that are sufficient to solve your problem.
ds = image_dataset_from_directory(...)
ds1=ds.filter(lambda image, label: label == MAJORITY)
ds2=ds.filter(lambda image, label: label != MAJORITY)
ds2 = ds2.map(lambda image, label: data_augment(image), label)
ds1.batch(int(10. / MAJORITY_RATIO))
ds2.batch(int(10. / MINORITY_RATIO))
ds3 = ds1.zip(ds2)
ds3 = ds3.map(lambda left, right: tf.concat(left, right, axis=0)
You can use the tf.data.Dataset.from_tensor_slices to load the images of two categories seperately and do data augmentation for the minority class. Now that you have two datasets combine them with tf.data.Dataset.sample_from_datasets.
# assume class1 is the minority class
files_class1 = glob('class1\\*.jpg')
files_class2 = glob('class2\\*.jpg')
def augment(filepath):
class_name = tf.strings.split(filepath, os.sep)[0]
image = tf.io.read_file(filepath)
image = tf.expand_dims(image, 0)
if tf.equal(class_name, 'class1'):
# do all the data augmentation
image_flip = tf.image.flip_left_right(image)
return [[image, class_name],[image_flip, class_name]]
# apply data augmentation for class1
train_class1 = tf.data.Dataset.from_tensor_slices(files_class1).\
map(augment,num_parallel_calls=tf.data.AUTOTUNE)
train_class2 = tf.data.Dataset.from_tensor_slices(files_class2)
dataset = tf.python.data.experimental.sample_from_datasets(
datasets=[train_class1,train_class2],
weights=[0.5, 0.5])
dataset = dataset.batch(BATCH_SIZE)

ValueError when trying to fine-tune GPT-2 model in TensorFlow

I am encountering a ValueError in my Python code when trying to fine-tune Hugging Face's distribution of the GPT-2 model. Specifically:
ValueError: Dimensions must be equal, but are 64 and 0 for
'{{node Equal_1}} = Equal[T=DT_FLOAT, incompatible_shape_error=true](Cast_18, Cast_19)'
with input shapes: [64,0,1024], [2,0,12,1024].
I have around 100 text files that I concatenate into a string variable called raw_text and then pass into the following function to create training and testing TensorFlow datasets:
def to_datasets(raw_text):
# split the raw text in smaller sequences
seqs = [
raw_text[SEQ_LEN * i:SEQ_LEN * (i + 1)]
for i in range(len(raw_text) // SEQ_LEN)
]
# set up Hugging Face GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# tokenize the character sequences
tokenized_seqs = [
tokenizer(seq, padding="max_length", return_tensors="tf")["input_ids"]
for seq in seqs
]
# convert tokenized sequences into TensorFlow datasets
trn_seqs = tf.data.Dataset \
.from_tensor_slices(tokenized_seqs[:int(len(tokenized_seqs) * TRAIN_PERCENT)])
tst_seqs = tf.data.Dataset \
.from_tensor_slices(tokenized_seqs[int(len(tokenized_seqs) * TRAIN_PERCENT):])
def input_and_target(x):
return x[:-1], x[1:]
# map into (input, target) tuples, shuffle order of elements, and batch
trn_dataset = trn_seqs.map(input_and_target) \
.shuffle(SHUFFLE_BUFFER_SIZE) \
.batch(BATCH_SIZE, drop_remainder=True)
tst_dataset = tst_seqs.map(input_and_target) \
.shuffle(SHUFFLE_BUFFER_SIZE) \
.batch(BATCH_SIZE, drop_remainder=True)
return trn_dataset, tst_dataset
I then try to train my model, calling train_model(*to_datasets(raw_text)):
def train_model(trn_dataset, tst_dataset):
# import Hugging Face GPT-2 model
model = TFGPT2Model.from_pretrained("gpt2")
model.compile(
optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=tf.metrics.SparseCategoricalAccuracy()
)
model.fit(
trn_dataset,
epochs=EPOCHS,
initial_epoch=0,
validation_data=tst_dataset
)
The ValueError is triggered on the model.fit() call. The variables in all-caps are settings pulled in from a JSON file. Currently, they are set to:
{
"BATCH_SIZE":64,
"SHUFFLE_BUFFER_SIZE":10000,
"EPOCHS":500,
"SEQ_LEN":2048,
"TRAIN_PERCENT":0.9
}
Any information regarding what this error means or ideas on how to resolve it would be greatly appreciated. Thank you!
I'm having the same problem but when I change the batch size to 12 (same as n_layer parameter in the gpt-2 config file) it works.
I don't Know why it works but you can try it...
If you manage to solve it on different way I will be glad to hear.

pretrained vectors not loading in spacy

I am training a custom NER model from scratch using the spacy.blank("en") model. I add custom word vectors to it. The vectors are loaded as follows:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
med_vec = KeyedVectors.load_word2vec_format('./wikipedia-pubmed-and-PMC-w2v.bin', binary=True, limit = 300000)
and I add it to the blank model in this code snippet here:
def main(model=None, n_iter=3, output_dir=None):
"""Set up the pipeline and entity recognizer, and train the new entity."""
random.seed(0)
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank("en") # create blank Language class
nlp.vocab.reset_vectors(width=200)
for idx in range(len(med_vec.index2word)):
word = med_vec.index2word[idx]
vector = med_vec.vectors[idx]
nlp.vocab.set_vector(word, vector)
for key, vector in nlp.vocab.vectors.items():
nlp.vocab.strings.add(nlp.vocab.strings[key])
nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
print("Created blank 'en' model")
......Code for training the ner
I then save this model.
When I try to load the model,
nlp = spacy.load("./NDLA/vectorModel0")
I get the following error:
`~\AppData\Local\Continuum\anaconda3\lib\site-packages\thinc\neural\_classes\static_vectors.py in __init__(self, lang, nO, drop_factor, column)
47 if self.nM == 0:
48 raise ValueError(
---> 49 "Cannot create vectors table with dimension 0.\n"
50 "If you're using pre-trained vectors, are the vectors loaded?"
51 )
ValueError: Cannot create vectors table with dimension 0.
If you're using pre-trained vectors, are the vectors loaded?
I also get this warning:
UserWarning: [W019] Changing vectors name from spacy_pretrained_vectors to spacy_pretrained_vectors_336876, to avoid clash with previously loaded vectors. See Issue #3853.
"__main__", mod_spec)
The vocab directory in the model has a vectors file of size 270 MB. So I know it is not empty... What is causing this error?
You could try to pass all vectors at once instead of using a for loop.
nlp.vocab.vectors = spacy.vocab.Vectors(data=med_vec.syn0, keys=med_vec.vocab.keys())
So you're else statement would become like this:
else:
nlp = spacy.blank("en") # create blank Language class
nlp.vocab.reset_vectors(width=200)
nlp.vocab.vectors = spacy.vocab.Vectors(data=med_vec.syn0, keys=med_vec.vocab.keys())
nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
print("Created blank 'en' model")

Define instance key (index number) for Cloud machine learning prediction

I followed the 'Getting Started' tutorial for Cloud Machine Learning Engine and deployed it. I can pass an input file containing JSON instances to Batch Prediction service and it returns a file containing the predictions. How can I pass an instance key (index number) through the application graph unaltered so that the prediction contain the key and I know which JSON prediction belongs to which JSON input? It probably can be done by adding/changing a few lines in the original tutorial code (also copy pasted below). Can someone help me with that? I am relatively new to Tensorflow so a detailed description will be greatly appreciated. A sample code or tutorial will be very helpful, too... The 'Getting Started' sample code contains two files copy pasted below:
model.py
# Copyright 2016 Google Inc. All Rights Reserved. Licensed under the Apache
# License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Define a Wide + Deep model for classification on structured data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import six
import tensorflow as tf
# Define the format of your input data including unused columns
CSV_COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week',
'native_country', 'income_bracket']
CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
[0], [0], [0], [''], ['']]
LABEL_COLUMN = 'income_bracket'
LABELS = [' <=50K', ' >50K']
# Define the initial ingestion of each feature used by your model.
# Additionally, provide metadata about the feature.
INPUT_COLUMNS = [
# Categorical base columns
# For categorical columns with known values we can provide lists
# of values ahead of time.
tf.feature_column.categorical_column_with_vocabulary_list(
'gender', [' Female', ' Male']),
tf.feature_column.categorical_column_with_vocabulary_list(
'race',
[' Amer-Indian-Eskimo', ' Asian-Pac-Islander',
' Black', ' Other', ' White']
),
tf.feature_column.categorical_column_with_vocabulary_list(
'education',
[' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
' Doctorate', ' Prof-school', ' 5th-6th', ' 10th',
' 1st-4th', ' Preschool', ' 12th']),
tf.feature_column.categorical_column_with_vocabulary_list(
'marital_status',
[' Married-civ-spouse', ' Divorced', ' Married-spouse-absent',
' Never-married', ' Separated', ' Married-AF-spouse', ' Widowed']),
tf.feature_column.categorical_column_with_vocabulary_list(
'relationship',
[' Husband', ' Not-in-family', ' Wife', ' Own-child', ' Unmarried',
' Other-relative']),
tf.feature_column.categorical_column_with_vocabulary_list(
'workclass',
[' Self-emp-not-inc', ' Private', ' State-gov',
' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc',
' Without-pay', ' Never-worked']
),
# For columns with a large number of values, or unknown values
# We can use a hash function to convert to categories.
tf.feature_column.categorical_column_with_hash_bucket(
'occupation', hash_bucket_size=100, dtype=tf.string),
tf.feature_column.categorical_column_with_hash_bucket(
'native_country', hash_bucket_size=100, dtype=tf.string),
# Continuous base columns.
tf.feature_column.numeric_column('age'),
tf.feature_column.numeric_column('education_num'),
tf.feature_column.numeric_column('capital_gain'),
tf.feature_column.numeric_column('capital_loss'),
tf.feature_column.numeric_column('hours_per_week'),
]
UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name for col in INPUT_COLUMNS} - \
{LABEL_COLUMN}
def build_estimator(config, embedding_size=8, hidden_units=None):
"""Build a wide and deep model for predicting income category.
Wide and deep models use deep neural nets to learn high level abstractions
about complex features or interactions between such features.
These models then combined the outputs from the DNN with a linear regression
performed on simpler features. This provides a balance between power and
speed that is effective on many structured data problems.
You can read more about wide and deep models here:
https://research.googleblog.com/2016/06/wide-deep-learning-better-together-with.html
To define model we can use the prebuilt DNNCombinedLinearClassifier class,
and need only define the data transformations particular to our dataset, and
then
assign these (potentially) transformed features to either the DNN, or linear
regression portion of the model.
Args:
config: tf.contrib.learn.RunConfig defining the runtime environment for the
estimator (including model_dir).
embedding_size: int, the number of dimensions used to represent categorical
features when providing them as inputs to the DNN.
hidden_units: [int], the layer sizes of the DNN (input layer first)
learning_rate: float, the learning rate for the optimizer.
Returns:
A DNNCombinedLinearClassifier
"""
(gender, race, education, marital_status, relationship,
workclass, occupation, native_country, age,
education_num, capital_gain, capital_loss, hours_per_week) = INPUT_COLUMNS
# Build an estimator.
# Reused Transformations.
# Continuous columns can be converted to categorical via bucketization
age_buckets = tf.feature_column.bucketized_column(
age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
# Wide columns and deep columns.
wide_columns = [
# Interactions between different categorical features can also
# be added as new virtual features.
tf.feature_column.crossed_column(
['education', 'occupation'], hash_bucket_size=int(1e4)),
tf.feature_column.crossed_column(
[age_buckets, race, 'occupation'], hash_bucket_size=int(1e6)),
tf.feature_column.crossed_column(
['native_country', 'occupation'], hash_bucket_size=int(1e4)),
gender,
native_country,
education,
occupation,
workclass,
marital_status,
relationship,
age_buckets,
]
deep_columns = [
# Use indicator columns for low dimensional vocabularies
tf.feature_column.indicator_column(workclass),
tf.feature_column.indicator_column(education),
tf.feature_column.indicator_column(marital_status),
tf.feature_column.indicator_column(gender),
tf.feature_column.indicator_column(relationship),
tf.feature_column.indicator_column(race),
# Use embedding columns for high dimensional vocabularies
tf.feature_column.embedding_column(
native_country, dimension=embedding_size),
tf.feature_column.embedding_column(occupation, dimension=embedding_size),
age,
education_num,
capital_gain,
capital_loss,
hours_per_week,
]
return tf.estimator.DNNLinearCombinedClassifier(
config=config,
linear_feature_columns=wide_columns,
dnn_feature_columns=deep_columns,
dnn_hidden_units=hidden_units or [100, 70, 50, 25]
)
def parse_label_column(label_string_tensor):
"""Parses a string tensor into the label tensor
Args:
label_string_tensor: Tensor of dtype string. Result of parsing the
CSV column specified by LABEL_COLUMN
Returns:
A Tensor of the same shape as label_string_tensor, should return
an int64 Tensor representing the label index for classification tasks,
and a float32 Tensor representing the value for a regression task.
"""
# Build a Hash Table inside the graph
table = tf.contrib.lookup.index_table_from_tensor(tf.constant(LABELS))
# Use the hash table to convert string labels to ints and one-hot encode
return table.lookup(label_string_tensor)
# ************************************************************************
# YOU NEED NOT MODIFY ANYTHING BELOW HERE TO ADAPT THIS MODEL TO YOUR DATA
# ************************************************************************
def csv_serving_input_fn():
"""Build the serving inputs."""
csv_row = tf.placeholder(
shape=[None],
dtype=tf.string
)
features = parse_csv(csv_row)
features.pop(LABEL_COLUMN)
return tf.estimator.export.ServingInputReceiver(features, {'csv_row': csv_row})
def example_serving_input_fn():
"""Build the serving inputs."""
example_bytestring = tf.placeholder(
shape=[None],
dtype=tf.string,
)
feature_scalars = tf.parse_example(
example_bytestring,
tf.feature_column.make_parse_example_spec(INPUT_COLUMNS)
)
return tf.estimator.export.ServingInputReceiver(
features,
{'example_proto': example_bytestring}
)
# [START serving-function]
def json_serving_input_fn():
"""Build the serving inputs."""
inputs = {}
for feat in INPUT_COLUMNS:
inputs[feat.name] = tf.placeholder(shape=[None], dtype=feat.dtype)
return tf.estimator.export.ServingInputReceiver(inputs, inputs)
# [END serving-function]
SERVING_FUNCTIONS = {
'JSON': json_serving_input_fn,
'EXAMPLE': example_serving_input_fn,
'CSV': csv_serving_input_fn
}
def parse_csv(rows_string_tensor):
"""Takes the string input tensor and returns a dict of rank-2 tensors."""
# Takes a rank-1 tensor and converts it into rank-2 tensor
# Example if the data is ['csv,line,1', 'csv,line,2', ..] to
# [['csv,line,1'], ['csv,line,2']] which after parsing will result in a
# tuple of tensors: [['csv'], ['csv']], [['line'], ['line']], [[1], [2]]
row_columns = tf.expand_dims(rows_string_tensor, -1)
columns = tf.decode_csv(row_columns, record_defaults=CSV_COLUMN_DEFAULTS)
features = dict(zip(CSV_COLUMNS, columns))
# Remove unused columns
for col in UNUSED_COLUMNS:
features.pop(col)
return features
def input_fn(filenames,
num_epochs=None,
shuffle=True,
skip_header_lines=0,
batch_size=200):
"""Generates features and labels for training or evaluation.
This uses the input pipeline based approach using file name queue
to read data so that entire data is not loaded in memory.
Args:
filenames: [str] list of CSV files to read data from.
num_epochs: int how many times through to read the data.
If None will loop through data indefinitely
shuffle: bool, whether or not to randomize the order of data.
Controls randomization of both file order and line order within
files.
skip_header_lines: int set to non-zero in order to skip header lines
in CSV files.
batch_size: int First dimension size of the Tensors returned by
input_fn
Returns:
A (features, indices) tuple where features is a dictionary of
Tensors, and indices is a single Tensor of label indices.
"""
filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
if shuffle:
# Process the files in a random order.
filename_dataset = filename_dataset.shuffle(len(filenames))
# For each filename, parse it into one element per line, and skip the header
# if necessary.
dataset = filename_dataset.flat_map(
lambda filename: tf.data.TextLineDataset(filename).skip(skip_header_lines))
dataset = dataset.map(parse_csv)
if shuffle:
dataset = dataset.shuffle(buffer_size=batch_size * 10)
dataset = dataset.repeat(num_epochs)
dataset = dataset.batch(batch_size)
iterator = dataset.make_one_shot_iterator()
features = iterator.get_next()
return features, parse_label_column(features.pop(LABEL_COLUMN))
task.py
import argparse
import os
import trainer.model as model
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.utils import (
saved_model_export_utils)
from tensorflow.contrib.training.python.training import hparam
def run_experiment(hparams):
"""Run the training and evaluate using the high level API"""
train_input = lambda: model.input_fn(
hparams.train_files,
num_epochs=hparams.num_epochs,
batch_size=hparams.train_batch_size
)
# Don't shuffle evaluation data
eval_input = lambda: model.input_fn(
hparams.eval_files,
batch_size=hparams.eval_batch_size,
shuffle=False
)
train_spec = tf.estimator.TrainSpec(train_input,
max_steps=hparams.train_steps
)
exporter = tf.estimator.FinalExporter('census',
model.SERVING_FUNCTIONS[hparams.export_format])
eval_spec = tf.estimator.EvalSpec(eval_input,
steps=hparams.eval_steps,
exporters=[exporter],
name='census-eval'
)
run_config = tf.estimator.RunConfig()
run_config = run_config.replace(model_dir=hparams.job_dir)
print('model dir {}'.format(run_config.model_dir))
estimator = model.build_estimator(
embedding_size=hparams.embedding_size,
# Construct layers sizes with exponetial decay
hidden_units=[
max(2, int(hparams.first_layer_size *
hparams.scale_factor**i))
for i in range(hparams.num_layers)
],
config=run_config
)
tf.estimator.train_and_evaluate(estimator,
train_spec,
eval_spec)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# Input Arguments
parser.add_argument(
'--train-files',
help='GCS or local paths to training data',
nargs='+',
required=True
)
parser.add_argument(
'--num-epochs',
help="""\
Maximum number of training data epochs on which to train.
If both --max-steps and --num-epochs are specified,
the training job will run for --max-steps or --num-epochs,
whichever occurs first. If unspecified will run for --max-steps.\
""",
type=int,
)
parser.add_argument(
'--train-batch-size',
help='Batch size for training steps',
type=int,
default=40
)
parser.add_argument(
'--eval-batch-size',
help='Batch size for evaluation steps',
type=int,
default=40
)
parser.add_argument(
'--eval-files',
help='GCS or local paths to evaluation data',
nargs='+',
required=True
)
# Training arguments
parser.add_argument(
'--embedding-size',
help='Number of embedding dimensions for categorical columns',
default=8,
type=int
)
parser.add_argument(
'--first-layer-size',
help='Number of nodes in the first layer of the DNN',
default=100,
type=int
)
parser.add_argument(
'--num-layers',
help='Number of layers in the DNN',
default=4,
type=int
)
parser.add_argument(
'--scale-factor',
help='How quickly should the size of the layers in the DNN decay',
default=0.7,
type=float
)
parser.add_argument(
'--job-dir',
help='GCS location to write checkpoints and export models',
required=True
)
# Argument to turn on all logging
parser.add_argument(
'--verbosity',
choices=[
'DEBUG',
'ERROR',
'FATAL',
'INFO',
'WARN'
],
default='INFO',
)
# Experiment arguments
parser.add_argument(
'--train-steps',
help="""\
Steps to run the training job for. If --num-epochs is not specified,
this must be. Otherwise the training job will run indefinitely.\
""",
type=int
)
parser.add_argument(
'--eval-steps',
help='Number of steps to run evalution for at each checkpoint',
default=100,
type=int
)
parser.add_argument(
'--export-format',
help='The input format of the exported SavedModel binary',
choices=['JSON', 'CSV', 'EXAMPLE'],
default='JSON'
)
args = parser.parse_args()
# Set python level verbosity
tf.logging.set_verbosity(args.verbosity)
# Set C++ Graph Execution level verbosity
os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(
tf.logging.__dict__[args.verbosity] / 10)
# Run the training job
hparams=hparam.HParams(**args.__dict__)
run_experiment(hparams)
In Tensorflow 2.x use Keras to write a new export signature that takes the original inputs plus the keys. Note that you have to define the shape(s) of your original input appropriately
#tf.function(input_signature=[tf.TensorSpec([None, 1], dtype=tf.float32), tf.TensorSpec([None, 1], dtype=tf.int32)])
def keyed_prediction(originput, key):
pred = model(originput, training=False)
return {
'price': pred,
'key': key
}
model.save(EXPORT_PATH, signatures={'serving_default': keyed_prediction})
In Tensorflow 1.x modify the export signature:
config = estimator.config
def model_fn2(features, labels, mode):
estimatorSpec = estimator._call_model_fn(features, labels, mode, config=config)
if estimatorSpec.export_outputs:
for ekey in ['predict', 'serving_default']:
estimatorSpec.export_outputs[ekey] = \
tf.estimator.export.PredictOutput(estimatorSpec.predictions)
return estimatorSpec
return tf.estimator.Estimator(model_fn=model_fn2, config=config)
See:
https://towardsdatascience.com/how-to-extend-a-canned-tensorflow-estimator-to-add-more-evaluation-metrics-and-to-pass-through-ddf66cd3047d
Currently, passing through keys requires exporting a model capable of handling keys. Unfortunately, in the current state of affairs, this also requires that the training data contain a column for keys.
The function tf.contrib.estimators.forward_features() is intended for this purpose, but it assumes the key is also present in your training data. In that case, you would simply add this line to model.py
estimator = model.build_estimator(...)
estimator = tf.contrib.estimators.forward_features(estimator, "key")
Where 'key' is the name of the column containing the key. (So you would also have to add another column to the CSV input files and modify the CSV_COLUMNS and CSV_DEFAULTS as appropriate).