spaCy nightly (3.0.0rc): load without vocab, how to add a word2vec vector space?

In spaCy 2 I use this to add a vocab to an empty spaCy model with a vector space (created with spacy init):
nlp3 = spacy.load('nl_core_news_sm')  # standard model without vectors
spacy.load("spacyinitnlmodelwithvectorspace", vocab=nlp3.vocab)
In spaCy nightly version 3.0.0rc the vocab parameter is no longer in spacy.load. Does anyone have a suggestion for how I can add a vocab to a spaCy model?

This works, adapted from "Export vectors from fastText to spaCy". It adds a .vec file to a spaCy model; only tested on a small dataset.
from __future__ import unicode_literals
import numpy
import spacy

def spacy_load_vec(spacy_model, vec_file, spacy_vec_model, print_words=False):
    """
    Turns a spaCy model without vectors plus a .vec file into a spaCy model with a vector space.
    (Export vectors from fastText to spaCy.)

    Parameters
    ----------
    spacy_model : str
        spaCy model without a vector space.
    vec_file : str
        .vec file with fastText- or word2vec-trained vectors.
    spacy_vec_model : str
        Output path for the spaCy model with a vector space.
    print_words : bool, optional
        Print the words while loading (True/False). The default is False.

    Returns
    -------
    None.
    """
    nlp = spacy.load(spacy_model)
    with open(vec_file, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        count = 0
        for line in file_:
            count += 1
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            if print_words:
                print("{} - {}".format(count, word))
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vector to the vocab
    nlp.to_disk(spacy_vec_model)
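For example, a quick usage sketch (the .vec file name and output path are placeholders):

# build a vector-space model from the small Dutch pipeline and a word2vec/fastText .vec file
spacy_load_vec('nl_core_news_sm', 'my_vectors.vec', './nl_model_with_vectors')

# the result can be loaded like any other spaCy model
nlp = spacy.load('./nl_model_with_vectors')
print(nlp.vocab.vectors.shape)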

Related

Can you iterate over hyperparameters in scikit?

Is there a way to iterate over a Random Forest model so that I can create a new model with different hyperparameters?
i.e.
model = RandomForestClassifier(n_estimators=N, max_depth=D)
I want to be able to construct a model for each N value ranging from 1-25 and D from 1-5.
Is this possible?
Thanks
There is more than one way to iterate over hyperparameters and train/test models. A simple approach would be:
from sklearn import ensemble
from sklearn import model_selection

# generating parameter grid
params = {
    "n_estimators": list(range(1, 26)),
    "max_depth": list(range(1, 6)),
}
grid = model_selection.ParameterGrid(params)

# iterate over grid and fit/score model with the varying hyperparameters
for param in grid:
    rf_clf = ensemble.RandomForestClassifier(**param)  # unpacking param, which is a dictionary
    rf_clf.fit(x_train, y_train)
    print(rf_clf.score(x_val, y_val), param)
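If you also want to keep the best combination from this simple loop, a minimal extension (a sketch assuming the same x_train, y_train, x_val, y_val as above) could look like this:

best_score, best_param = -1.0, None
for param in grid:
    rf_clf = ensemble.RandomForestClassifier(**param)
    rf_clf.fit(x_train, y_train)
    score = rf_clf.score(x_val, y_val)  # mean accuracy on the validation set
    if score > best_score:
        best_score, best_param = score, param
print(best_score, best_param)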
An alternate approach that preserves more information and includes cross-validation would be:
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection

rf_clf = ensemble.RandomForestClassifier()
params = {
    "n_estimators": list(range(1, 26)),
    "max_depth": list(range(1, 6)),
}
cv = model_selection.GridSearchCV(
    estimator=rf_clf,
    param_grid=params,
    scoring=metrics.make_scorer(metrics.accuracy_score)  # scorer of choice (optional)
)

# fits a model and saves info on each distinct hyperparameter combination using cross-validation
cv.fit(x_train, y_train)

# access the GridSearchCV object's info however you like, for example:
print(cv.best_score_, cv.best_params_)
print(cv.cv_results_)
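Since cv_results_ is a plain dict of arrays, it can also be inspected as a table; a small usage sketch (pandas assumed to be available):

import pandas as pd

results = pd.DataFrame(cv.cv_results_)
# one row per hyperparameter combination, best-ranked combinations first
print(results.sort_values("rank_test_score")[["params", "mean_test_score", "std_test_score"]].head())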

Why does ImportExampleGen read TFRecords as SparseTensor instead of Tensor?

I'm converting a CSV file into a TFRecords file like this:
File: ./dataset/csv/file.csv
feature_1, feture_2, output
1, 1, 1
2, 2, 2
3, 3, 3
import tensorflow as tf
import csv
import os

print(tf.__version__)

def create_csv_iterator(csv_file_path, skip_header):
    with tf.io.gfile.GFile(csv_file_path) as csv_file:
        reader = csv.reader(csv_file)
        if skip_header:  # Skip the header
            next(reader)
        for row in reader:
            yield row

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def create_example(row):
    """
    Returns a tensorflow.Example Protocol Buffer object.
    """
    features = {}
    for feature_index, feature_name in enumerate(["feature_1", "feture_2", "output"]):
        feature_value = row[feature_index]
        features[feature_name] = _int64_feature(int(feature_value))
    return tf.train.Example(features=tf.train.Features(feature=features))

def create_tfrecords_file(input_csv_file):
    """
    Creates a TFRecords file for the given input data.
    """
    output_tfrecord_file = input_csv_file.replace("csv", "tfrecords")
    writer = tf.io.TFRecordWriter(output_tfrecord_file)
    print("Creating TFRecords file at", output_tfrecord_file, "...")
    for i, row in enumerate(create_csv_iterator(input_csv_file, skip_header=True)):
        if len(row) == 0:
            continue
        example = create_example(row)
        content = example.SerializeToString()
        writer.write(content)
    writer.close()
    print("Finish Writing", output_tfrecord_file)

create_tfrecords_file("./dataset/csv/file.csv")
Then I'll read the generated TFRecords files using the ImportExampleGen class:
import os
import absl
import tensorflow as tf
import tensorflow_model_analysis as tfma

tf.get_logger().propagate = False

from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip

context = InteractiveContext()

example_gen = tfx.components.ImportExampleGen(input_base="./dataset/tfrecords")
context.run(example_gen, enable_cache=True)

statistics_gen = tfx.components.StatisticsGen(
    examples=example_gen.outputs['examples'])
context.run(statistics_gen, enable_cache=True)

schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)
context.run(schema_gen, enable_cache=True)
File: ./transform.py

def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
        inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
        Map from string feature key to transformed feature operations.
    """
    print(inputs)
    return inputs

transform = tfx.components.Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath("./transform.py"))
context.run(transform, enable_cache=True)
Inside the preprocessing_fn function, the inputs turn out to be SparseTensor objects. My question is: why? As far as I can tell, my dataset's samples are dense and they should be Tensor instead. Am I doing something wrong?
For anyone else who might be struggling with the same issue, I found the culprit. It's the SchemaGen class. This is how I was instantiating its object:
schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)
I don't know what the use case is for asking the SchemaGen class not to infer the shape of the features, but the tutorial I was following had it set to False and I had just copied and pasted the same thing. Comparing with some other tutorials, I realized that this could be the reason why I was getting SparseTensor.
So, if you let SchemaGen infer the shape of your features, or you load a hand-crafted schema in which you've set the shapes yourself, you'll get a Tensor in your preprocessing_fn. But if the shapes are not set, the features will be instances of SparseTensor.
For the sake of completeness, this is the fixed snippet:
schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=True)
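As a side note, if you do need to keep infer_feature_shape=False for some reason, the sparse inputs can still be densified inside preprocessing_fn. This is only a minimal sketch for single-valued int64 features like the ones above; the default_value of 0 is an arbitrary fill choice:

import tensorflow as tf

def preprocessing_fn(inputs):
    outputs = {}
    for key, value in inputs.items():
        if isinstance(value, tf.sparse.SparseTensor):
            # densify and fix the shape to one value per example
            value = tf.sparse.to_dense(value, default_value=0)
            value = tf.reshape(value, [-1, 1])
        outputs[key] = value
    return outputs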

SpaCy 3 -- ValueError: [E973] Unexpected type for NER data

I've been stressing out on this problem for so long and I can't seem to find a solution.
I want to train a NER model to recognise animal and species names.
I created a mock training set to test it out. However, I keep getting a ValueError: [E973] Unexpected type for NER data
I have tried solutions from other posts on Stack Overflow, including:
Double-checking whether the formatting and type of my training set were right
Using spacy.load('en_core_web_sm') instead of spacy.blank('en')
Installing spacy-lookups-data
All of these result in the same error.
import os
import spacy
from spacy.lang.en import English
from spacy.training.example import Example
import random

def train_spacy(data, iterations=30):
    TRAIN_DATA = data
    nlp = spacy.blank("en")  # start with a blank model
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iterations " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                doc = nlp.make_doc(text)
                print(isinstance(annotations["entities"], (list, tuple)))  # this prints True
                example = Example.from_dict(doc, {"entities": annotations})
                nlp.update(
                    [example],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses
                )
            print(losses)
    return nlp

if __name__ == "__main__":
    # mock training set
    TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
                  ('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
                  ('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
    nlp = train_spacy(TRAIN_DATA)
The error message:

  File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 49, in <module>
    nlp = train_spacy(TRAIN_DATA)
  File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 35, in train_spacy
    example = Example.from_dict(doc, {"entities":annotations})
  File "spacy\training\example.pyx", line 118, in spacy.training.example.Example.from_dict
  File "spacy\training\example.pyx", line 24, in spacy.training.example.annotations_to_doc
  File "spacy\training\example.pyx", line 388, in spacy.training.example._add_entities_to_doc
ValueError: [E973] Unexpected type for NER data
I had the same problem when I migrated some code of mine from a 2.x version of spaCy to a 3.x version, since several things changed.
Also, in your case it looks like you have a mix of spaCy 2.x and 3.x syntax. The next version of your code, with a few changes, works for me using spaCy 3.2.1:
import random
import spacy
from spacy.training import Example

def train_spacy(data, iterations=30):
    TRAIN_DATA = data
    # nlp = spacy.blank("en")  # start with a blank model
    nlp = spacy.load("en_core_web_lg")
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    # other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # with nlp.disable_pipes(*other_pipes):
    losses = None
    optimizer = nlp.create_optimizer()
    for itn in range(iterations):
        print("Starting iterations " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            print(isinstance(annotations["entities"], (list, tuple)))  # this prints True
            example = Example.from_dict(doc, annotations)
            losses = nlp.update(
                [example],
                drop=0.2,
                sgd=optimizer
            )
        print(losses)
    return nlp

if __name__ == "__main__":
    # mock training set
    TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
                  ('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
                  ('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
    nlp = train_spacy(TRAIN_DATA)
Notice the following changes:
I changed your import of the Example class to from spacy.training import Example. I think you were importing the wrong class.
I'm using en_core_web_lg, but it should work with a blank model too!
I commented out the disabling of the other pipeline components, because in spaCy 3.x the pipeline is more complex and I don't think you can disable the whole pipeline for the NER task. However, feel free to read the official documentation and check whether some of the other components can be left out.
The optimizer is now initialized using nlp.create_optimizer() instead of nlp.begin_training().
Note that annotations is already a dictionary in the expected format, so you don't need to wrap it in a new dictionary: Example.from_dict(doc, annotations) should do the job.
Finally, the losses are now returned as the result of the model update instead of being passed in as a parameter.
I hope this helps you, and please ask questions if you need more help.
Best regards!
EDIT:
I also want to suggest some changes to your training script to take more advantage of spaCy's utils:
Use the spacy.util.minibatch util to create mini batches from your training data.
Pass a whole mini batch of examples to the update method instead of a mini batch of only one example.
Your code including these improvements, among other minor changes, would look as follows:
import random
import spacy
from spacy.training import Example

def train_spacy(data, iterations=30):
    TRAIN_DATA = data
    # nlp = spacy.blank("en")  # start with a blank model
    nlp = spacy.load("en_core_web_lg")
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    # Init loss
    losses = None
    # Init and configure optimizer
    optimizer = nlp.create_optimizer()
    optimizer.learn_rate = 0.001  # Change to whatever learning rate you prefer
    batch_size = 32  # Choose the batch size you prefer
    for itn in range(iterations):
        print("Starting iterations " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        # Batch the examples and iterate over them
        for batch in spacy.util.minibatch(TRAIN_DATA, size=batch_size):
            # Create an Example instance for each training example in the mini batch
            examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in batch]
            # Update the model with the mini batch
            losses = nlp.update(examples, drop=0.2, sgd=optimizer)
        print(losses)
    return nlp

if __name__ == "__main__":
    # mock training set
    TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
                  ('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
                  ('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
    nlp = train_spacy(TRAIN_DATA)
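As a quick sanity check you can run the returned pipeline on an unseen sentence and persist it to disk; a small usage sketch (the output path is a placeholder):

# run the trained pipeline on an unseen sentence and inspect the entities
doc = nlp("I saw a dog and a cat in the garden")
print([(ent.text, ent.label_) for ent in doc.ents])

# save the fine-tuned pipeline to disk and reload it later
nlp.to_disk("./animal_ner_model")
reloaded = spacy.load("./animal_ner_model")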

exporting TensorFlow model with no variables

I am exporting a Keras TF model without luck:
import tensorflow as tf
import numpy as np

ssValues = np.zeros(shape=(640, 800, 6), dtype=np.float16)
ssValues += 3.
ssKerasConstant = tf.keras.backend.constant(value=ssValues, dtype=tf.dtypes.float16, shape=(1, 640, 800, 6))
inputLayer = tf.keras.Input(shape=(640, 800, 6),
                            name='inputLayer',
                            batch_size=None,
                            dtype=tf.dtypes.float16)
ssConstant = tf.constant(ssValues, dtype=tf.dtypes.float16, shape=(1, 640, 800, 6), name='ss')
ssm = tf.keras.layers.Multiply()([inputLayer, ssKerasConstant])
model = tf.keras.models.Model(inputs=inputLayer, outputs=ssm)
tf.keras.experimental.export_saved_model(model, '~/models/model7.pb')
and I get the following error:
    graph = inputs[0].graph
IndexError: list index out of range
even though I am able to run predictions with the model.
You can save the model successfully by replacing the last line of your code,
tf.keras.experimental.export_saved_model(model, '~/models/model7.pb')
with the line below:
tf.saved_model.save(model, '~/models/model7.pb')
It works in TensorFlow 2.0. Please find the Gist here.
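To double-check the export, the SavedModel can be loaded back and queried through its default signature. This is a hedged sketch: the input name 'inputLayer' is assumed to follow the Input layer's name in the model above, and the dummy batch just matches the declared shape and dtype:

import numpy as np
import tensorflow as tf

loaded = tf.saved_model.load('~/models/model7.pb')
infer = loaded.signatures['serving_default']  # default signature created by tf.saved_model.save

# dummy batch matching the (1, 640, 800, 6) float16 input
dummy = tf.constant(np.full((1, 640, 800, 6), 3.0, dtype=np.float16))
print(infer(inputLayer=dummy))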

How to load pickle files by tensorflow's tf.data API

I have my data in multiple pickle files stored on disk. I want to use tensorflow's tf.data.Dataset to load my data into the training pipeline. My code goes:
def _parse_file(path):
    image, label = *load pickle file*
    return image, label

paths = glob.glob('*.pkl')
print(len(paths))
dataset = tf.data.Dataset.from_tensor_slices(paths)
dataset = dataset.map(_parse_file)
iterator = dataset.make_one_shot_iterator()
The problem is that I don't know how to implement the _parse_file function. The argument to this function, path, is of tensor type. I tried
def _parse_file(path):
    with tf.Session() as s:
        p = s.run(path)
    image, label = pickle.load(open(p, 'rb'))
    return image, label
and got the error message:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'arg0' with dtype string
    [[Node: arg0 = Placeholder[dtype=DT_STRING, shape=<unknown>, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
After some searching on the Internet I still have no idea how to do it. I will be grateful to anyone providing me a hint.
I have solved this myself. I should use tf.py_func as in this doc.
This is how I solved this issue. I didn't use tf.py_func; check out the function load_encoding() below, which is what does the pickle reading. The FACELIB_DIR contains directories of pickled vggface2 encodings, each directory named after the person those face encodings belong to.
import tensorflow as tf
import pickle
import os

FACELIB_DIR = '/var/noggin/FaceEncodings'

# Get a list of all classes & build a quick int-lookup dictionary
labelNames = sorted([x for x in os.listdir(FACELIB_DIR) if os.path.isdir(os.path.join(FACELIB_DIR, x)) and not x.startswith('.')])
labelStrToInt = dict([(x, i) for i, x in enumerate(labelNames)])

# Function load_encoding - loads encoding data from an enc2048 file in file_path.
# This reads an encoding from disk and, through the file path, gets the label's int value; returns both.
def load_encoding(file_path):
    with open(os.path.join(FACELIB_DIR, file_path), 'rb') as fin:
        A, _ = pickle.loads(fin.read())  # encodings, source_image_name
    label_str = tf.strings.split(file_path, os.path.sep)[-2]
    return (A, labelStrToInt[label_str])

# Build the dataset of every enc2048 file in our data library
encpaths = []
for D in sorted([x for x in os.listdir(FACELIB_DIR) if os.path.isdir(os.path.join(FACELIB_DIR, x)) and not x.startswith('.')]):
    # All the encoding files
    encfiles = sorted(filter((lambda x: x.endswith('.enc2048')), os.listdir(os.path.join(FACELIB_DIR, D))))
    encpaths += [os.path.join(D, x) for x in encfiles]

dataset = tf.data.Dataset.from_tensor_slices(encpaths)

# Shuffle and speed improvements on the dataset
BATCH_SIZE = 64
from tensorflow.data import AUTOTUNE
dataset = (dataset
           .shuffle(1024)
           .cache()
           .repeat()
           .batch(BATCH_SIZE)
           .prefetch(AUTOTUNE)
           )

# Benchmark our tf.data pipeline
import time

datasetGen = iter(dataset)
NUM_STEPS = 10000
start_time = time.time()
for i in range(0, NUM_STEPS):
    X = next(datasetGen)
totalTime = time.time() - start_time
print('==> tf.data generated {} tensors in {:.2f} seconds'.format(BATCH_SIZE * NUM_STEPS, totalTime))
tf.py_func
This function can be used to solve that problem, as also mentioned in the doc.
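For completeness, here is a minimal sketch of wiring a pickle loader into dataset.map with tf.py_function. It is an illustration rather than the exact code of either answer: load_example, the .pkl layout (an (image, label) tuple), and the output shapes/dtypes are assumptions you would adapt to your data.

import glob
import pickle
import numpy as np
import tensorflow as tf

def load_example(path):
    # `path` arrives as a scalar string tensor; .numpy() yields the raw bytes
    with open(path.numpy().decode('utf8'), 'rb') as f:
        image, label = pickle.load(f)
    return np.asarray(image, dtype=np.float32), np.int64(label)

def _parse_file(path):
    image, label = tf.py_function(load_example, [path], [tf.float32, tf.int64])
    # py_function loses static shape information, so restore it if known
    image.set_shape([None, None, 3])  # adjust to your image shape
    label.set_shape([])
    return image, label

paths = glob.glob('*.pkl')
dataset = tf.data.Dataset.from_tensor_slices(paths).map(_parse_file)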