tensorflow dataset shuffle examples instead of batches

tensorflow dataset shuffle examples instead of batches - tensorflow

How do I get a tensorflow dataset in batch mode to shuffle across all the samples? It is only shuffling the batches.
Below is a program that makes a dataset of 1000 items and goes through 10 epochs of it in batches of 5. I have shuffle() turned on. I can see that tensorflow groups the dataset into 200 batches of 5 examples each, and the shuffle is across those batches. I want each new batch to be a random sample of the original 1000 examples, not a sample of the 200 original batches.
That is, this program:
import numpy as np
import tensorflow as tf
import random
def rec2tfrec_example(rec):
def _int64_feat(value):
arr_value = np.empty([1], dtype=np.int64)
arr_value[0] = value
return tf.train.Feature(int64_list=tf.train.Int64List(value=arr_value))
feat = {
'uid': _int64_feat(rec['uid']),
}
return tf.train.Example(features=tf.train.Features(feature=feat)).SerializeToString()
def parse_example(tfrec_serialized_string):
feat = {
'uid': tf.FixedLenFeature([], tf.int64),
}
return tf.parse_example(tfrec_serialized_string, feat)
def write_tfrecs_to_file(fname, recs):
recwriter = tf.python_io.TFRecordWriter(fname)
for rec in recs:
recwriter.write(bytes(rec))
recwriter.close()
def check_shuffle(sess, tfrec_output_filename, data, N, batch_size):
epochs = 10
dataset = tf.data.TFRecordDataset(tfrec_output_filename) \
.batch(batch_size) \
.repeat(epochs) \
.shuffle(2*N) \
.map(parse_example, num_parallel_calls=2)
tf_iter = dataset.make_initializable_iterator()
get_next = tf_iter.get_next()
sess.run(tf_iter.initializer)
num_batches = N//batch_size
for epoch in range(epochs ):
for batch in range(N//batch_size):
tfres = sess.run(get_next)
print("epoch=%4d batch=%d uid=%s" % (epoch, batch, tfres['uid']))
def main(N=1000, batch_size=5, tfrec_output_filename='tfrec_testing.tfrecords'):
tf.reset_default_graph()
data = [{'uid': uid } for uid in range(N)]
tfrec_strings = [rec2tfrec_example(rec) for rec in data]
write_tfrecs_to_file(tfrec_output_filename, tfrec_strings)
with tf.Session() as sess:
check_shuffle(sess, tfrec_output_filename, data, N, batch_size)
if __name__ == '__main__':
main()
produces output like:
epoch= 9 batch=186 uid=[685 686 687 688 689]
epoch= 9 batch=187 uid=[235 236 237 238 239]
epoch= 9 batch=188 uid=[520 521 522 523 524]
epoch= 9 batch=189 uid=[135 136 137 138 139]
epoch= 9 batch=190 uid=[95 96 97 98 99]
epoch= 9 batch=191 uid=[290 291 292 293 294]
epoch= 9 batch=192 uid=[230 231 232 233 234]
epoch= 9 batch=193 uid=[215 216 217 218 219]

ah, the order of batch and shuffle matters, if I set up the dataset like
dataset = tf.data.TFRecordDataset(tfrec_output_filename) \
.shuffle(2*N) \
.batch(batch_size) \
.repeat(epochs) \
.map(parse_example, num_parallel_calls=2)
with shuffle before batch, then it works.

Related

How efficiently filter a specific number of entries and concatenating them in a unique tf.data.Dataset?

I have a huge TFRecord file with more than 4M entries. It is a very unbalanced dataset containing many more entries of some labels and few others - compare to the whole dataset. I want to filter a limited number of entries of some of these labels in order to have a balanced dataset. Below, you can see my attempt, but it takes more than 24 hours to filter 1k from each label (33 different labels).
import tensorflow as tf
tf.compat.as_str(
bytes_or_text='str', encoding='utf-8'
)
try:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
print("Device:", tpu.master())
strategy = tf.distribute.TPUStrategy(tpu)
except:
strategy = tf.distribute.get_strategy()
print("Number of replicas:", strategy.num_replicas_in_sync)
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False
dataset = tf.data.TFRecordDataset('/test.tfrecord')
dataset = dataset.with_options(ignore_order)
features, feature_lists = detect_schema(dataset)
#Decodings TFRecord serialized data
def decode_data(serialized):
X, y = tf.io.parse_single_sequence_example(
serialized,
context_features=features,
sequence_features=feature_lists)
return X['title'], y['subject']
dataset = dataset.map(lambda x: tf.py_function(func=decode_data, inp=[x], Tout=(tf.string, tf.string)))
#Filtering and concatenating the samples
def balanced_dataset(dataset, labels_list, sample_size=1000):
datasets_list = []
for label in labels_list:
#Filtering the chosen labels
locals()[label] = dataset.filter(lambda x, y: tf.greater(tf.reduce_sum(tf.cast(tf.equal(tf.constant(label, dtype=tf.int64), y), tf.float32)), tf.constant(0.)))
#appending a limited sample
datasets_list.append(locals()[label].take(sample_size))
concat_dataset = datasets_list[0]
#concatenating the datasets
for dset in datasets_list[1:]:
concat_dataset = concat_dataset.concatenate(dset)
return concat_dataset
balanced_data = balanced_dataset(tabledataset, labels_list=list(decod_dic.values()), sample_size=1000)

One way to solve this is by using group_by_window method where the window_size would be the sample size of each class (in your case 1k).
ds = ds.group_by_window(
# Use label as key
key_func=lambda _, l: l,
# Convert each window to a sample_size
reduce_func=lambda _, window: window.batch(sample_size),
# Use window size as sample_size
window_size=sample_size)
This will form batches of single classes of size sample_size. But there is one problem, there will be multiple batches of same class, but you just need one of the batches in each class.
To solve the above problem, we need to add a count for each of the batches and then filter out count==0, which will fetch the first batch of all the classes.
Lets define an example:
labels = np.array(sum([[label]*repeat for label, repeat in zip([0, 1, 2], [100, 200, 15])], []))
features = np.arange(len(labels))
np.unique(labels, return_counts=True)
#(array([0, 1, 2]), array([100, 200, 15]))
# There are 3 labels chosen for simplicity and each of their counts are shown along.
sample_size = 15 # we choose to pick sample of 15 from each class
We create a dataset from the above inputs,
ds = tf.data.Dataset.from_tensor_slices((features, labels))
In the above window function we modify the reduce_func to make the counter, so the batch will have 3 elements (X_batch, y_batch, label_counter) :
def reduce_func(x, y):
#class_count[y] += 1
z = table.lookup(x)
table.insert(x, z+1)
return y.batch(sample_size).map(lambda a,b: (a, b, z))
# Group by window
ds = tf.data.Dataset.from_tensor_slices((features, labels))
ds = ds.group_by_window(
# Use label as key
key_func=lambda _, l: l,
# Convert each window to a sample_size
reduce_func=reduce_func,
# Use window size as sample_size
window_size=sample_size)
The counter logic in reduce_func is implemented as a table lookup where the counter needs to be updated and read from a lookup table. Its initialized as shown below:
n_classes = 3
keys = tf.range(0,n_classes, dtype=tf.int64)
vals = tf.zeros_like(keys, dtype=tf.int64)
table = tf.lookup.experimental.MutableHashTable(key_dtype=tf.int64,
value_dtype=tf.int64,
default_value=-1)
table.insert(keys, vals)
Now we filter out the batch where the count==0 and remove the count element to form (X, y) batch pairs:
ds = ds.filter(lambda x, y, count: count==0)
ds = ds.map(lambda x, y, count: (x, y))
Output,
for x, y in ds:
print(x.numpy(), y.numpy())
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[100 101 102 103 104 105 106 107 108 109 110 111 112 113 114] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[300 301 302 303 304 305 306 307 308 309 310 311 312 313 314] [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]

ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2

What is the solution to the following error in tensorflow.
ValueError: The two structures don't have the same sequence length.
Input structure has length 1, while shallow structure has length 2.
I tried tensorflow versions: 2.9.1 and 2.4.0.
The toy example is given to reproduce the error.
import tensorflow as tf
d1 = tf.data.Dataset.range(10)
d1 = d1.map(lambda x:tf.cast([x], tf.float32))
def func1(x):
y1 = 2.0 * x
y2 = -3.0 * x
return tuple([y1, y2])
d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
d3 = d2.padded_batch(3, padded_shapes=(None,))
for x, y in d2.as_numpy_iterator():
pass
The full error is:
ValueError Traceback (most recent call last)
~/Documents/pythonProject/tfProjects/asr/transformer/dataset.py in <module>
256 return tuple([y1, y2])
257 d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
---> 258 d3 = d2.padded_batch(3, padded_shapes=(None,))
259 for x, y in d2.as_numpy_iterator():
260 pass
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in padded_batch(self, batch_size, padded_shapes, padding_values, drop_remainder, name)
1887 padding_values,
1888 drop_remainder,
-> 1889 name=name)
1890
1891 def map(self,
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in __init__(self, input_dataset, batch_size, padded_shapes, padding_values, drop_remainder, name)
5171
5172 input_shapes = get_legacy_output_shapes(input_dataset)
-> 5173 flat_padded_shapes = nest.flatten_up_to(input_shapes, padded_shapes)
5174
5175 flat_padded_shapes_as_tensors = []
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py in flatten_up_to(shallow_tree, input_tree)
377 `input_tree`.
378 """
--> 379 assert_shallow_structure(shallow_tree, input_tree)
380 return list(_yield_flat_up_to(shallow_tree, input_tree))
381
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py in assert_shallow_structure(shallow_tree, input_tree, check_types)
290 if len(input_tree) != len(shallow_tree):
291 raise ValueError(
--> 292 "The two structures don't have the same sequence length. Input "
293 f"structure has length {len(input_tree)}, while shallow structure "
294 f"has length {len(shallow_tree)}.")
ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2.

The following modification in padded_shapes argument will resolve the error.
import tensorflow as tf
d1 = tf.data.Dataset.range(10)
d1 = d1.map(lambda x:tf.cast([x], tf.float32))
def func1(x):
y1 = 2.0 * x
y2 = -3.0 * x
return tuple([y1, y2])
d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
d3 = d2.padded_batch(3, padded_shapes=([None],[None]))
for x, y in d2.as_numpy_iterator():
pass

XGBoostError: problem with pipeline and scikit.learn

I am very new to Python and ML. I have been doing few courses from Kaggle and working on pipelines.Everything seemed to work fine without the pipelines but got XGBoostError when I piped it all. I have an issue with my code but I cannot figure it out. Here below is the code and the error after:
X_full = pd.read_csv(train_path).copy()
X_test = pd.read_csv(test_path).copy()
def cleaning(var):
q1, q3 = np.percentile(var['Fare'], [25, 75])
iqr = q3 - q1
lower_bound_val = q1 - (1.5 * iqr)
upper_bound_val = q3 + (1.5 * iqr)
var = var[(var['Fare'] >= lower_bound_val) & (var['Fare'] < upper_bound_val)].copy()
var['family_size'] = var.SibSp + var.Parch
drop_cols = ['PassengerId', 'Name', 'Parch', 'SibSp', 'Ticket', 'Cabin', 'Embarked']
var = var.drop(drop_cols, axis=1)
return var
get_cleaning = FunctionTransformer(cleaning, validate=False)
age_transformer = SimpleImputer(missing_values=np.nan, strategy='median')
age_col = ['Age']
sex_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
sex_col = ['Sex']
# Define the model
xgboost_m = XGBRegressor(random_state=0)
prepro_col = ColumnTransformer(
transformers=[
('age', age_transformer, age_col),
('sex', sex_transformer, sex_col)
])
pl = Pipeline(steps=[('get_cleaning', get_cleaning),
('prepro_col', prepro_col),
('XGBoost', xgboost_m)
])
# Drop assign target to y and drop from X_full
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)
# Split data
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)
pl.fit(X_train, y_train)
And here the error:
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-887-676d922c8ba5> in <module>
----> 1 pl.fit(X_train, y_train)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/conda/lib/python3.7/site-packages/xgboost/sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, callbacks)
546 obj=obj, feval=feval,
547 verbose_eval=verbose, xgb_model=xgb_model,
--> 548 callbacks=callbacks)
549
550 if evals_result:
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
210 evals=evals,
211 obj=obj, feval=feval,
--> 212 xgb_model=xgb_model, callbacks=callbacks)
213
214
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
73 # Skip the first update if it is a recovery step.
74 if version % 2 == 0:
---> 75 bst.update(dtrain, i, obj)
76 bst.save_rabit_checkpoint()
77 version += 1
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
1159 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1160 ctypes.c_int(iteration),
-> 1161 dtrain.handle))
1162 else:
1163 pred = self.predict(dtrain, output_margin=True, training=True)
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
186 """
187 if ret != 0:
--> 188 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
189
190
XGBoostError: [22:28:42] ../src/data/data.cc:530: Check failed: labels_.Size() == num_row_ (712 vs. 622) : Size of labels must equal to number of rows.
Stack trace:
[bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xa5dc4) [0x7f27232f2dc4]
[bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x106c92) [0x7f2723353c92]
[bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1a84b7) [0x7f27233f54b7]
[bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1aae4e) [0x7f27233f7e4e]
[bt] (4) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x55) [0x7f27232e4f35]
[bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f2783ff0630]
[bt] (6) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f2783feffed]
[bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f278323c60e]
[bt] (8) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x13044) [0x7f278323d044]

The error indicates that, labels_.Size() == num_row_ (712 vs. 622) , your have 622 rows and 712 label, that isn't equal. Check your dataset and try again. In your dataset y = X_full.Survived is label/ Target Output.

Use Text as feature column in Tensorflows existing Estimator

I try to build an Classification with the existing estimator to predict if an article will be sold or not.
I tried to use a linearClassifier, because I'm a beginner in Tensorflow and Pyhton.
I have a dataset with price, category and size, which is perfect for numeric or category feature columns. But I also have a description of the article, only 3-6 words per article and around 6500 different words as per my analysis.
I tried to use shared embed, with one category column per word, but this not work. And when I add all 6500 columns directly to the model it is very slow.
What is the best way and easiest way to handle the description? At best with code example. The word order doesn't matter, but for example if it's from a brand it will sell better than noname.
Many thanks for your answers
Edit: I tried with this post Tensorflow pad sequence feature column
But I now have the problem that tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)) don't work
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow.compat.v2.feature_column as fc
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from tensorflow.python.framework.ops import disable_eager_execution
import itertools
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence
dfall = pd.read_csv('./articles.csv')
# Build vacabulary
vocab_size = 6203
oov_tok = '<OOV>'
sentences = dfall['description'].to_list()
tokenizer = Tokenizer(num_words = vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
# if word_index shorter then default value of vocab_size we'll save actual size
vocab_size=len(word_index)
print("vocab_size = word_index = ",len(word_index))
# Split sentensec on tokens. here token = word
# text_to_word_sequence() has good default filter for
# charachters include basic punctuation, tabs, and newlines
dfall['description'] = dfall['description'].apply(text_to_word_sequence)
max_length = 9
# paddind and trancating setnences
# do that directly with strings without using tokenizer.texts_to_sequences()
# the feature_colunm will convert strings into numbers
dfall['description']=dfall['description'].apply(lambda x, N=max_length: (x + N * [''])[:N])
dfall['description']=dfall['description'].apply(lambda x, N=max_length: x[:N])
#dfall['description']=dfall['description'].apply(np.asarray)
dfall.head()
# Define method to create tf.data dataset from Pandas Dataframe
def df_to_dataset(dataframe, label_column, shuffle=True, batch_size=32):
dataframe = dataframe.copy()
#labels = dataframe.pop(label_column)
labels = dataframe[label_column]
ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
if shuffle:
ds = ds.shuffle(buffer_size=len(dataframe))
ds = ds.batch(batch_size)
return ds
# Split dataframe into train and validation sets
train_df, val_df = train_test_split(dfall, test_size=0.2)
print(len(train_df), 'train examples')
print(len(val_df), 'validation examples')
batch_size = 32
ds = df_to_dataset(dfall, 'sold',shuffle=False,batch_size=batch_size)
train_ds = df_to_dataset(train_df, 'sold', shuffle=False, batch_size=batch_size)
val_ds = df_to_dataset(val_df, 'sold', shuffle=False, batch_size=batch_size)
# and small batch for demo
example_batch = next(iter(ds))[0]
example_batch
# Helper methods to print exxample outputs of for defined feature_column
def demo(feature_column):
feature_layer = tf.keras.layers.DenseFeatures(feature_column)
print(feature_layer(example_batch).numpy())
def seqdemo(feature_column):
sequence_feature_layer = tf.keras.experimental.SequenceFeatures(feature_column)
print(sequence_feature_layer(example_batch))
dfall.head() is
sold description category_id size_id gender price host_id lat long year month
0 1 [dünne, jacke, gepunktet, , , , , , ] 9 25 f 3.5 1 48.21534 11.29949 2019 3
1 1 [kleid, pudel, dunkelblau, gepunktet, , , , , ] 9 25 f 4.0 1 48.21534 11.29949 2019 3
2 0 [kleid, rosa, hum, hund, katze, , , , ] 9 24 f 4.0 1 48.21534 11.29949 2019 3
3 1 [kleid, hum, blau, elsa, und, anna, , , ] 9 24 f 4.0 1 48.21534 11.29949 2019 3
4 0 [kleid, blue, seven, lachsfarben, , , , , ] 9 23 f 4.5 1 48.21534 11.29949 2019 3
The result is
vocab_size = word_index = 6203
12482 train examples
3121 validation examples
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\util\structure.py in normalize_element(element)
92 try:
---> 93 spec = type_spec_from_value(t, use_fallback=False)
94 except TypeError:
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\util\structure.py in type_spec_from_value(element, use_fallback)
464
--> 465 raise TypeError("Could not build a TypeSpec for %r with type %s" %
466 (element, type(element).__name__))
TypeError: Could not build a TypeSpec for 0 [dünne, jacke, gepunktet, , , , , , ]
1 [kleid, pudel, dunkelblau, gepunktet, , , , , ]
2 [kleid, rosa, hum, hund, katze, , , , ]
3 [kleid, hum, blau, elsa, und, anna, , , ]
4 [kleid, blue, seven, lachsfarben, , , , , ]
...
15598 [gartenschuhe, pink, , , , , , , ]
15599 [sandalen, grau, blume, superfit, , , , , ]
15600 [turnschuhe, converse, grau, , , , , , ]
15601 [strickjacke, rosa, , , , , , , ]
15602 [bikinihose, schmetterling, , , , , , , ]
Name: description, Length: 15603, dtype: object with type Series
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-1-420304a651bd> in <module>
71
72 batch_size = 32
---> 73 ds = df_to_dataset(dfall, 'sold',shuffle=False,batch_size=batch_size)
74
75 train_ds = df_to_dataset(train_df, 'sold', shuffle=False, batch_size=batch_size)
<ipython-input-1-420304a651bd> in df_to_dataset(dataframe, label_column, shuffle, batch_size)
58 labels = dataframe[label_column]
59
---> 60 ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
61 if shuffle:
62 ds = ds.shuffle(buffer_size=len(dataframe))
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in from_tensor_slices(tensors)
638 Dataset: A `Dataset`.
639 """
--> 640 return TensorSliceDataset(tensors)
641
642 class _GeneratorState(object):
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in __init__(self, element)
2856 def __init__(self, element):
2857 """See `Dataset.from_tensor_slices()` for details."""
-> 2858 element = structure.normalize_element(element)
2859 batched_spec = structure.type_spec_from_value(element)
2860 self._tensors = structure.to_batched_tensor_list(batched_spec, element)
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\util\structure.py in normalize_element(element)
96 # the value. As a fallback try converting the value to a tensor.
97 normalized_components.append(
---> 98 ops.convert_to_tensor(t, name="component_%d" % i))
99 else:
100 if isinstance(spec, sparse_tensor.SparseTensorSpec):
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1339
1340 if ret is None:
-> 1341 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
1342
1343 if ret is NotImplemented:
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in _constant_tensor_conversion_function(v, dtype, name, as_ref)
319 as_ref=False):
320 _ = as_ref
--> 321 return constant(v, dtype=dtype, name=name)
322
323
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in constant(value, dtype, shape, name)
259 ValueError: if called on a symbolic tensor.
260 """
--> 261 return _constant_impl(value, dtype, shape, name, verify_shape=False,
262 allow_broadcast=True)
263
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
268 ctx = context.context()
269 if ctx.executing_eagerly():
--> 270 t = convert_to_eager_tensor(value, ctx, dtype)
271 if shape is None:
272 return t
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
94 dtype = dtypes.as_dtype(dtype).as_datatype_enum
95 ctx.ensure_initialized()
---> 96 return ops.EagerTensor(value, ctx.device_name, dtype)
97
98
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
I already tried to use
dfall['description']=dfall['description'].apply(np.asarray)
but then I got
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).
For all have same problem the solution is
tf.data.Dataset.from_tensor_slices((dataframe .to_dict(orient='list'), labels))

Unless there is a good reason to use Tensorflow, I would advise to start with a simple model first. Use scikit-learn and follow their tutorial on working with text data. This will show you techniques like Bag of words (BoW) embeddings or TF-IDF embeddings.
For your particular problem, one thing really interesting to try is the following: you embed your article description using BoW or TF-IDF, and you embed the rest of your features as you would for regular tabular data. And then you concatenate the embeddings and feed that to a linear classifier in scikit-learn.

In keras, how can you clone a model with custom objects?

I have a model with a custom activation. As a result,
model2 = keras.models.clone_model(model)
gives an error. I'm able to load saved models using custom_objects keyword, but I see no such option on clone_model. Is there a way around it besides remaking the model and transferring weights?
EDIT:
Here's example code (toy problem):
import tensorflow.keras as keras
import tensorflow.keras.backend as K
def myTanh(x):
return K.tanh(x)
inp = keras.Input(shape=(10,10,1))
flat = keras.layers.Flatten()(inp)
out = keras.layers.Dense(20, activation=myTanh)(flat)
model = keras.Model(inp,out)
model.compile(optimizer=keras.optimizers.Adam(lr=0.001),loss='categorical_crossentropy')
model2 = keras.models.clone_model(model)
And the error dump:
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/models.py in clone_model(model, input_tensors)
269 return _clone_sequential_model(model, input_tensors=input_tensors)
270 else:
--> 271 return _clone_functional_model(model, input_tensors=input_tensors)
272
273
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/models.py in _clone_functional_model(model, input_tensors)
129 if layer not in layer_map:
130 # Clone layer.
--> 131 new_layer = layer.__class__.from_config(layer.get_config())
132 layer_map[layer] = new_layer
133 layer = new_layer
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py in from_config(cls, config)
400 A layer instance.
401 """
--> 402 return cls(**config)
403
404 def compute_output_shape(self, input_shape):
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/layers/core.py in __init__(self, units, activation, use_bias, kernel_initializer, bias_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, kernel_constraint, bias_constraint, **kwargs)
920 activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
921 self.units = int(units)
--> 922 self.activation = activations.get(activation)
923 self.use_bias = use_bias
924 self.kernel_initializer = initializers.get(kernel_initializer)
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/activations.py in get(identifier)
209 if isinstance(identifier, six.string_types):
210 identifier = str(identifier)
--> 211 return deserialize(identifier)
212 elif callable(identifier):
213 return identifier
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/activations.py in deserialize(name, custom_objects)
200 module_objects=globals(),
201 custom_objects=custom_objects,
--> 202 printable_module_name='activation function')
203
204
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/utils/generic_utils.py in deserialize_keras_object(identifier, module_objects, custom_objects, printable_module_name)
210 if fn is None:
211 raise ValueError('Unknown ' + printable_module_name + ':' +
--> 212 function_name)
213 return fn
214 else:
ValueError: Unknown activation function:myTanh

I solved the issue by calling
keras.utils.get_custom_objects().update(custom_objects)
Right after the definition of the additional objects that keras must be aware of to properly clone the model.
def lrelu(x, alpha=0.2):
return tf.nn.relu(x) * (1 - alpha) + x * alpha
custom_object = {
'lrelu': lrelu,
}
keras.utils.get_custom_objects().update(custom_objects)

This is an open bug in Keras.
The suggested way around is to use a Lambda layer in stead of an Activation layer.
x = keras.layers.Lambda(my_custom_activation_function)(x)

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

tensorflow dataset shuffle examples instead of batches - tensorflow

ah, the order of batch and shuffle matters, if I set up the dataset like dataset = tf.data.TFRecordDataset(tfrec_output_filename) \ .shuffle(2*N) \ .batch(batch_size) \ .repeat(epochs) \ .map(parse_example, num_parallel_calls=2) with shuffle before batch, then it works.

Related

How efficiently filter a specific number of entries and concatenating them in a unique tf.data.Dataset?

ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2

XGBoostError: problem with pipeline and scikit.learn

Use Text as feature column in Tensorflows existing Estimator

In keras, how can you clone a model with custom objects?

Categories

Resources