TensorFlow 2 TextVectorization error when processing a tensor and dataset

I would like to process text with TensorFlow 2.8 in a Jupyter notebook.
My code:
import re
import string
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_text as tf_text

def standardize(input_data):
    lowercase_str = tf.strings.lower(input_data)
    a_str = tf.strings.regex_replace(lowercase_str, f"[{re.escape(string.punctuation)}]", "")
    tokenizer = tf_text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(a_str)
    return tokens
# The input data is loaded from text files by TFRecordDataset(file_paths, "GZIP").
# Each file can be 200+ MB; there are about 300 files in total.
# Each file holds data with multiple columns, and some columns are text.
# After loading, the dataset is accessed by column name,
# e.g. one column is "sports", so input_dataset["sports"]
# returns a tensor like the following example:
my_data_tensor = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]])
tf.print(my_data_tensor)
tf.print(my_data_tensor.shape)
tf.print(f"type is {type(my_data_tensor)}")

text_layer = layers.TextVectorization(
    standardize=standardize,
    max_tokens=10,
    output_mode='int',
    output_sequence_length=10
)

my_dataset = tf.data.Dataset.from_tensor_slices(my_data_tensor)
text_layer.adapt(my_dataset.batch(2))  # error
processed_text = text_layer(my_dataset)
error:
ValueError: Exception encountered when calling layer "query_tower" (type QueryTower).
When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(2, 1, None) with rank=3
I have tried tf.unstack(), tf.reshape(), and tf.unbatch(), but none of them work.
For the given example:
[["SWIM 2008-07 Baseball"], ["Football"]]
what I need is:
[["swim 200807 baseball"], ["football"]]
which will then be encoded as integers by the "text_layer".
These data (batch_size=2) will be used as features for a machine learning model.
Did I do something wrong? Thanks.
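A quick check (added here for illustration, not part of the original post) shows where the rank-3 shape in the error comes from: tokenizing inside standardize turns the (2, 1) string input into a ragged (2, 1, None) token tensor, which is exactly what TextVectorization rejects.

sample = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]])
print(standardize(sample).shape)  # TensorShape([2, 1, None]): the tokenizer adds a ragged token axis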

You could try something like this:
import re
import string
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_text as tf_text

def standardize(input_data):
    lowercase_str = tf.strings.lower(input_data)
    a_str = tf.strings.regex_replace(lowercase_str, f"[{re.escape(string.punctuation)}]", "")
    tokenizer = tf_text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(a_str)
    return tokens
# The input data is loaded from text files by TFRecordDataset(file_paths, "GZIP").
# Each file can be 200+ MB; there are about 300 files in total.
# Each file holds data with multiple columns, and some columns are text.
# After loading, the dataset is accessed by column name,
# e.g. one column is "sports", so input_dataset["sports"]
# returns a tensor like the following example:
my_data_tensor = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]])
tf.print(my_data_tensor)
tf.print(my_data_tensor.shape)
tf.print(f"type is {type(my_data_tensor)}")

text_layer = layers.TextVectorization(
    standardize=standardize,
    max_tokens=10,
    output_mode='int',
    output_sequence_length=10
)

my_dataset = tf.data.Dataset.from_tensor_slices(my_data_tensor)
# Flatten the (1,)-shaped elements into a dataset of scalar strings.
my_dataset = tf.data.Dataset.from_tensor_slices((tf.concat(list(my_dataset.map(lambda x: x)), axis=0)))
text_layer.adapt(my_dataset)
my_dataset = my_dataset.batch(2)
processed_text = my_dataset.map(lambda x: text_layer(x))  # batched elements already have shape (batch,), so no squeeze is needed
for p in processed_text:
    print(p)
[["SWIM 2008-07 Baseball"]
["Football"]]
TensorShape([2, 1])
type is <class 'tensorflow.python.framework.ops.EagerTensor'>
(<tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[2, 5, 6, 4, 0, 0, 0, 0, 0, 0],
[3, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, <tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 0], dtype=int32)>)
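As a side note (not part of the answer above), the rank error can also be avoided without any reshaping by keeping standardize a shape-preserving string transform and letting the layer's built-in whitespace split do the tokenization. A minimal sketch, assuming the default whitespace splitting is acceptable for this data:

import re
import string
import tensorflow as tf
from tensorflow.keras import layers

def standardize_strings(input_data):
    # Return strings of the same shape as the input; do NOT tokenize here.
    lowercase_str = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase_str, f"[{re.escape(string.punctuation)}]", "")

text_layer = layers.TextVectorization(
    standardize=standardize_strings,  # shape-preserving string transform
    split="whitespace",               # the default; tokenization happens here
    max_tokens=10,
    output_mode='int',
    output_sequence_length=10
)

my_data_tensor = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]])
my_dataset = tf.data.Dataset.from_tensor_slices(my_data_tensor)
text_layer.adapt(my_dataset.batch(2))        # no rank error: the last dimension is 1
processed_text = text_layer(my_data_tensor)  # expected shape (2, 10), dtype int64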

The standardize callable returns the target text values; you may adapt this to use bytes_split for the target values instead.
[ Sample ]:
import tensorflow as tf
import tensorflow_text as tf_text
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Functions
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def standardize(input_data):
    input_data = tf.strings.lower(input_data)
    input_data = tf.strings.regex_replace(input_data, "<[^>]+>", " ")
    return input_data
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Variables
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)
text_layer = tf.keras.layers.TextVectorization( standardize = standardize, max_tokens = 10, output_mode = 'int', output_sequence_length=10 )
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Working
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
print("")
print("")
print("")
dataset = tf.data.Dataset.from_tensors( standardize(input_data) )
dataset = dataset.batch(2)
process_text = text_layer.adapt(dataset)
print( "standardize: " + str(standardize(input_data)) )
print( "process_text: " + str(process_text) )
[ Output ]:
standardize: tf.Tensor(
[[b'swim 2008-07 baseball']
[b'football']], shape=(2, 1), dtype=string)
process_text: None
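Note that adapt() only builds the vocabulary and returns None, which is why process_text is None above; the integer encoding comes from calling the layer on the data afterwards. A short continuation of the snippet above (the squeeze only drops the trailing size-1 dimension):

encoded = text_layer(tf.squeeze(input_data, axis=-1))
print(encoded)  # expected: a (2, 10) tensor of token ids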

Related

Tensorflow time-series classification using parquet files

I am currently receiving one of the following errors (depending on the sequence of data prep):
TypeError: Inputs to a layer should be tensors. Got: <tensorflow.python.data.ops.dataset_ops._NestedVariant object at 0x000001E02F62FB00>
TypeError: Inputs to a layer should be tensors. Got: <_VariantDataset shapes: OrderedDict
Background: I have some parquet files, where each file is a multi-variate time-series. Since I am using the files for a multivariate time-series classification problem, I am storing the labels in a single numpy array. I need to use tf.data.Dataset for reading the files, since I cannot fit them all in memory.
Here is a minimal example that reproduces my error:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Masking, LSTM, Dropout, Dense, Input
#!pip install tensorflow-io
import tensorflow_io as tfio

num_files = 10
num_features = 3
num_timesteps = 50
num_classes = 2
batch_size = 2

for i in range(num_files):
    df = pd.DataFrame({"A": np.random.rand(num_timesteps),
                       "B": np.random.rand(num_timesteps),
                       "C": np.random.rand(num_timesteps)})
    df.to_parquet("file_{}.parquet".format(i))

columns_init = {"A": tf.TensorSpec(tf.TensorShape([]), tf.float32),
                "B": tf.TensorSpec(tf.TensorShape([]), tf.float32),
                "C": tf.TensorSpec(tf.TensorShape([]), tf.float32)}

labels = np.array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0])

train_split_size = 0.8
num_train_files = int(train_split_size * num_files)
train_names = ["file_{}.parquet".format(i) for i in range(num_train_files)]
val_names = ["file_{}.parquet".format(i) for i in range(num_train_files, num_files)]
y_train = labels[:num_train_files]
y_val = labels[num_train_files:num_files]

def map_fn(file_names, label_ds):
    return tfio.IODataset.from_parquet(file_names, columns=columns_init), label_ds

train_ds = tf.data.Dataset.from_tensor_slices((train_names, y_train))
train_ds = train_ds.shuffle(buffer_size=num_train_files)
train_ds = train_ds.map(map_fn)
train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(batch_size)

val_ds = tf.data.Dataset.from_tensor_slices((val_names, y_val))
# No need to shuffle the validation set
val_ds = val_ds.map(map_fn)
val_ds = val_ds.batch(batch_size)
val_ds = val_ds.prefetch(batch_size)

ip = Input(shape=(num_timesteps, num_features))
x = Masking()(ip)
x = LSTM(8)(x)
x = Dropout(0.8)(x)
out = Dense(1, activation='softmax')(x)

model = Model(ip, out)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=["accuracy"])
model.fit(train_ds, epochs=10, validation_data=val_ds)
How do I overcome this error? I would prefer to keep my files separate and shuffle only how they are batched, since I don't want to meddle with the time-series sequences within the files. Is there a similar solution for .csv files instead of .parquet? I prefer parquet files because they are lighter and easier to read, but I am happy to convert my files if there is no workaround.
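On the .csv part of the question: a per-file reader analogous to tfio.IODataset.from_parquet can be built with tf.data.experimental.CsvDataset. This is only a hedged sketch (read_csv_series is a helper name introduced here, not from the question or the answer below), assuming each CSV file has a header row and the same three float columns:

import tensorflow as tf

def read_csv_series(file_name, num_features=3):
    # One CSV file -> a dataset of (num_features,) float rows,
    # mirroring the per-file parquet reader used in this question.
    ds = tf.data.experimental.CsvDataset(
        file_name,
        record_defaults=[tf.float32] * num_features,
        header=True)
    # Each element is a tuple of scalars; stack them into one feature vector.
    return ds.map(lambda *cols: tf.stack(cols))

# Usage sketch: batch by the series length to get one (timesteps, features)
# tensor per file, as in the parquet workaround below.
# series_ds = read_csv_series("file_0.csv").batch(50)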
For anyone experiencing a similar issue, I found a workaround, which was not straightforward. In this case, I defined a make_common_ds function for reading all the data from the files. I applied batching, where the batch size is equal to the time-series length, to split the observations as they were stored. (Note: this assumes that the files are already preprocessed and all the files have an equal number of rows.) After combining the features with the labels, the data is shuffled and batched according to the desired batch size. The final step uses the pack_features_vector function to change the format into tensor shapes that can be fed to the model.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Masking, LSTM, Dropout, Dense, Input
#!pip install tensorflow-io
import tensorflow_io as tfio

num_files = 10
num_features = 3
num_timesteps = 50
num_classes = 2
batch_size = 2

for i in range(num_files):
    df = pd.DataFrame({"A": np.random.rand(num_timesteps),
                       "B": np.random.rand(num_timesteps),
                       "C": np.random.rand(num_timesteps)})
    df.to_parquet("file_{}.parquet".format(i))

columns_init = {"A": tf.TensorSpec(tf.TensorShape([]), tf.float32),
                "B": tf.TensorSpec(tf.TensorShape([]), tf.float32),
                "C": tf.TensorSpec(tf.TensorShape([]), tf.float32)}

labels = np.array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0])

train_split_size = 0.8
num_train_files = int(train_split_size * num_files)
train_names = ["file_{}.parquet".format(i) for i in range(num_train_files)]
val_names = ["file_{}.parquet".format(i) for i in range(num_train_files, num_files)]
y_train = labels[:num_train_files]
y_val = labels[num_train_files:num_files]

def make_common_ds(files):
    common_ds = tfio.IODataset.from_parquet(files[0], columns=columns_init)
    for file_name in files[1:]:
        ds = tfio.IODataset.from_parquet(file_name, columns=columns_init)
        common_ds = common_ds.concatenate(ds)
    return common_ds

def pack_features_vector(features, labels):
    """Pack the features into a single array."""
    features = tf.stack(list(features.values()), axis=2)
    return features, labels

train_names_ds = make_common_ds(train_names)
train_names_ds = train_names_ds.batch(num_timesteps)
train_label_ds = tf.data.Dataset.from_tensor_slices(y_train)
train_ds = tf.data.Dataset.zip((train_names_ds, train_label_ds))
train_ds = train_ds.shuffle(buffer_size=num_train_files)
train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(batch_size)
train_ds = train_ds.map(pack_features_vector)

val_names_ds = make_common_ds(val_names)
val_names_ds = val_names_ds.batch(num_timesteps)
val_label_ds = tf.data.Dataset.from_tensor_slices(y_val)
val_ds = tf.data.Dataset.zip((val_names_ds, val_label_ds))
# No need to shuffle the validation set
val_ds = val_ds.batch(batch_size)
val_ds = val_ds.prefetch(batch_size)
val_ds = val_ds.map(pack_features_vector)

ip = Input(shape=(num_timesteps, num_features))
x = Masking()(ip)
x = LSTM(8)(x)
x = Dropout(0.8)(x)
out = Dense(1, activation='softmax')(x)

model = Model(ip, out)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=["accuracy"])
model.fit(train_ds, epochs=10, validation_data=val_ds)
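A quick way to confirm the resulting pipeline has the shapes the model expects (a small check added here, not part of the original answer; it assumes train_ds as built above):

# Each batch should pair (batch_size, num_timesteps, num_features) features
# with (batch_size,) labels.
print(train_ds.element_spec)
for features, labels in train_ds.take(1):
    print(features.shape, labels.shape)  # expected: (2, 50, 3) (2,)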

Tensor format issue from converting PyTorch -> ONNX -> TensorFlow

I have an issue with a TensorFlow model that was converted from PyTorch -> ONNX -> TensorFlow. The converted TensorFlow model expects its input in the PyTorch format, that is (batch size, number of channels, height, width), rather than the TensorFlow format (batch size, height, width, number of channels). Therefore, I cannot use the model for further processing with Vitis AI.
So I would like to ask: is there any way to convert this PyTorch input format to the TensorFlow format using tools from ONNX, TensorFlow 1, or anything else?
My code is as below:
PyTorch -> ONNX
from hardnet import hardnet
import torch
import onnx

ckpt = torch.load('../hardnet.pth')
model_state_dict = ckpt['model_state_dict']
optimizer_state_dict = ckpt['optimizer_state_dict']

model = hardnet(11)
model.load_state_dict(model_state_dict)
model.eval()

dummy_input = torch.randn(1, 3, 1080, 1920)
input_names = ['input0']
output_names = ['output0']
output_file = 'hardnet.onnx'
torch.onnx.export(model, dummy_input, output_file, verbose=True,
                  input_names=input_names, output_names=output_names,
                  opset_version=11, keep_initializers_as_inputs=True)

onnx_model = onnx.load(output_file)
onnx.checker.check_model(onnx_model)
print('Passed Onnx')
ONNX -> TensorFlow 1 (using TensorFlow 1.15)
import cv2
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import onnx
from onnx_tf.backend import prepare

output_file = 'hardnet.onnx'
onnx_model = onnx.load(output_file)
output = prepare(onnx_model)
output.export_graph('hardnet.pb')

tf.compat.v1.disable_eager_execution()

def load_pb(path_to_pb: str):
    """From: https://stackoverflow.com/questions/51278213/what-is-the-use-of-a-pb-file-in-tensorflow-and-how-does-it-work
    """
    with tf.gfile.GFile(path_to_pb, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name='')
        return graph

graph = load_pb('hardnet.pb')
input = graph.get_tensor_by_name('input0:0')
output = graph.get_tensor_by_name('output0:0')

mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
img = cv2.imread('train_0.jpg', cv2.IMREAD_COLOR)
img = cv2.resize(img, (1920, 1080))
img = img / 255
img = img - mean
img = img / std
img = np.expand_dims(img, -1)
# To Pytorch format.
img = np.transpose(img, (3, 2, 0, 1))

with tf.Session(graph=graph) as sess:
    pred = sess.run(output, {input: img})
You could wrap your Pytorch model into another one that would do the transpose you want to have in TensorFlow. See the following example:
Let's say you have the following toy NN:
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.rnn = nn.LSTM(10, 20, 2)

    def forward(self, x):
        h0 = torch.zeros(2, 3, 20)
        c0 = torch.zeros(2, 3, 20)
        return self.rnn(x, (h0, c0))
The example PyTorch/TensorFlow input shapes would be:
>> pytorch_input = torch.randn(5, 3, 10)
>> tf_input = torch.transpose(pytorch_input, 1, 2)
>> print("PyTorch input shape: ", pytorch_input.shape)
>> print("TensorFlow input shape: ", tf_input.shape)
PyTorch input shape: torch.Size([5, 3, 10])
TensorFlow input shape: torch.Size([5, 10, 3])
Now, the wrapper which will first transpose input and then pass transposed input to some model:
class NetTensorFlowWrapper(nn.Module):
    def __init__(self, main_module: nn.Module):
        super(NetTensorFlowWrapper, self).__init__()
        self.main_module = main_module

    def forward(self, x):
        x = torch.transpose(x, 1, 2)
        return self.main_module(x)
Then, this is possible:
net = Net()
net_wrapper = NetTensorFlowWrapper(net)
net(pytorch_input)
net_wrapper(tf_input)
and then, when you finally save your models as before via torch.onnx.export and read their graphs via the onnx package (not torch.onnx), you will have:
for Net: input 5x3x10 and no transpose layer
graph torch-jit-export (
%input0[FLOAT, 5x3x10]
{
%76 = Shape(%input0)
%77 = Constant[value = <Scalar Tensor []>]()
for NetTensorFlowWrapper: input 5x10x3 and a transpose layer
graph torch-jit-export (
%input0[FLOAT, 5x10x3]
{
%9 = Transpose[perm = [0, 2, 1]](%input0)
%77 = Shape(%9)
%78 = Constant[value = <Scalar Tensor []>]()
...
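The wrapper is then exported exactly like the original model, only with a dummy input that is already in the TensorFlow-style layout. A sketch under that assumption, reusing the names from the example above:

import torch

# Export the wrapped model; the resulting ONNX graph contains the Transpose
# node, so downstream TensorFlow conversion sees the (5, 10, 3) input layout.
torch.onnx.export(
    net_wrapper,
    tf_input,                  # dummy input already transposed to (5, 10, 3)
    "net_wrapper.onnx",
    input_names=['input0'],
    output_names=['output0'],
    opset_version=11,
)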

How to get the number of rows, columns / dimensions of a tensorflow.data.Dataset?

Like pandas_df.shape, is there any way to do this for tensorflow.data.Dataset?
Thanks.
I'm not familiar with something built-in, but the shapes could be retrieved from Dataset._tensors attribute. Example:
import tensorflow as tf

def dataset_shapes(dataset):
    try:
        return [x.get_shape().as_list() for x in dataset._tensors]
    except TypeError:
        return dataset._tensors.get_shape().as_list()
And usage:
from sklearn.datasets import make_blobs

x_train, y_train = make_blobs(n_samples=10,
                              n_features=2,
                              centers=[[1, 1], [-1, -1]],
                              cluster_std=0.5)

dataset = tf.data.Dataset.from_tensor_slices(x_train)
print(dataset_shapes(dataset))  # [10, 2]

dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
print(dataset_shapes(dataset))  # [[10, 2], [10]]
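In TensorFlow 2.x, the same information is available through public APIs, so the private _tensors attribute can be avoided; a short sketch:

import numpy as np
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices((np.zeros((10, 2)), np.zeros(10)))

# Per-element shapes and dtypes:
print(dataset.element_spec)
# (TensorSpec(shape=(2,), dtype=tf.float64, name=None),
#  TensorSpec(shape=(), dtype=tf.float64, name=None))

# Number of elements (rows):
print(tf.data.experimental.cardinality(dataset).numpy())  # 10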
To add to Vlad's answer, just in case someone is trying this out for datasets downloaded via tfds, a possible way is to use the dataset information:
info.features['image'].shape # shape of 1 feature in dataset
info.features['label'].num_classes # number of classes
info.splits['train'].num_examples # number of training examples
E.g. tf_flowers:
import tensorflow as tf
import tensorflow_datasets as tfds
dataset, info = tfds.load("tf_flowers", with_info=True) # download data with info
image_size = info.features['image'].shape # (None, None, 3)
num_classes = info.features['label'].num_classes # 5
data_size = info.splits['train'].num_examples # 3670
E.g. fashion_mnist:
import tensorflow as tf
import tensorflow_datasets as tfds
dataset, info = tfds.load("fashion_mnist", with_info=True) # download data with info
image_size = info.features['image'].shape # (28, 28, 1)
num_classes = info.features['label'].num_classes # 10
data_splits = {k:v.num_examples for k,v in info.splits.items()} # {'test': 10000, 'train': 60000}
Hope this helps.

How to calculate the derivative value of multi-input models in Keras with the TensorFlow backend

My question is: I want to calculate the derivatives with respect to "time_input" and "dense_input". Before asking this question, I searched for a solution for calculating the Jacobian matrix with Keras backend functions.
After running it, I got this error:
File "\keras\backend\tensorflow_backend.py", line 2614, in _call
dtype=tensor.dtype.base_dtype.name))
AttributeError: 'list' object has no attribute 'dtype'
Here is my simple version:
from keras.models import *
from keras.layers import *
import keras.backend as K
import numpy as np
import pandas as pd
from keras import optimizers

def get_model(timestamp, features):
    time_input = Input(shape=(timestamp, features,), name='time_input')
    lstm_out = LSTM(4)(time_input)

    dense_hidden_units = 2
    dense_input_layer = Input(shape=(dense_length,), name='dense_input_layer')
    final_input_layer = concatenate([lstm_out, dense_input_layer])
    # Disable biases in the hidden layer
    dense_1 = Dense(units=dense_hidden_units, use_bias=False, activation='sigmoid')(final_input_layer)
    # Disable bias in output layer
    output_layer = Dense(units=1, use_bias=False, name='final_output')(dense_1)

    model = Model(
        inputs=[time_input, dense_input_layer],
        outputs=output_layer
    )
    print(model.summary())
    return model

if __name__ == '__main__':
    timestamp = 3
    features = 1
    dense_length = 3
    temp_data = pd.DataFrame([
        [1, 2, 3, 2, 3, 4],
    ])
    time_data = temp_data.values.reshape(-1, timestamp, features)
    dense_data = temp_data.values.reshape(-1, dense_length)
    target_data = np.array([1, 2])
    print(time_data.shape)
    print(dense_data.shape)
    print(target_data.shape)

    model = get_model(
        timestamp, features
    )
    Ada = optimizers.Adagrad(lr=0.09, epsilon=1e-04)
    model.compile(loss='mse', optimizer=Ada, metrics=['mse'])
    model.fit(
        {
            'time_input': time_data,
            'dense_input_layer': dense_data,
        },
        {
            'final_output': target_data
        },
        epochs=1, batch_size=1
    )

    time_input = model.get_layer('time_input').input
    GPP_input_layer = model.get_layer('dense_input_layer').input
    J = K.gradients(model.output, [time_input, GPP_input_layer])
    jacobianTime = K.function([[time_input, GPP_input_layer], K.learning_phase()], J)
    deriRes = jacobianTime([time_data, dense_data])  # this line throws an exception
    print(deriRes[0])
Thanks for help!
You have an extra set of brackets.
jacobianTime = K.function([[time_input, GPP_input_layer], K.learning_phase()], J)
to
jacobianTime = K.function([time_input, GPP_input_layer, K.learning_phase()], J)
I was able to run your code like this at least.
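For newer TensorFlow versions where K.gradients and K.function are not available in eager mode, the same derivatives can be computed with tf.GradientTape instead; this is a different technique from the fix above, sketched under the assumption that model, time_data, and dense_data are defined as in the question and that the model was built with tf.keras:

import tensorflow as tf

time_x = tf.convert_to_tensor(time_data, dtype=tf.float32)
dense_x = tf.convert_to_tensor(dense_data, dtype=tf.float32)

with tf.GradientTape() as tape:
    tape.watch([time_x, dense_x])
    y = model([time_x, dense_x], training=False)

# Gradients of the (summed) model output with respect to both inputs.
d_time, d_dense = tape.gradient(y, [time_x, dense_x])
print(d_time.shape, d_dense.shape)  # expected: (2, 3, 1) and (2, 3)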

Create a multi-hot SparseTensor from a categorical feature array column in a CSV in TensorFlow

This is a typical way of handling sparse features (such as some ID features) in a recommendation system. I'm looking for a convenient way to prepare the data for a TensorFlow pipeline.
I did a lot of searching, but have not found a good solution yet.
Below is the attempt that seems closest to what I need, but it is not working yet.
See the ####### part below.
The data file is like:
csv = [
    '1221,cc,1',
    '213,aa|cc|ff,1',
]
For the second row in particular, I need a multi-hot SparseTensor; for the two rows above it would look like:
  aa bb cc dd ee ff
|  0  0  1  0  0  0 |
|  1  0  1  0  0  1 |
The full version of code is:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import shutil
import sys

import tensorflow as tf  # pylint: disable=g-bad-import-order

_CSV_COLUMNS = ['a_id', 'b_id', 'tags', 'label']
_CSV_COLUMN_DEFAULTS = [[0], [0], [''], [0]]

def input_fn(data_file, num_epochs, shuffle, batch_size):
    """Generate an input function for the Estimator."""
    assert tf.gfile.Exists(data_file), (
        '%s not found. Please make sure you have run data_download.py and '
        'set the --data_dir argument to the correct path.' % data_file)

    """
    $ cat vocab.txt
    a
    b
    c
    d
    e
    f
    g
    h
    i
    j
    k
    l
    m
    n
    """
    table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file='vocab.txt', num_oov_buckets=1)

    def parse_csv(value):
        print('Parsing', data_file)
        columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
        features = dict(zip(_CSV_COLUMNS, columns))

        ######################## BEGIN ###########################
        # support multi-hot sparse features
        split_tags = tf.string_split([columns[2]], '|')  # hard-coded 'tags' column index
        # Output: tags.indices Tensor("StringSplit:0", shape=(?, 2), dtype=int64)
        print('tags.indices', split_tags.indices)

        indice_idx = tf.map_fn(lambda x: x[0], split_tags.indices)
        # Output: indice_idx Tensor("map/TensorArrayStack/TensorArrayGatherV3:0", shape=(?,), dtype=int64)
        print('indice_idx', indice_idx)

        value_idx = tf.map_fn(lambda x: x[1], split_tags.indices)
        value_arr = tf.cast(tf.gather(split_tags.values, value_idx), tf.int64)
        # Output: value_arr shape (?,)
        print('value_arr shape', value_arr.shape)

        # stack is doing: [1, 2, 3], [4, 5, 6] ==> [[1, 4], [2, 5], [3, 6]]
        new_indices = tf.stack([indice_idx, value_arr], axis=1)
        print('new_indices', new_indices)
        new_values = tf.ones_like(value_arr)
        # Output: new_values Tensor("ones_like:0", shape=(?,), dtype=int64)
        print('new_values', new_values)

        with tf.Session() as s1:
            s1.run([tf.global_variables_initializer(), tf.tables_initializer()])
            ##### FAIL here #####
            # InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'arg0' with dtype string
            #   [[Node: arg0 = Placeholder[dtype=DT_STRING, shape=<unknown>, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
            print(split_tags.values.eval())
            print(indice_idx.eval())
            print('value_arr', value_arr.eval())
            print('new_values', new_values.eval())

        categorial_tensor = tf.SparseTensor(
            indices=new_indices,
            values=new_values,
            dense_shape=[new_indices.shape[1], 4])
        ######################## END ###########################

        categorical_cols = {
            'tags': categorial_tensor}
        features.update(categorical_cols)
        labels = features.pop('label')
        return features, tf.equal(labels, 1)

    # Extract lines from input files using the Dataset API.
    dataset = tf.data.TextLineDataset(data_file)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=6)  # number of lines in the file

    dataset = dataset.map(parse_csv, num_parallel_calls=5)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset

"""
$ cat data.csv
1,2,a|c|g,1
0,1,c|f,0
0,2,b|g,1
0,1,b|v,0
0,1,g|j|k|l,1
0,1,a,0
"""
train_file = 'data.csv'
epochs_between_evals = 2
batch_size = 40
ds = input_fn(train_file, epochs_between_evals, True, batch_size)

with tf.Session() as s:
    s.run([tf.global_variables_initializer(), tf.tables_initializer()])
    print(s.run(ds))
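In TensorFlow 2.x, a multi-hot encoding of such a tag column can also be obtained without hand-building the SparseTensor, using tf.strings.split plus a StringLookup layer with output_mode='multi_hot'. A sketch under the assumption that TensorFlow >= 2.6 is available and the tag vocabulary fits in memory:

import tensorflow as tf

vocab = ["aa", "bb", "cc", "dd", "ee", "ff"]
lookup = tf.keras.layers.StringLookup(
    vocabulary=vocab, output_mode="multi_hot", num_oov_indices=1)

tags = tf.constant(["cc", "aa|cc|ff"])
tokens = tf.strings.split(tags, sep="|")  # RaggedTensor of tag tokens
multi_hot = lookup(tokens)
print(multi_hot)
# Column 0 is the OOV bucket; the remaining columns follow `vocab`:
# [[0. 0. 0. 1. 0. 0. 0.]
#  [0. 1. 0. 1. 0. 0. 1.]]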