ValueError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]] - Tokenizing BERT / Distilbert Error - tokenize

import pandas as pd
from sklearn.model_selection import train_test_split

def split_data(path):
    df = pd.read_csv(path)
    return train_test_split(df, test_size=0.1, random_state=100)

train, test = split_data(DATA_DIR)
train_texts, train_labels = train['text'].to_list(), train['sentiment'].to_list()
test_texts, test_labels = test['text'].to_list(), test['sentiment'].to_list()

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=100)

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
When I tried to split the dataframe and tokenize the texts with the BERT/DistilBERT tokenizer, I got the error shown above.

I had the same error. The problem was that I had None values in my list, e.g.:
import pandas as pd
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-german-cased')
# create test dataframe
texts = ['Vero Moda Damen Übergangsmantel Kurzmantel Chic Business Coatigan SALE',
'Neu Herren Damen Sportschuhe Sneaker Turnschuhe Freizeit 1975 Schuhe Gr. 36-46',
'KOMBI-ANGEBOT Zuckerpaste STRONG / SOFT / ZUBEHÖR -Sugaring Wachs Haarentfernung',
None]
labels = [1, 2, 3, 1]
d = {'texts': texts, 'labels': labels}
test_df = pd.DataFrame(d)
So, before converting the DataFrame columns to lists, I removed all rows containing None.
test_df = test_df.dropna()
texts = test_df["texts"].tolist()
texts_encodings = tokenizer(texts, truncation=True, padding=True)
This worked for me.
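Applied back to the question's dataframe, a minimal sketch of the same fix (assuming the train dataframe and tokenizer from the original post) would be:
# Drop rows with a missing 'text' before converting the columns to lists
train = train.dropna(subset=['text'])
train_texts, train_labels = train['text'].to_list(), train['sentiment'].to_list()
train_encodings = tokenizer(train_texts, truncation=True, padding=True)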

In my case I had to set is_split_into_words=True
https://huggingface.co/transformers/main_classes/tokenizer.html
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set is_split_into_words=True (to lift the ambiguity with a batch of sequences).
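For example, a minimal sketch with pretokenized input (the word lists below are made up for illustration and reuse the tokenizer loaded above):
# Each sample is already a list of words, so is_split_into_words=True is required
pretokenized = [['this', 'movie', 'was', 'great'], ['terrible', 'film']]
encodings = tokenizer(pretokenized, is_split_into_words=True, truncation=True, padding=True)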

Similar to MarkusOdenthal, I had a non-string value in my list. I fixed it by converting the column to string, then converting it to a list, before splitting it into train and test segments. So you would do
train_texts = train['text'].astype(str).tolist()
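If you first want to see which rows are the problem, a quick check (a sketch, assuming the train dataframe from the question) is:
# Hypothetical check: show the rows whose 'text' is not a plain string
non_strings = train[~train['text'].apply(lambda v: isinstance(v, str))]
print(non_strings)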

import pandas as pd
from sklearn.model_selection import train_test_split

def split_data(path):
    df = pd.read_csv(path)
    return train_test_split(df, test_size=0.2, random_state=100)

train, test = split_data(DATA_DIR)
train_texts, train_labels = train['text'].to_list(), train['sentiment'].to_list()
test_texts, test_labels = test['text'].to_list(), test['sentiment'].to_list()

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=100)

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
Try changing the size of the split; it worked for me, which suggests the original split did not leave enough data for the tokenizer to work with.

Related

Huggingface - Finetuning in Tensorflow with custom datasets

I have been battling with my own implementation on my dataset, using a different transformer model than the tutorial, and I have been getting the error AttributeError: 'NoneType' object has no attribute 'dtype' when I start to train my model. I have been trying to debug for hours, so I tried the tutorial from Hugging Face as it can be found here: https://huggingface.co/transformers/v3.2.0/custom_datasets.html. Running this exact code, so I could identify my mistake, also leads to the same error.
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

from pathlib import Path

def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels

train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))

from transformers import TFDistilBertForSequenceClassification
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss)  # can also use any keras loss fn
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)
My goal is to perform multi-label text classification on my own custom dataset, which unfortunately I cannot share for privacy reasons. If anyone could point out what is wrong with this implementation, it would be highly appreciated.
There seems to be an error in how you are passing the loss parameter.
model.compile(optimizer=optimizer, loss=model.compute_loss) # can also use any keras loss fn
You don't need to pass the loss parameter if you want to use the model's built-in loss function.
I was able to train the model with your provided source code by changing mentioned line to:
model.compile(optimizer=optimizer)
or by passing a loss function
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn)
transformers version: 4.20.1
Hope it helps.

Filling a dataframe with a list to get the max_leaf_nodes with the lowest mean_absolute_error

I made a simple DecisionTreeRegressor and want to get the best max_leaf_nodes value.
Code:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split as TTS

# split the data in 2 parts: training data and validation data
train_X, val_X, train_y, val_y = TTS(X, y, random_state=0)

# define and fit the model with the training data
model = DecisionTreeRegressor(random_state=1)
model.fit(train_X, train_y)

# predict
val_prediction = model.predict(val_X)

# check predictions
print(MAE(val_prediction, val_y))

# defining the get_mae function
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = MAE(val_y, preds_val)
    return mae

# DataFrame and list
df_mae = pd.DataFrame(columns=["MAE"])
li = []

# collecting MAEs depending on max_leaf_nodes values
for max_leaf_nodes in range(2, 10000, 2):
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    li.append(mae)
How can I add the values of li to the "MAE" column of df_mae?
Is there a better way to find a good max_leaf_nodes? (My laptop worked on that for-loop for 25 minutes.)
You could append a row directly in the dataframe, instead of creating a list first.
df_mae = df_mae.append({'MAE': mae}, ignore_index = True)
However, if you prefer to add the list instead of individual values (outside the for loop):
df_mae = df_mae.append(pd.DataFrame({'MAE': li}), ignore_index = True)
Please, be aware that you need to store the max_leaf_nodes as well, otherwise your resulting dataframe won't be meaningful.
df_mae = pd.DataFrame(columns=["MAE", "max_leaf_nodes"])
li = []
max_leaf_nodes_list = []
for max_leaf_nodes in range(2, 10000, 2):
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    li.append(mae)
    max_leaf_nodes_list.append(max_leaf_nodes)
df_mae = df_mae.append(pd.DataFrame({'MAE': li, 'max_leaf_nodes': max_leaf_nodes_list}), ignore_index=True)
or, appending the values into the dataframe directly:
df_mae = pd.DataFrame(columns=["MAE", "max_leaf_nodes"])
for max_leaf_nodes in range(2, 10000, 2):
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    df_mae = df_mae.append({'MAE': mae, 'max_leaf_nodes': max_leaf_nodes}, ignore_index=True)
To reduce the execution time with this approach, I would increase the step from 2 to a bigger number on the range function. Once you find the interval which produces the best values, you can limit the interval to find an even better metric. In other words, searching the entire hyperparameter grid is not the best approach.
Alternatively, you could use other methods such as Hyperopt or Hyperopt-sklearn.
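Note that DataFrame.append was deprecated and removed in pandas 2.0, so on newer pandas versions a sketch of the same idea would build the frame from the collected lists, or concatenate row frames with pd.concat, instead:
# Build the dataframe from the lists collected in the loop
df_mae = pd.DataFrame({'MAE': li, 'max_leaf_nodes': max_leaf_nodes_list})
# or, inside the loop, replace df_mae.append(...) with pd.concat
df_mae = pd.concat([df_mae, pd.DataFrame([{'MAE': mae, 'max_leaf_nodes': max_leaf_nodes}])], ignore_index=True)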

Tensorflow time-series classification using parquet files

I am currently receiving one of the following errors (depending on the sequence of data prep):
TypeError: Inputs to a layer should be tensors. Got: <tensorflow.python.data.ops.dataset_ops._NestedVariant object at 0x000001E02F62FB00>
TypeError: Inputs to a layer should be tensors. Got: <_VariantDataset shapes: OrderedDict
Background: I have some parquet files, where each file is a multi-variate time-series. Since I am using the files for a multivariate time-series classification problem, I am storing the labels in a single numpy array. I need to use tf.data.Dataset for reading the files, since I cannot fit them all in memory.
Here is a working example that reproduces my error:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Masking, LSTM, Dropout, Dense
#!pip install tensorflow-io
import tensorflow_io as tfio

num_files = 10
num_features = 3
num_timesteps = 50
num_classes = 2
batch_size = 2

for i in range(num_files):
    df = pd.DataFrame({"A": np.random.rand(num_timesteps), "B": np.random.rand(num_timesteps), "C": np.random.rand(num_timesteps)})
    df.to_parquet("file_{}.parquet".format(i))

columns_init = {"A": tf.TensorSpec(tf.TensorShape([]), tf.float32), "B": tf.TensorSpec(tf.TensorShape([]), tf.float32), "C": tf.TensorSpec(tf.TensorShape([]), tf.float32)}

labels = np.array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0])

train_split_size = 0.8
num_train_files = int(train_split_size * num_files)
train_names = ["file_{}.parquet".format(i) for i in range(num_train_files)]
val_names = ["file_{}.parquet".format(i) for i in range(num_train_files, num_files)]
y_train = labels[ : num_train_files]
y_val = labels[num_train_files : num_files]

def map_fn(file_names, label_ds):
    return tfio.IODataset.from_parquet(file_names, columns=columns_init), label_ds

train_ds = tf.data.Dataset.from_tensor_slices((train_names, y_train))
train_ds = train_ds.shuffle(buffer_size = num_train_files)
train_ds = train_ds.map(map_fn)
train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(batch_size)

val_ds = tf.data.Dataset.from_tensor_slices((val_names, y_val))
# No need for shuffling the validation set
val_ds = val_ds.map(map_fn)
val_ds = val_ds.batch(batch_size)
val_ds = val_ds.prefetch(batch_size)

ip = Input(shape=(num_timesteps, num_features))
x = Masking()(ip)
x = LSTM(8)(x)
x = Dropout(0.8)(x)
out = Dense(1, activation='softmax')(x)
model = Model(ip, out)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=["accuracy"])

model.fit(train_ds, epochs=10, validation_data=val_ds)
How do I overcome this error? I would prefer to keep my files separate and shuffle only how they are batched, since I don't want to meddle with the time-series sequences within the files. Is there a similar solution for .csv files instead of .parquet? I prefer parquet files because they are lighter and easier to read, but I am happy to convert my files if there is no way around it.
For anyone experiencing a similar issue: I found a workaround, though it was not straightforward. I defined a make_common_ds function that reads all the data from the files. I applied batching with a batch size equal to the time-series length, so the observations are split exactly as they were stored. (Note: this assumes that the files are already preprocessed and all have an equal number of rows.) After combining the features with the labels, the data is shuffled and batched according to the desired batch size. The final step uses pack_features_vector to change the format into tensor shapes that can be fed to the model.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Masking, LSTM, Dropout, Dense, Input
#!pip install tensorflow-io
import tensorflow_io as tfio

num_files = 10
num_features = 3
num_timesteps = 50
num_classes = 2
batch_size = 2

for i in range(num_files):
    df = pd.DataFrame({"A": np.random.rand(num_timesteps),
                       "B": np.random.rand(num_timesteps),
                       "C": np.random.rand(num_timesteps)})
    df.to_parquet("file_{}.parquet".format(i))

columns_init = {"A": tf.TensorSpec(tf.TensorShape([]), tf.float32),
                "B": tf.TensorSpec(tf.TensorShape([]), tf.float32),
                "C": tf.TensorSpec(tf.TensorShape([]), tf.float32)}

labels = np.array([0, 1, 1, 1, 0, 1, 0, 0, 1, 0])

train_split_size = 0.8
num_train_files = int(train_split_size * num_files)
train_names = ["file_{}.parquet".format(i) for i in range(num_train_files)]
val_names = ["file_{}.parquet".format(i) for i in range(num_train_files, num_files)]
y_train = labels[ : num_train_files]
y_val = labels[num_train_files : num_files]

def make_common_ds(files):
    common_ds = tfio.IODataset.from_parquet(files[0], columns=columns_init)
    for file_name in files[1:]:
        ds = tfio.IODataset.from_parquet(file_name, columns=columns_init)
        common_ds = common_ds.concatenate(ds)
    return common_ds

def pack_features_vector(features, labels):
    """Pack the features into a single array."""
    features = tf.stack(list(features.values()), axis=2)
    return features, labels

train_names_ds = make_common_ds(train_names)
train_names_ds = train_names_ds.batch(num_timesteps)
train_label_ds = tf.data.Dataset.from_tensor_slices(y_train)
train_ds = tf.data.Dataset.zip((train_names_ds, train_label_ds))
train_ds = train_ds.shuffle(buffer_size = num_train_files)
train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(batch_size)
train_ds = train_ds.map(pack_features_vector)

val_names_ds = make_common_ds(val_names)
val_names_ds = val_names_ds.batch(num_timesteps)
val_label_ds = tf.data.Dataset.from_tensor_slices(y_val)
val_ds = tf.data.Dataset.zip((val_names_ds, val_label_ds))
# No need to shuffle the validation set
val_ds = val_ds.batch(batch_size)
val_ds = val_ds.prefetch(batch_size)
val_ds = val_ds.map(pack_features_vector)

ip = Input(shape=(num_timesteps, num_features))
x = Masking()(ip)
x = LSTM(8)(x)
x = Dropout(0.8)(x)
out = Dense(1, activation='softmax')(x)
model = Model(ip, out)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=["accuracy"])

model.fit(train_ds, epochs=10, validation_data=val_ds)

How to print feature values after transformation inside tensorflow model

How can I see the values of the final features that are fed to my TensorFlow model during training? For example, in the case below I am multi-hot encoding my column 'x' and I want to see how the features go into the model.
This is very easy to do in sklearn, but being new to TensorFlow I don't understand how it is possible.
import tensorflow as tf
import pandas as pd

data = {'x': ['a c', 'a b', 'b c'], 'y': [1, 1, 0]}
df = pd.DataFrame(data)
Y = df['y']
X = df.drop('y', axis=1)

indicator_features = [tf.feature_column.indicator_column(
    categorical_column=tf.feature_column.categorical_column_with_vocabulary_list(
        key='x', vocabulary_list=['a', 'b', 'c']))]

model = tf.estimator.LinearClassifier(feature_columns=indicator_features,
                                      model_dir="/tmp/samplemodel")

training_input_fn = tf.estimator.inputs.pandas_input_fn(x=X,
                                                        y=Y,
                                                        batch_size=64,
                                                        shuffle=True,
                                                        num_epochs=None)

model.train(input_fn=training_input_fn, steps=1000)
I have been able to print the values by enabling eager execution in TensorFlow.
Posting my solution below; any other ideas are welcome as well.
import tensorflow as tf
import tensorflow.feature_column as fc
import pandas as pd

PATH = "/tmp/sample.csv"
tf.enable_eager_execution()

COLUMNS = ['education', 'label']
train_df = pd.read_csv(PATH, header=None, names=COLUMNS)
#train_df['education'] = train_df['education'].str.split(" ")

def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
    label = df[label_key]
    ed = tf.string_split(df['education'], " ")
    df['education'] = ed
    ds = tf.data.Dataset.from_tensor_slices((dict(df), label))
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds

ds = easy_input_function(train_df, label_key='label', num_epochs=5, shuffle=False, batch_size=5)

for feature_batch, label_batch in ds.take(1):
    print('Some feature keys:', list(feature_batch.keys())[:5])
    print()
    print('A batch of education :', feature_batch['education'])
    print()
    print('A batch of Labels:', label_batch)

print(feature_batch)

education_vocabulary_list = [
    'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
    'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
    '5th-6th', '10th', '1st-4th', 'Preschool', '12th']
education = tf.feature_column.categorical_column_with_vocabulary_list('education', vocabulary_list=education_vocabulary_list)
fc.input_layer(feature_batch, [fc.indicator_column(education)])

Tensorflow padded_batch for sparse tensor?

I have the following code:
import tensorflow as tf
import numpy as np

sequences = np.array([[1,3,4],[5,6,7,8],[9,10,11,12,13],[14,15]])

def generator():
    for el in sequences:
        yield el, np.random.randn(3,5).astype('float32')

def parser(dense_tensor, spectrogram):
    labels = tf.contrib.layers.dense_to_sparse(dense_tensor)
    return spectrogram, labels

dataset = tf.data.Dataset.from_generator(generator, output_types=(tf.int64, tf.float32), output_shapes=([None],[None,None]))
dataset = dataset.map(lambda den, spec: parser(den, spec)).batch(2)

iter = dataset.make_initializable_iterator()
spectrogram, labels = iter.get_next()

with tf.Session() as sess:
    sess.run(iter.initializer)
    while True:
        try:
            spar, spe = sess.run([labels, spectrogram])
            print(spar, spe.shape)
        except Exception as e:
            #print(e)
            break
I am using tf.data to get the labels and spectrograms for speech-to-text. The toy example above works when all speech signals have the same length, but for signals of different lengths within a batch I need padded_batch, and dense_to_sparse does not allow a padded batch. Is there any solution where I can use padded_batch with a sparse tensor?
import tensorflow as tf
import numpy as np

# sequences as defined in the question
sequences = np.array([[1,3,4],[5,6,7,8],[9,10,11,12,13],[14,15]])

def generator():
    for el in sequences:
        yield el, np.random.randn(np.random.randint(1,4),5).astype('float32')

def parser(dense_tensor, spectrogram):
    #labels = tf.contrib.layers.dense_to_sparse(dense_tensor, eos_token=100)
    labels = dense_tensor
    return spectrogram, labels

dataset = tf.data.Dataset.from_generator(generator, output_types=(tf.int64, tf.float32), output_shapes=([None],[None,None]))
# pad the dense labels with the sentinel value 100, then convert to sparse after batching
dataset = dataset.map(lambda den, spec: parser(den, spec)).padded_batch(2, ([None,None],[None]), padding_values=(0., tf.constant(100, dtype=tf.int64)))

iter = dataset.make_initializable_iterator()
spectrogram, labels = iter.get_next()
res = tf.contrib.layers.dense_to_sparse(labels, eos_token=100)
print(res)

with tf.Session() as sess:
    sess.run(iter.initializer)
    while True:
        try:
            spar, spe, res1 = sess.run([labels, spectrogram, res])
            print(res1, spar, spe)
        except Exception as e:
            #print(e)
            break