Email classification using word2vec - tensorflow

My goal is to classify email, so each email have to correspond to a specific category like sport, clothes and so on.
My idea is to use a Recurrent Neural Network with one layer based on embedding (by word2vec). I'm using the followed schema "emojify v2" with some changing:
https://github.com/enggen/Deep-Learning-Coursera/blob/master/Sequence%20Models/Week2/Emojify/Emojify%20-%20v2.ipynb
and the word2vec pretrained model downloaded from:
http://hlt.isti.cnr.it/wordembeddings/
The problem is that this neural network does not work properly. In fact, trying to train the network on only 2 categories (20 sample). I get an accuracy of 50-60% on the training set that is of course too low.
Do you have any hint to improve my result?
What could be the problem?
a low number of samples?
a low number of nodes?
using loss='categorical_crossentropy'?
using optimizer='adam'?
the text of the email is too long?
the lenght of each email is different. This could be a problem?
I'm using keras (tensorflow).
edit1: insert code
import numpy as np
import emoji
import matplotlib.pyplot as plt
import numpy as np
# utiliti che serviranno in seguito
import csv
import numpy as np
import emoji
import pandas as pd
from sklearn.metrics import confusion_matrix
def read_glove_vecs(glove_file):
with open(glove_file, 'r') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
curr_word = line[0]
words.add(curr_word)
word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
def get_categories(filename):
categories_code = []
categories_name = []
with open(filename, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
for row in reader:
category_code = row[1];
if (search(category_code, categories_code) == False):
categories_code.append(row[1])
categories_name.append(row[2])
Y = np.asarray(categories_code)
Z = np.asarray(categories_name)
return Y, Z
def search(category, categories):
for c in categories:
if (c == category):
return True
return False
def softmax(x):
"""Compute softmax values for each sets of scores in x."""
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
def read_csv_for_email(filename):
name = []
text = []
category = []
category_code = []
with open(filename, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
for row in reader:
name.append(row[0])
category_code.append(row[1])
category.append(row[2])
text.append(row[3])
X = np.asarray(name)
Y = np.asarray(category_code)
Z = np.asarray(category)
W = np.asarray(text)
return X, Y, Z, W
def convert_to_one_hot(Y, C):
Y = np.eye(C)[Y.reshape(-1)]
return Y
# converto in one hot vector
def my_convert_to_one_hot(Y, values):
#print(Y)
#print(values)
col = len(values)
row = len(Y)
#print(col)
#print(row)
R = np.zeros((row, col))
for res in range(row):
for i in range(col):
#print(Y[res])
#print(values[i])
if Y[res] == values[i]:
X = [0]*col;
X[i] = 1;
R[res] = np.array(X)
return R;
def print_predictions(X, pred):
print()
for i in range(X.shape[0]):
print(X[i], label_to_emoji(int(pred[i])))
def plot_confusion_matrix(y_actu, y_pred, title='Confusion matrix', cmap=plt.cm.gray_r):
df_confusion = pd.crosstab(y_actu, y_pred.reshape(y_pred.shape[0],), rownames=['Actual'], colnames=['Predicted'], margins=True)
df_conf_norm = df_confusion / df_confusion.sum(axis=1)
plt.matshow(df_confusion, cmap=cmap) # imshow
#plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(df_confusion.columns))
plt.xticks(tick_marks, df_confusion.columns, rotation=45)
plt.yticks(tick_marks, df_confusion.index)
#plt.tight_layout()
plt.ylabel(df_confusion.index.name)
plt.xlabel(df_confusion.columns.name)
def predict(X, Y, W, b, word_to_vec_map):
"""
Given X (sentences) and Y (emoji indices), predict emojis and compute the accuracy of your model over the given set.
Arguments:
X -- input data containing sentences, numpy array of shape (m, None)
Y -- labels, containing index of the label emoji, numpy array of shape (m, 1)
Returns:
pred -- numpy array of shape (m, 1) with your predictions
"""
m = X.shape[0]
pred = np.zeros((m, 1))
for j in range(m): # Loop over training examples
# Split jth test example (sentence) into list of lower case words
words = X[j].lower().split()
# Average words' vectors
avg = np.zeros((50,))
for w in words:
avg += word_to_vec_map[w]
avg = avg/len(words)
# Forward propagation
Z = np.dot(W, avg) + b
A = softmax(Z)
pred[j] = np.argmax(A)
print("Accuracy: " + str(np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))))
return pred
import tensorflow as tf
sess = tf.Session()
from keras.layers import Input, Dense, Activation
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)
# GRADED FUNCTION: sentences_to_indices
def sentences_to_indices(X, word_to_index, max_len):
"""
Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
The output shape should be such that it can be given to `Embedding()` (described in Figure 4).
Arguments:
X -- array of sentences (strings), of shape (m, 1)
word_to_index -- a dictionary containing the each word mapped to its index
max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this.
Returns:
X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
"""
m = X.shape[0] # number of training examples
### START CODE HERE ###
# Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
X_indices = np.zeros((m, max_len))
for i in range(m): # loop over training examples
# Convert the ith training sentence in lower case and split is into words. You should get a list of words.
sentence_words = X[i].lower().split()
# Initialize j to 0
j = 0
# Loop over the words of sentence_words
for w in sentence_words:
# Set the (i,j)th entry of X_indices to the index of the correct word.
if w in word_to_index:
if (j >= maxLen):
continue
X_indices[i, j] = word_to_index[w]
# Increment j to j + 1
j = j + 1
### END CODE HERE ###
return X_indices
from gensim.models import Word2Vec
model = Word2Vec.load('/home/ale/Scrivania/emoji_ita/embedding/glove_WIKI') # glove model
def getMyModels (model):
word_to_index = dict({})
index_to_word = dict({})
word_to_vec_map = dict({})
for idx, key in enumerate(model.wv.vocab):
word_to_index[key] = idx
index_to_word[idx] = key
word_to_vec_map[key] = model.wv[key]
return word_to_index, index_to_word, word_to_vec_map
code_train, category_train, category_code_train, text_train = read_csv_for_email('/home/ale/Scrivania/TestClassificazione/dataText2Categories.csv')
#FIXME
X_test, Y_test, Z_test, text_test = read_csv_for_email('C:\Users\Alessio\Desktop\3Febbraio\dataText2Categories.csv')
word_to_index, index_to_word, word_to_vec_map = getMyModels(model)
# EMBEDDING LAYER
# GRADED FUNCTION: pretrained_embedding_layer
# emb_dim define dimensionality of your GloVe word vectors (= 50)
def pretrained_embedding_layer(word_to_vec_map, word_to_index, emb_dim):
"""
Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
Arguments:
word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)
Returns:
embedding_layer -- pretrained layer Keras instance
"""
vocab_len = len(word_to_index) + 1 # adding 1 to fit Keras embedding (requirement)
### START CODE HERE ###
# Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
emb_matrix = np.zeros((vocab_len, emb_dim))
# Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
for word, index in word_to_index.items():
emb_matrix[index, :] = word_to_vec_map[word]
# Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False.
embedding_layer = Embedding(vocab_len, emb_dim)
### END CODE HERE ###
# Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
embedding_layer.set_weights([emb_matrix])
return embedding_layer
# MODELLO EMOJIFY2
def Classifier(input_shape, word_to_vec_map, word_to_index, emb_dim, num_activation):
"""
Function creating the Emojify-v2 model's graph.
Arguments:
input_shape -- shape of the input, usually (max_len,)
word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)
Returns:
model -- a model instance in Keras
"""
### START CODE HERE ###
# Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
sentence_indices = Input(shape=input_shape, dtype=np.int32)
# Create the embedding layer pretrained with GloVe Vectors (≈1 line)
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index, emb_dim)
# Propagate sentence_indices through your embedding layer, you get back the embeddings
embeddings = embedding_layer(sentence_indices)
# Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
# Be careful, the returned output should be a batch of sequences.
X = LSTM(128, return_sequences=True)(embeddings)
# Add dropout with a probability of 0.5
X = Dropout(0.5)(X)
# Propagate X trough another LSTM layer with 128-dimensional hidden state
# Be careful, the returned output should be a single hidden state, not a batch of sequences.
X = LSTM(128)(X)
# Add dropout with a probability of 0.5
X = Dropout(0.5)(X)
# Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
X = Dense(num_activation, activation='softmax')(X)
# Add a softmax activation
X = Activation('softmax')(X)
# Create Model instance which converts sentence_indices into X.
model = Model(sentence_indices, X)
### END CODE HERE ###
return model
def max_text_lenght(filename):
max_len = 0;
max_id = 0;
with open(filename, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
for row in reader:
text = row[3];
current_lenght = len(text);
if (current_lenght > max_len):
max_len = current_lenght;
max_id = row[0]
#print(max_id)
return max_len
def create_values_list():
z, y = get_categories('C:\Users\Alessio\Desktop\3Febbraio\dataText2Categories.csv')
return y;
values = create_values_list()
num_activation = len(values)
maxLen = 300
print(maxLen)
model = Classifier((maxLen,), word_to_vec_map, word_to_index, maxLen, num_activation)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
Y_train_indices = sentences_to_indices(text_train, word_to_index, maxLen)
Z_train_oh = my_convert_to_one_hot(category_code_train, values)
print(Z_train_oh)
model.fit(Y_train_indices, Z_train_oh, epochs = 60, batch_size = 32, shuffle=True)
model.save('text_classification.h5')

Related

Implementation difference between TensorFlow LSTMBlockFusedCell and PyTorch LSTM

I am attempting to translate a tensorflow LSTMBlockFusedCell model to pytorch LSTM, but I'm not getting the same outputs with identical input and weights in both models. I believe this is due to how the weights are being set for the torch model; in the code snippet beneath the TensorFlow weight has the shape (400, 164) whilst the PyTorch weights has the shape (400,64) and (400,100) for torch_lstm.weight_ih_l0 and torch_lstm.weight_hh_l0 respectively. I addressed this inconsistency by using the first 64 elements as weight_ih_l0 and the proceeding 100 elements as weight_hh_l0. According to this article, TensorFlow uses right-multiplication instead of PyTorch left-multiplication which is why I need to transpose the weight. Also I am setting the bias to 0 (rendering it useless) for debugging.
import tensorflow as tf
import numpy as np
import torch
time_len, batch_size, input_size, num_units = 50, 1, 64, 100 # L, N, Hin, Hout with torch semantics
# setup tensorflow LSTM
tf_lstm = tf.contrib.rnn.LSTMBlockFusedCell(num_units=num_units)
inp = tf.placeholder(tf.float32, shape=(time_len, batch_size, input_size))
out, c = tf_lstm(inp, dtype=tf.float32)
tf_weight = tf_lstm.weights[0]
tf_bias = tf_lstm.weights[1]
# initialize weights
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# tf forward pass
a = np.random.randn(time_len, batch_size, input_size).astype(np.float32) # input
b = np.zeros(tf_bias.shape) # set lstm bias to zero
tf_out, lstm_weight, lstm_bias = sess.run([out, tf_weight, tf_bias], {inp: a, tf_bias: b})
assert (lstm_bias == 0).all() # make sure lstm bias was 0
# setup pytorch LSTM
torch_lstm = torch.nn.LSTM(input_size=input_size, hidden_size=num_units, num_layers=1, bias=False)
# set torch weights same as tensorflow weights (is this correct?)
w1 = lstm_weight[:input_size, :] # first 64 elements
w2 = lstm_weight[input_size:, :] # proceeding 100 elements
torch_lstm.weight_ih_l0.data = torch.tensor(w1.T) # transpose and set first weight
torch_lstm.weight_hh_l0.data = torch.tensor(w2.T) # transpose and set second weight
# torch forward pass
torch_out, (hn, cn) = torch_lstm(torch.tensor(a))
torch_out = torch_out.detach().numpy() # convert to numpy for compatibility
# compare
assert torch_out.shape == tf_out.shape
print("np.allclose(torch_out, tf_out) = ", np.allclose(torch_out, tf_out))
print("normalized difference: ", np.linalg.norm(torch_out - tf_out))
output:
np.allclose(torch_out, tf_out) = False
normalized difference: 10.741002
Expected output:
np.allclose(torch_out, tf_out) = True
normalized difference: ~0.0
I am running on cpu with the following dependencies:
numpy==1.21.5
tensorflow-gpu==1.14.0
torch==1.11.0
I am running tensorflow v1, cpu version should work, the python wheel is available here for python<=3.7.
Any help is appreciated.
I believe I solved this by changing the order of weight associated with each gate and setting forget_bias=0.0 in LSTMBlockFusedCell:
import tensorflow as tf
import numpy as np
import torch
import itertools as it
time_len, batch_size, input_size, num_units = 50, 1, 64, 100 # L, N, Hin, Hout with torch semantics
# setup tensorflow LSTM
tf_lstm = tf.contrib.rnn.LSTMBlockFusedCell(num_units=num_units, forget_bias=0.0, dtype=tf.float32)
inp = tf.placeholder(tf.float32, shape=(time_len, batch_size, input_size))
out, c = tf_lstm(inp, dtype=tf.float32)
tf_weight = tf_lstm.weights[0]
tf_bias = tf_lstm.weights[1]
# initialize weights
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# tf forward pass
a = np.random.randn(*inp.shape).astype(np.float32) # input
b = np.zeros(tf_bias.shape) # set lstm bias to zero
tf_out, lstm_weight, lstm_bias = sess.run([out, tf_weight, tf_bias], {inp: a, tf_bias: b})
assert (lstm_bias == 0).all() # make sure lstm bias was 0
# setup pytorch LSTM
torch_lstm = torch.nn.LSTM(input_size=input_size, hidden_size=num_units, num_layers=1, bias=False)
# weights associated with each gate
i = lstm_weight[:, 0:100].copy(), 'i'
f = lstm_weight[:, 100:200].copy(), 'f'
o = lstm_weight[:, 200:300].copy(), 'o'
g = lstm_weight[:, 300:400].copy(), 'g'
for i,f,o,g in it.permutations([i,f,o,g], 4):
print(*[x[1] for x in (i,f,o,g)])
i,f,o,g = (x[0] for x in (i,f,o,g))
lstm_weight = np.concatenate([i,f,o,g], axis=1)
# set torch weights same as tensorflow weights
w1 = lstm_weight[:input_size, :] # first 64 elements
w2 = lstm_weight[input_size:, :] # proceeding 100 elements
torch_lstm.weight_ih_l0.data = torch.tensor(w1.T) # transpose and set first weight
torch_lstm.weight_hh_l0.data = torch.tensor(w2.T) # transpose and set second weight
# torch forward pass
torch_out, (hn, cn) = torch_lstm(torch.tensor(a))
torch_out = torch_out.detach().numpy() # convert to numpy for compatibility
# compare
assert torch_out.shape == tf_out.shape
print("np.allclose(torch_out, tf_out) = ", np.allclose(torch_out, tf_out))
print("normalized difference: ", np.linalg.norm(torch_out - tf_out))
This will print the difference for all permutations of gate weights, the combination i o f g gave a difference of 1.7814435e-06 which is close enough.

Visualizing self attention weights for sequence addition problem with LSTM?

I am using Self Attention layer from here for a simple problem of adding all the numbers in a sequence that come before a delimiter. With training, I expect the neural network to learn which numbers to add and using Self Attention layer, I expect to visualize where the model is focusing. The code to reproduce the the results is following
import os
import sys
import matplotlib.pyplot as plt
import numpy
import numpy as np
from keract import get_activations
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense, Dropout, LSTM
from attention import Attention # https://github.com/philipperemy/keras-attention-mechanism
def add_numbers_before_delimiter(n: int, seq_length: int, delimiter: float = 0.0,
index_1: int = None) -> (np.array, np.array):
"""
Task: Add all the numbers that come before the delimiter.
x = [1, 2, 3, 0, 4, 5, 6, 7, 8, 9]. Result is y = 6.
#param n: number of samples in (x, y).
#param seq_length: length of the sequence of x.
#param delimiter: value of the delimiter. Default is 0.0
#param index_1: index of the number that comes after the first 0.
#return: returns two numpy.array x and y of shape (n, seq_length, 1) and (n, 1).
"""
x = np.random.uniform(0, 1, (n, seq_length))
y = np.zeros(shape=(n, 1))
for i in range(len(x)):
if index_1 is None:
a = np.random.choice(range(1, len(x[i])), size=1, replace=False)
else:
a = index_1
y[i] = np.sum(x[i, 0:a])
x[i, a] = delimiter
x = np.expand_dims(x, axis=-1)
return x, y
def main():
numpy.random.seed(7)
# data. definition of the problem.
seq_length = 20
x_train, y_train = add_numbers_before_delimiter(20_000, seq_length)
x_val, y_val = add_numbers_before_delimiter(4_000, seq_length)
# just arbitrary values. it's for visual purposes. easy to see than random values.
test_index_1 = 4
x_test, _ = add_numbers_before_delimiter(10, seq_length, 0, test_index_1)
# x_test_mask is just a mask that, if applied to x_test, would still contain the information to solve the problem.
# we expect the attention map to look like this mask.
x_test_mask = np.zeros_like(x_test[..., 0])
x_test_mask[:, test_index_1:test_index_1 + 1] = 1
model = Sequential([
LSTM(100, input_shape=(seq_length, 1), return_sequences=True),
SelfAttention(name='attention_weight'),
Dropout(0.2),
Dense(1, activation='linear')
])
model.compile(loss='mse', optimizer='adam')
print(model.summary())
output_dir = 'task_add_two_numbers'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
max_epoch = int(sys.argv[1]) if len(sys.argv) > 1 else 200
class VisualiseAttentionMap(Callback):
def on_epoch_end(self, epoch, logs=None):
attention_map = get_activations(model, x_test, layer_names='attention_weight')['attention_weight']
# top is attention map.
# bottom is ground truth.
plt.imshow(np.concatenate([attention_map, x_test_mask]), cmap='hot')
iteration_no = str(epoch).zfill(3)
plt.axis('off')
plt.title(f'Iteration {iteration_no} / {max_epoch}')
plt.savefig(f'{output_dir}/epoch_{iteration_no}.png')
plt.close()
plt.clf()
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=max_epoch,
batch_size=64, callbacks=[VisualiseAttentionMap()])
if __name__ == '__main__':
main()
However, I get following results attention weights
[Please click the link]1 to view weights during training.
I expect the attention to focus on all values before the delimiter. The white below represents ground truth while the upper half part represents weights for 10 samples.

keras split Input for multiple "mini-units"

Trying to implement this article.
Edit1: found one error, my output size is 10 and not 1. (one number per each sentence, there are 10 sentences per document)
Edit2: I got another error that involves the batch size. when I make it 10 the model trains (!!!!). but I think it's not the right way... the error I had given batch size 3 is
Edit 3 Solved!! stuff with sizes + the fact the BIDIRECTIONAL returns different stuff from LSTM so I need to concat myself. Will put right code in answer.
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Incompatible shapes: [10] vs. [3]
[[{{node training_5/Adam/gradients/loss_8/dense_61_loss/mul_grad/BroadcastGradientArgs}}]]
[[metrics_8/acc/Mean_1/_5481]]
(1) Invalid argument: Incompatible shapes: [10] vs. [3]
[[{{node training_5/Adam/gradients/loss_8/dense_61_loss/mul_grad/BroadcastGradientArgs}}]]
0 successful operations.
0 derived errors ignored.
The target is extractive document summarization.
Link to colab with code
What they do is (you can see in the picture at page 3)
100 units of BI_LSTM + Attention for each sentence of the document.
Concat those and insert them to 1 BI_LSTM + Attention to get document embeddings.
Use document embeddings + hidden states from the LSTM to get some features.
Classify according to features
After a lot of struggle with keras low level api, I got a simple version to work.
What I did was to get the already sentence embedding and just do the last LSTM.
Or get words embedding of a sentence and make the small unit of sentence LSTM to work.
Now I am trying to put every thing together but can not get the sizes to fit.
My input size is
number_of_document * sentences_in_document * words_in_sentence * word_embedding
In the code I set those to be 20 * 10 * 50 * 100
(10 sentence in document is for everything to run faster for now..).
My output is
10 * 1 meaning for each sentence I get 1/0 if it's part of the document summary.
(I have not yet did the features extraction part, I just use another dense layer to give me probabilities instead..)
I think the problem is with this part of the code
X_doc = Lambda(lambda x: x[:,t, :, :])(X)
The code with sample data
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np
import keras
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tensorflow as tf
from keras import backend as K
num_of_training_examples = 20
words_in_sentence = 50 # max words per sentence
sentences_in_doc = 10
model_output_size = 10
word_embeddings_size = 100
lstm_hidden_size = 200
densor1_output_size = 400
densor2_output_size = 400
x_train = np.random.rand(num_of_training_examples, sentences_in_doc, words_in_sentence, word_embeddings_size)
y_train= np.random.randint(2, size=(num_of_training_examples, sentences_in_doc))
print(x_train.shape)
print(y_train.shape)
# Initialize arrays
inputs = []
bi_lstms = []
densors_1 =[]
densors_2 = []
for i in range(sentences_in_doc):
bi_lstms.append(Bidirectional(LSTM(units = lstm_hidden_size, input_shape=(words_in_sentence, word_embeddings_size),
return_sequences=False, name='bidirectional_' + str(i)), merge_mode='concat'))
densors_1.append(Dense(densor1_output_size, activation = "tanh"))
densors_2.append(Dense(densor2_output_size, activation = "softmax"))
def invoke_sentence(sentence_matrix, index):
if index==0:
print(type(sentence_matrix))
print(tf.shape(sentence_matrix))
Ys = bi_lstms[index](sentence_matrix)
attention_middle = densors_1[index](Ys)
output = densors_2[index](attention_middle)
if index==0:
print(f'Ys shape is {Ys.shape}')
print(f'attention_middle shape is {attention_middle.shape}')
print(f'output shape is {output.shape}')
return output
def model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size):
"""
Arguments:
words_in_sentence -- Tx -- length of the input sequence - max words per sentence
sentences_in_doc --Ty -- length of the output sequence - number of sentences per document
lstm_hidden_size -- hidden state size of the Bi-LSTM
word_embeddings_size -- size of the word embeddings
model_output_size -- size of each sentence label (1 or 0)
Returns:
model -- Keras model instance
"""
sentences_embeddings = []
X = Input(shape=(sentences_in_doc , words_in_sentence, word_embeddings_size), name= 'X')
for t in range(Ty):
X_doc = Lambda(lambda x: x[:,t, :, :])(X)
print(type(X_doc))
print(X_doc)
print(X_doc.shape)
sentences_embeddings.append(invoke_sentence(X_doc, t))
sentences_embeddings_stacked = Lambda(lambda x: tf.stack(x, axis=0))(sentences_embeddings)
Ys = Bidirectional(LSTM(units = lstm_hidden_size, input_shape=(sentences_in_doc , lstm_hidden_size*2),
return_sequences=False, name='bidirectional_document'),
merge_mode='concat')(sentences_embeddings_stacked)
attention_middle = Dense(densor1_output_size, activation = "tanh")(Ys)
document_embedding = Dense(densor2_output_size, activation = "softmax")(attention_middle)
outputs = Dense(model_output_size, activation = "softmax")(document_embedding)
# compute_features(document_embeddings, sentences_embeddings, ys)
model = Model(inputs=X, outputs=outputs)
return model
model = model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x = x_train, y = y_train, batch_size=2, epochs=1)
# Sizes
num_of_training_examples = 20
words_in_sentence = 50 # max words per sentence
sentences_in_doc = 10
model_output_size = 10
word_embeddings_size = 100
lstm_hidden_size = 200
densor1_output_size = 400
densor2_output_size = 400
# Data
x_train = np.random.rand(num_of_training_examples, sentences_in_doc, words_in_sentence, word_embeddings_size)
y_train= np.random.randint(2, size=(num_of_training_examples, sentences_in_doc))
print(x_train.shape)
print(y_train.shape)
# Initialize arrays
inputs = []
bi_lstms = []
densors_1 =[]
densors_2 = []
for i in range(sentences_in_doc):
bi_lstms.append(Bidirectional(LSTM(units = lstm_hidden_size, input_shape=(words_in_sentence, word_embeddings_size),
return_sequences=True, return_state=True, name='bidirectional_' + str(i))))
densors_1.append(Dense(densor1_output_size, activation = "tanh",name='senteence_dense_tanh' + str(i)))
densors_2.append(Dense(densor2_output_size, activation = "softmax",name='senteence_dense_softmax' + str(i)))
def invoke_sentence(sentence_matrix, index):
if index==0:
print(type(sentence_matrix))
print(tf.shape(sentence_matrix))
lstm, forward_h, forward_c, backward_h, backward_c = bi_lstms[index](sentence_matrix)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
attention_middle = densors_1[index](state_h)
output = densors_2[index](attention_middle)
if index==0:
print(f'lstm shape is {lstm.shape}')
print(f'state_h shape is {state_h.shape}')
print(f'state_c shape is {state_c.shape}')
print(f'attention_middle shape is {attention_middle.shape}')
print(f'output shape is {output.shape}')
return output
def model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size):
"""
Arguments:
words_in_sentence -- Tx -- length of the input sequence - max words per sentence
sentences_in_doc --Ty -- length of the output sequence - number of sentences per document
lstm_hidden_size -- hidden state size of the Bi-LSTM
word_embeddings_size -- size of the word embeddings
model_output_size -- size of each sentence label (1 or 0)
Returns:
model -- Keras model instance
"""
sentences_embeddings = []
X = Input(shape=(sentences_in_doc, words_in_sentence, word_embeddings_size), name= 'X')
for t in range(sentences_in_doc):
X_doc = Lambda(lambda x: x[:, t,:, :])(X)
if(t==0):
print("X_doc")
print(type(X_doc))
print(X_doc)
print(X_doc.shape)
sentence_embedding = invoke_sentence(X_doc, t)
sentences_embeddings.append(sentence_embedding)
if(t==0):
print("sentence_embedding")
print(type(sentence_embedding))
print(sentence_embedding)
print(sentence_embedding.shape)
sentences_embeddings_stacked = Lambda(lambda x: tf.stack(x, axis=1))(sentences_embeddings)
print("sentences_embeddings_stacked")
print(type(sentences_embeddings_stacked))
print(sentences_embeddings_stacked)
print(sentences_embeddings_stacked.shape)
doc_lstm, doc_forward_h, doc_forward_c, doc_backward_h, doc_backward_c = Bidirectional(LSTM(units = lstm_hidden_size, input_shape=(sentences_in_doc, lstm_hidden_size*2),
return_sequences=True, return_state=True, name='bidirectional_document'),
merge_mode='concat')(sentences_embeddings_stacked)
doc_state_h = Concatenate()([doc_forward_h, doc_backward_h])
doc_state_c = Concatenate()([doc_forward_c, doc_backward_c])
print(f'doc_lstm shape is {doc_lstm.shape}')
print(f'doc_state_h shape is {doc_state_h.shape}')
print(f'doc_state_c shape is {doc_state_c.shape}')
attention_middle = Dense(densor1_output_size, activation = "tanh")(doc_state_h)
document_embedding = Dense(densor2_output_size, activation = "softmax")(attention_middle)
print(f'document_embedding shape is {document_embedding.shape}')
# my_layer = MyLayer(input_shape=((400), (10,400), (10,400)), output_dim=2)
# custom_output = my_layer([document_embedding, sentences_embeddings_stacked, doc_state_h])
# print(f'custom_output shape is {custom_output.shape}')
outputs = Dense(model_output_size, activation = "softmax")(document_embedding)
model = Model(inputs=X, outputs=outputs)
return model
model = model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x = x_train, y = y_train, batch_size=5, epochs=1)

Sequence Labelling at paragraph/sentence embedding level using Bi-LSTM + CRF with Keras

I am working on a Sequence tagging task where the element to be tagged is sentences (or paragraph). Most of implementations I found present solution at the token level (NER, POS-Tagging, etc.), whereas here, I first need to build a paragraph embedding before doing sequence labelling.
There is actually two steps:
the first model is building a dense representation (with attention) from the word embeddings
First model takes the input (#batch size, #words in a paragraph)
the second model takes has input the following shape:
- (#batch size, #paragraphs, #words)
is taking the output of the first model and apply the Bi-LSTM + CRF architecture on top of it to tag every paragraph of a given document.
import keras
import keras.backend as K
from keras.layers import *
from keras.activations import *
from keras.regularizers import *
from keras.initializers import *
from keras.models import Model
from keras.layers import Embedding, Input, Bidirectional, GRU, LSTM, TimeDistributed, Layer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras_contrib.layers import CRF
from keras_contrib.metrics import crf_accuracy
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
MAX_PARAGRAPHS = 200 # number of max paragraphs (ie. sequences)
MAX_PARAGRAPH_LENGTH = 60 # number of max token in a given sequences
EMBEDDING_DIM = 100 # word embedding dimension
PARAGRAPH_DIM = 100 # paragraph/sentence embedding dimension
MAX_NB_WORDS = 5000 # max nb words in the vocabulary
n_tags = 5 #number of classes
class AttLayer(Layer):
def __init__(self, attention_dim):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(AttLayer, self).__init__()
def build(self, input_shape):
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim, )))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(AttLayer, self).build(input_shape)
def compute_mask(self, inputs, mask=None):
return mask
def call(self, x, mask=None):
# size of x :[batch_size, sel_len, attention_dim]
# size of u :[batch_size, attention_dim]
# uit = tanh(xW+b)
uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = x * ait
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[-1])
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(flatten_texts)
vocab_size = len(tokenizer.word_index) + 1
embedding_layer = Embedding(vocab_size,
EMBEDDING_DIM,
#weights=[embedding_matrix],
input_length=MAX_PARAGRAPH_LENGTH,
trainable=True,
mask_zero=True)
paragraph_input = Input(shape=(MAX_PARAGRAPH_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(paragraph_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_att = AttLayer(60)(l_lstm)
paragraph_encoder = Model(paragraph_input, l_att)
sequence_input = Input(shape=(MAX_PARAGRAPHS, MAX_PARAGRAPH_LENGTH), dtype='int32')
parag_encoder = TimeDistributed(paragraph_encoder)(sequence_input)
bi_lstm = Bidirectional(LSTM(units=128, return_sequences=True,recurrent_dropout=0.2))(parag_encoder) # variational biLSTM
final_dense = TimeDistributed(Dense(128, activation="relu"))(bi_lstm)
crf = CRF(n_tags, sparse_target=True) # CRF layer
out = crf(final_dense) # output
model = Model(sequence_input, out)
model.compile(optimizer="rmsprop", loss=crf_loss, metrics=[crf_accuracy])
compiling the model leads to following error:
AssertionError: Input mask to CRF must have dim 2 if not None
I am using CRF layer from keras_contrib package
Could you please tell me what I'm doing wrong ?
This seems to make it run:
final_dense = TimeDistributed(Dense(128, activation="relu"))(bi_lstm)
final_dense = Masking()(final_dense) # insert this
crf = CRF(n_tags, sparse_target=True) # CRF layer
out = crf(final_dense)
But not entirely sure why.

A neural network outputting vectors with all components NaN

My neural network code ends up outputting vectors with NaNs most of the time. The code is given below:
from __future__ import division, print_function
from six.moves import xrange
import time
import os
from glob import glob
from zipfile import ZipFile, ZIP_DEFLATED
import numpy as np
import tensorflow as tf
## Defining variables which have to be provided by user
## Defining the number of units in the RNN. This is also the size of the word
## and document embeddings
num_units = 100
##The number of data elements in a batch
batch_size = 1
##The folder where the npz files with the numpy arrays are stored.
npz_files_folder = "npz_files"
## Name of the file to which we want the model to be saved
model_file = "rnn_trial"
## Number of labels sampled from the noise for NCE
num_sampled = 50
## The dropout probability for the NN
dropout = 0.2
## The learning rate for the optimizer
lr = 0.1
## The number of epochs
epochs = 10
## Reading in the list of npz files with vectors for each document
doc_files = sorted(glob(os.path.join(npz_files_folder, "*.npz")))
num_classes = num_docs = len(doc_files)
## The tensor for storing a batch of sentences where each sentence is a
## sequence of word embeddings. This is an input to the NN
sentences = tf.placeholder(tf.float32, [batch_size, None, num_units],
name='sentences')
## The tensor for storing a batch of documents where each document is a
## sequence of sentence embeddings. This is an input to the NN
documents = tf.placeholder(tf.float32, [batch_size, None, num_units])
## The tensor for storing the labels for each batch of documents
labels = tf.placeholder(tf.float32, [batch_size])
## Here we define the LSTM used in the first layer
sent_lstm = tf.contrib.rnn.BasicLSTMCell(num_units)
sent_lstm = tf.contrib.rnn.DropoutWrapper(sent_lstm,
output_keep_prob=1.0-dropout)
## We define the initial_state of the LSTM in first layer here
initial_state_sent_lstm = sent_lstm.zero_state(batch_size, tf.float32)
## Here we get the outputs and states from the first layer
outputs_lstm, states_lstm = tf.nn.dynamic_rnn(sent_lstm,
inputs=sentences, initial_state=initial_state_sent_lstm)
## Here we define the forward GRU used in the second layer
doc_gru_fw = tf.contrib.rnn.GRUCell(num_units//2)
initial_state_doc_gru_fw = doc_gru_fw.zero_state(batch_size, tf.float32)
## Here we define the reverse GRU used in second layer.
doc_gru_bw = tf.contrib.rnn.GRUCell(num_units-num_units//2)
initial_state_doc_gru_bw = doc_gru_bw.zero_state(batch_size, tf.float32)
## Here we get the outputs and states from the second layer
outputs_gru, states_gru = tf.nn.bidirectional_dynamic_rnn(cell_fw=doc_gru_fw,
cell_bw=doc_gru_bw, initial_state_fw=initial_state_doc_gru_fw,
initial_state_bw=initial_state_doc_gru_bw,
inputs=documents)
# outputs_gru, states_gru = tf.nn.bidirectional_dynamic_rnn(cell_fw=doc_gru_fw,
# cell_bw=doc_gru_bw,
# inputs=documents, dtype=tf.float32)
## The final document embeddings
final_output = tf.reduce_mean(tf.concat(outputs_gru, 2), axis=1)
sigmoid_W = tf.Variable(
tf.truncated_normal([num_units, 1],
stddev=1.0/np.sqrt(num_units)))
sigmoid_b = tf.Variable(tf.zeros([1], dtype=tf.float32))
logits = tf.matmul(final_output, sigmoid_W) + sigmoid_b
y_ = (num_docs - 1) * tf.sigmoid(tf.reshape(logits, [-1]))
loss = tf.reduce_sum(tf.square(y_ - labels))
## Defining the training step
train = tf.train.AdamOptimizer(lr).minimize(loss)
## Initializing the session
sess = tf.Session()
## Initializing the variables
sess.run(tf.global_variables_initializer())
t = time.time()
for n in xrange(epochs):
result = False
for j, doc in enumerate(doc_files):
# if j==100:
# break
try:
npz_file = np.load(doc, allow_pickle=False)
except ValueError:
continue
train_label = np.array([j])
sent_files = sorted(npz_file.files)
temp_doc = np.array([])
temp_doc = np.reshape(temp_doc, (0, num_units))
for i, sent_file in enumerate(sent_files):
sent_input = np.reshape(npz_file[sent_file], (1, -1, num_units))
if 0 in sent_input.shape:
continue
output_1 = sess.run(outputs_lstm,
feed_dict={sentences: sent_input})
sent_embed = output_1[:, -1:]
temp_doc = np.concatenate([temp_doc] + list(sent_embed), 0)
## Training the model
temp_doc = np.array([temp_doc])
_, doc_vector = sess.run([train, final_output], feed_dict={
documents: temp_doc, labels: train_label})
if np.isnan(np.sum(doc_vector)):
result = True
print(result)
print("Finished with epoch ", n)
print()
doc_vecs_file_name = model_file + "_doc_vecs.zip"
with ZipFile(doc_vecs_file_name, 'w', ZIP_DEFLATED, True) as myzip:
for doc in doc_files:
# if doc_files.index(doc)==100:
# break
try:
npz_file = np.load(doc, allow_pickle=False)
except ValueError:
continue
sent_files = sorted(npz_file.files)
temp_doc = np.array([])
temp_doc = np.reshape(temp_doc, (0, num_units))
for i, sent_file in enumerate(sent_files):
sent_input = np.reshape(npz_file[sent_file], (1, -1, num_units))
if 0 in sent_input.shape:
continue
output_1 = sess.run(outputs_lstm,
feed_dict={sentences: sent_input})
sent_embed = output_1[:, -1:]
temp_doc = np.concatenate([temp_doc] + list(sent_embed), 0)
## Training the model
temp_doc = np.array([temp_doc])
doc_vec = sess.run(final_output, feed_dict={documents: temp_doc})
temp_file = doc.split(os.sep)[-1][:-4] + ".csv"
np.savetxt(temp_file, doc_vec, delimiter=',')
myzip.write(temp_file)
os.remove(temp_file)
saver = tf.train.Saver()
saver.save(sess, model_file)
print("Time taken = ", (time.time() - t))
If needed, I can upload a sample data set which you can use to try running the code yourself. With that sample data set, occasionally the training is completed without any NaNs creeping in. But, most of the time, NaNs pop up while training.
I am using tensorflow version 1.1.0 alongwith python 2.7.13 from the anaconda distribution.