How to handle out of vocab words with bag of words - pandas

I am attempting to use BoW before ML on my text-based dataset. But I do not want my training set to influence my test set (i.e., data leakage), so I want to build the BoW representation on the train set before touching the test set. The problem is that the test set then has different features (i.e., words) than the train set, so the matrices are not the same size. I tried keeping only the columns in the test set that also appear in the train set, but 1) my code is not right and 2) I do not think this is the most efficient procedure. I think I also need code to add filler columns? Here is what I have:
from sklearn.feature_extraction.text import CountVectorizer
def bow(tokens, data):
    tokens = tokens.apply(nltk.word_tokenize)
    cvec = CountVectorizer(min_df=.01, max_df=.99, ngram_range=(1, 2), tokenizer=lambda doc: doc, lowercase=False)
    cvec.fit(tokens)
    cvec_counts = cvec.transform(tokens)
    cvec_counts_bow = cvec_counts.toarray()
    vocab = cvec.get_feature_names()
    bow_model = pd.DataFrame(cvec_counts_bow, columns=vocab)
    return bow_model
X_train = bow(train['text'], train)
X_test = bow(test['text'], test)
vocab = list(X_train.columns)
X_test = test.filter.columns([w for w in X_test if w in vocab])

You would normally fit the CountVectorizer only on the train set and then use that same fitted vectorizer to transform the test set, e.g.:
from sklearn.feature_extraction.text import CountVectorizer
def bow(tokens, data, cvec=None):
    tokens = tokens.apply(nltk.word_tokenize)
    if cvec is None:
        cvec = CountVectorizer(min_df=.01, max_df=.99, ngram_range=(1, 2), tokenizer=lambda doc: doc, lowercase=False)
        cvec.fit(tokens)
    cvec_counts = cvec.transform(tokens)
    cvec_counts_bow = cvec_counts.toarray()
    vocab = cvec.get_feature_names()
    bow_model = pd.DataFrame(cvec_counts_bow, columns=vocab)
    return bow_model, cvec

X_train, cvec = bow(train['text'], train)
X_test, cvec = bow(test['text'], test, cvec=cvec)
# No column filtering is needed: because the same fitted vectorizer is reused,
# X_train and X_test already share exactly the same columns.
This will of course ignore words not seen in the train set, but that shouldn't be a problem: train and test should have roughly the same distribution, so unknown words should be rare.
Note: Code is not tested
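As a quick sanity check, here is a minimal standalone sketch (toy texts, not the asker's data; with newer scikit-learn, get_feature_names_out replaces get_feature_names) showing that fitting on the train texts and only transforming the test texts yields matrices with identical columns:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

train_texts = ["the cat sat", "the dog ran"]   # stands in for train['text']
test_texts = ["the zebra sat"]                 # "zebra" is out-of-vocabulary

cvec = CountVectorizer()
X_train = pd.DataFrame(cvec.fit_transform(train_texts).toarray(),
                       columns=cvec.get_feature_names_out())
X_test = pd.DataFrame(cvec.transform(test_texts).toarray(),
                      columns=cvec.get_feature_names_out())

print(list(X_train.columns) == list(X_test.columns))  # True: the shared vectorizer fixes the vocabulary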


I am doing next-word prediction (NLP) with an LSTM, but to_categorical gives the error "IndexError: index 2718 is out of bounds for axis 1 with size 2718"

Below is the full code:
import spacy
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text
moby_text = read_file('moby_dick.txt')
nlp = spacy.load('en_core_web_sm')
doc = nlp(moby_text)
#getting tokens using list comprehension
tokens = [token.text.lower() for token in doc]
#cleaning text
tokens = [token for token in tokens if token not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?#[\\]^_`{|}~\t\n ']
train_len = 10+1 # 10 i/p and 1 o/p
text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i - train_len:i]
    text_sequences.append(seq)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')
sequences = np.array(sequences)
vocabulary_size = len(tokenizer.word_counts)
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
X = sequences[:,:-1]
y = sequences[:,-1]
y = to_categorical(y, num_classes=vocabulary_size)
This is where to_categorical raises the error, and I don't understand why; after reading so many articles I still don't get how to solve it:
IndexError: index 2718 is out of bounds for axis 1 with size 2718
seq_len = X.shape[1]
model = create_model(vocabulary_size, seq_len)
model.fit(X, y, epochs=100,verbose=1)
I don't understand the error. I have searched for it and tried different ways to fix it, but nothing works. My guess is that it happens because indices start at 0, so I tried
y = y - 1
y = to_categorical(y, num_classes=vocabulary_size)
but this doesn't work because it then gives an error in the model, so I am back to square one:
Node: 'sequential/embedding/embedding_lookup'
indices[13,9] = 2718 is not in [0, 2718)
[[{{node sequential/embedding/embedding_lookup}}]] [Op:__inference_train_function_5647]
So how can I solve it?
Can someone please help me out? Thank you!!!
The Tokenizer doesn't use 0, it starts counting with 1:
0 is a reserved index that won't be assigned to any word.
Try this:
vocabulary_size = len(tokenizer.word_counts) + 1
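The word indices produced by the Tokenizer therefore run from 1 to len(word_counts), so both the one-hot width and the Embedding input_dim must be one larger than the largest index. A minimal sketch with toy sentences (not the Moby Dick data) to illustrate:
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([['the', 'cat', 'sat'], ['the', 'dog', 'ran']])
print(tokenizer.word_index)                        # indices start at 1, e.g. {'the': 1, 'cat': 2, ...}

vocabulary_size = len(tokenizer.word_counts) + 1   # +1 because index 0 is reserved
largest_label = max(tokenizer.word_index.values()) # equals len(word_counts)
y = to_categorical([largest_label], num_classes=vocabulary_size)  # works; without the +1 this raises the IndexError
Passing the same vocabulary_size (with the +1) to create_model also resolves the later embedding_lookup error, because the Embedding layer then accepts indices up to len(word_counts).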

embedding layer for several categories and regression

I found [this][1] and created this running POC code:
import tensorflow as tf
from tensorflow import keras
import numpy as np
def get_embedding_size(cat_data):
    no_of_unique_cat = len(np.unique(cat_data))
    return int(min(np.ceil(no_of_unique_cat / 2), 50))
# 3 numerical variables
num_data = np.random.random(size=(10,3))
# 2 categorical variables
cat_data_1 = np.random.randint(0,4,10)
cat_data_2 = np.random.randint(0,5,10)
target = np.random.random(size=(10,1))
no_unique_categories_category_1 = len(np.unique(cat_data_1))
embedding_size_category_1 = get_embedding_size(cat_data_1)
inp_cat_data = keras.layers.Input(shape=(no_unique_categories_category_1,))
# 3 columns
inp_num_data = keras.layers.Input(shape=(num_data.shape[1],))
emb = keras.layers.Embedding(input_dim=no_unique_categories_category_1, output_dim=embedding_size_category_1)(inp_cat_data)
flatten = keras.layers.Flatten()(emb)
# Concatenate two layers
conc = keras.layers.Concatenate()([flatten, inp_num_data])
dense1 = keras.layers.Dense(3, activation=tf.nn.relu,)(conc)
# Creating output layer
out = keras.layers.Dense(1, activation=None)(dense1)
model = keras.Model(inputs=[inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss=keras.losses.mean_squared_error,
              metrics=[keras.metrics.mean_squared_error])
one_hot_encoded_cat_data_1 = np.eye(cat_data_1.max()+1)[cat_data_1]
model.fit([one_hot_encoded_cat_data_1, num_data], target)
I wonder how one could add the additional categorical variable cat_data_2? I am also wondering why one-hot encoding is still used here; isn't the whole point of embeddings to make that unnecessary? Thanks!
model.layers[1].get_weights()[0]
[1]: https://mmuratarat.github.io/2019-06-12/embeddings-with-numeric-variables-Keras
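One possible way to add cat_data_2 is a second Input/Embedding branch concatenated with the rest. The sketch below is my own (not from the linked post) and assumes the categorical variables are fed as integer codes of shape (n, 1), which is what Embedding expects and which makes the explicit one-hot step unnecessary:
import numpy as np
from tensorflow import keras

def get_embedding_size(cat_data):
    no_of_unique_cat = len(np.unique(cat_data))
    return int(min(np.ceil(no_of_unique_cat / 2), 50))

num_data = np.random.random(size=(10, 3))
cat_data_1 = np.random.randint(0, 4, size=(10, 1))   # integer codes, no one-hot needed
cat_data_2 = np.random.randint(0, 5, size=(10, 1))
target = np.random.random(size=(10, 1))

inp_cat_1 = keras.layers.Input(shape=(1,))
inp_cat_2 = keras.layers.Input(shape=(1,))
inp_num = keras.layers.Input(shape=(num_data.shape[1],))

# one Embedding per categorical variable; input_dim must cover the largest code
emb_1 = keras.layers.Embedding(input_dim=int(cat_data_1.max()) + 1,
                               output_dim=get_embedding_size(cat_data_1))(inp_cat_1)
emb_2 = keras.layers.Embedding(input_dim=int(cat_data_2.max()) + 1,
                               output_dim=get_embedding_size(cat_data_2))(inp_cat_2)

conc = keras.layers.Concatenate()([keras.layers.Flatten()(emb_1),
                                   keras.layers.Flatten()(emb_2),
                                   inp_num])
dense1 = keras.layers.Dense(3, activation='relu')(conc)
out = keras.layers.Dense(1, activation=None)(dense1)

model = keras.Model(inputs=[inp_cat_1, inp_cat_2, inp_num], outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit([cat_data_1, cat_data_2, num_data], target, verbose=0)
Each categorical variable gets its own Embedding; the flattened embeddings and the numeric inputs are simply concatenated before the dense layers.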

How to calculate the confidence intervals for prediction in Regression? and also how to plot it in python

Fig 7.1, An Introduction To Statistical Learning
I am currently studying the book An Introduction to Statistical Learning with Applications in R and converting the solutions to Python.
I am not able to work out how to compute the confidence intervals and plot them as shown in the image above (dashed lines).
I have plotted the line. Here's my code for that
(I am using polynomial regression with predictor 'age' and response 'wage'; the degree is 4):
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures

# 'data' is the Wage dataset, loaded elsewhere
poly = PolynomialFeatures(4)
X = poly.fit_transform(data['age'].to_frame())
y = data['wage']
# X.shape
model = sm.OLS(y, X).fit()
print(model.summary())
# What we want here is not only the fitted line, but also the standard error around it.
# To find that we need to calculate the predictions for a grid of age values.
test_ages = np.linspace(data['age'].min(), data['age'].max(), 100)
X_test = poly.transform(test_ages.reshape(-1, 1))
pred = model.predict(X_test)
plt.figure(figsize=(12, 8))
plt.scatter(data['age'], data['wage'], facecolors='none', edgecolors='darkgray')
plt.plot(test_ages, pred)
Here data is the Wage dataset that is available in R.
This is the resulting graph I get.
I have used bootstrapping to calculate the confidence intervals; for this I wrote a small custom module:
import numpy as np
import pandas as pd
from tqdm import tqdm

class Bootstrap_ci:

    def boot(self, X_data, y_data, R, test_data, model):
        predictions = []
        for i in tqdm(range(R)):
            predictions.append(self.alpha(X_data, y_data, self.get_indices(X_data, 200), test_data, model))
        return np.percentile(predictions, 2.5, axis=0), np.percentile(predictions, 97.5, axis=0)

    def alpha(self, X_data, y_data, index, test_data, model):
        X = X_data.loc[index]
        y = y_data.loc[index]
        lr = model
        lr.fit(pd.DataFrame(X), y)
        return lr.predict(pd.DataFrame(test_data))

    def get_indices(self, data, num_samples):
        return np.random.choice(data.index, num_samples, replace=True)
The above module can be used as -
poly = PolynomialFeatures(4)
X = poly.fit_transform(data['age'].to_frame())
y = data['wage']
X_test = np.linspace(min(data['age']),max(data['age']),100)
X_test_poly = poly.transform(X_test.reshape(-1,1))
from bootstrap import Bootstrap_ci
from sklearn.linear_model import LinearRegression

bootstrap = Bootstrap_ci()
li, ui = bootstrap.boot(pd.DataFrame(X), y, 1000, X_test_poly, LinearRegression())
This gives us the lower and upper bounds of the confidence interval.
To plot the graph -
plt.scatter(data['age'],data['wage'],facecolors='none', edgecolors='darkgray')
plt.plot(X_test,pred,label = 'Fitted Line')
plt.plot(X_test,ui,linestyle = 'dashed',color = 'r',label = 'Confidence Intervals')
plt.plot(X_test,li,linestyle = 'dashed',color = 'r')
The resultant graph is
The following code produces a 95% confidence interval (for the RMSE, computed from the squared errors on held-out data):
from scipy import stats

confidence = 0.95
squared_errors = (<<predicted values>> - <<true y_test values>>) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
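Since the fitted line in the question comes from statsmodels OLS, another option (a sketch of the standard statsmodels API, not part of the answers above) is to ask the fitted result directly for the interval around the predicted mean:
# assumes `model` is the fitted sm.OLS result and X_test / test_ages come from the question's code
pred = model.get_prediction(X_test)
frame = pred.summary_frame(alpha=0.05)   # 95% intervals

plt.plot(test_ages, frame['mean'], label='Fitted line')
plt.plot(test_ages, frame['mean_ci_lower'], linestyle='dashed', color='r', label='95% confidence interval')
plt.plot(test_ages, frame['mean_ci_upper'], linestyle='dashed', color='r')
plt.legend()
The summary frame also contains obs_ci_lower and obs_ci_upper, the wider prediction interval for individual observations.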

Passing pandas NumPy arrays as feature vectors in scikit learn?

I have a vector of 5 different values that I use as my sample, and the label is a single integer of 0, 1, or 3. The machine learning algorithms work when I pass an array as a sample, but I get the warning shown below. How do I pass feature vectors without getting this warning?
import numpy as np
from numpy import random
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import pandas as pd
filepath = 'test.csv'
# example label values
index = [0,1,3,1,1,1,0,0]
# example sample arrays
data = []
for i in range(len(index)):
    d = []
    for i in range(6):
        d.append(random.randint(50, 200))
    data.append(d)
feat1 = 'brightness'
feat2, feat3, feat4 = ['h', 's', 'v']
feat5 = 'median hue'
feat6 = 'median value'
features = [feat1, feat2, feat3, feat4, feat5, feat6]
df = pd.DataFrame(data, columns=features, index=index)
df.index.name = 'state'
with open(filepath, 'a') as f:
    df.to_csv(f, header=f.tell() == 0)
states = pd.read_csv(filepath, usecols=['state'])
df_partial = pd.read_csv(filepath, usecols=features)
states = states.astype(np.float32)
states = states.values
labels = states
samples = np.array([])
for i, row in df_partial.iterrows():
    r = row.values
    samples = np.vstack((samples, r)) if samples.size else r
n_neighbors = 5
test_size = .3
labels, test_labels, samples, test_samples = train_test_split(labels, samples, test_size=test_size)
clf1 = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf1 = clf1.fit(samples, labels)
score1 = clf1.score(test_samples, test_labels)
print("Here's how the models performed \nknn: %d %%" %(score1 * 100))
Warning:
"DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). clf1 = clf1.fit(samples, labels)"
sklearn documentation for fit(self, X, Y)
Try replacing
states = states.values with states = states.values.flatten()
or
clf1 = clf1.fit(samples, labels) with clf1 = clf1.fit(samples, labels.flatten()).
states = states.values holds the correct labels from your pandas DataFrame, but they are stored as a column vector, one label per row. Using .flatten() puts all those labels into a single 1D array. (https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.ndarray.flatten.html)
In scikit-learn's KNeighborsClassifier documentation
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), the example shows that the labels must be a flat 1D array: y = [0, 0, 1, 1].
When you retrieve the data from the DataFrame states, it comes back as a column vector (values spread over multiple rows), whereas a single flat array of values is expected.
You can also try ravel(), which creates a contiguous flattened array.
numpy.ravel(array, order='C') : returns a contiguous flattened array (a 1D array with all the input-array elements, of the same type).
Try:
states = states.values.ravel() in place of states = states.values
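A minimal standalone sketch (toy labels) of what the warning is about: a (n, 1) column vector for y triggers it, while ravel() or flatten() gives the (n,) shape scikit-learn expects:
import numpy as np

labels = np.array([[0], [1], [3], [1]])  # column vector, shape (4, 1) -> triggers DataConversionWarning in fit()
print(labels.shape)          # (4, 1)
print(labels.ravel().shape)  # (4,)  -> the 1d y that scikit-learn estimators expect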

One-hot encoding Tensorflow Strings

I have a list of strings as labels for training a neural network. Now I want to convert them via one-hot encoding so that I can use them in my TensorFlow network.
My input list looks like this:
labels = ['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']
The requested outcome should be something like
one_hot [0,1,0,2,0]
What is the easiest way to do this? Any help would be much appreciated.
Cheers,
Andi
The desired outcome looks like sklearn's LabelEncoder, not like OneHotEncoder. In TF you would reach for CategoryEncoding, BUT that is "a preprocessing layer which encodes integer features":
inp = layers.Input(shape=[X.shape[0]])
x0 = layers.CategoryEncoding(num_tokens=3, output_mode="multi_hot")(inp)
model = keras.Model(inputs=[inp], outputs=[x0])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=[tf.keras.metrics.CategoricalCrossentropy()])
print(model.summary())
This part gets the encoding of the unique values. You can add another branch to this model that takes your initial vector as input and fit it against the labels from this reference branch (much like joining a reference table with a fact table in a database); the result is an ensemble of the reference data, your actual data, and the output.
Note that num_tokens=3 and output_mode="multi_hot" are given explicitly, and that the integer codes for the class names are known before the model is used, as a feature-engineering step, like this (in a pd.DataFrame):
import numpy as np
import pandas as pd
d = {'transport_col':['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']}
dataset_df = pd.DataFrame(data=d)
classes = dataset_df['transport_col'].unique().tolist()
print(f"Label classes: {classes}")
df= dataset_df['transport_col'].map(classes.index).copy()
print(df)
From the manual example (REF): "Encode the categorical label into an integer. Details: This stage is necessary if your classification label is represented as a string. Note: Keras expects classification labels to be integers."
In another architecture you could perhaps use StringLookup:
vocab = np.array(np.unique(labels))
inp = tf.keras.Input(shape=labels.shape[0], dtype=tf.string)
x = tf.keras.layers.StringLookup(vocabulary=vocab)(inp)
But labels are usually the dependent variable, not features, and shouldn't be fed in as an Input.
Everything is in the Keras docs.
possible FULL CODE:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

X = np.array([['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']])
vocab = np.unique(X)
print(vocab)
y = np.array([[0, 1, 0, 2, 0]])

inp = layers.Input(shape=[X.shape[0]], dtype='string')
x0 = tf.keras.layers.StringLookup(vocabulary=vocab, name='finish')(inp)
model = keras.Model(inputs=[inp], outputs=[x0])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=[tf.keras.metrics.categorical_crossentropy])
print(model.summary())

from tensorflow.keras import backend as K
for layerIndex, layer in enumerate(model.layers):
    print(layerIndex)
    func = K.function([model.get_layer(index=0).input], layer.output)
    layerOutput = func([X])  # input_data is a numpy array
    print(layerOutput)
    if layerIndex == 1:  # the last layer here (the StringLookup output)
        # StringLookup reserves index 0 for out-of-vocabulary tokens by default,
        # so vocabulary indices start at 1; subtract 1 to get 0-based codes
        scale = lambda x: x - 1
        print(scale(layerOutput))
res:
[[0 1 0 2 0]]
Another possible solution for your case is layers.TextVectorization:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

input_array = np.atleast_2d(np.array(['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']))
vocab = np.unique(input_array)

input_data = keras.Input(shape=(None,), dtype='string')
layer = layers.TextVectorization(max_tokens=None, standardize=None, split=None, output_mode="int", vocabulary=vocab)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)

output_dataset = model.predict(input_array)
print(output_dataset)  # indices start at 2: TextVectorization reserves 0 for padding and 1 for OOV tokens
scale = lambda x: x - 2
print(scale(output_dataset))
result:
array([[0, 1, 0, 2, 0]])
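For completeness, the sklearn route mentioned at the top of this answer (a minimal sketch, not part of the original code): LabelEncoder maps the strings to integer codes, and for these labels the alphabetical class order happens to give exactly the requested output:
from sklearn.preprocessing import LabelEncoder

labels = ['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']
encoder = LabelEncoder()
codes = encoder.fit_transform(labels)
print(codes)             # [0 1 0 2 0]
print(encoder.classes_)  # ['"car"' '"pedestrian"' '"truck"'] (classes are sorted alphabetically)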