Vectorized alternative to iterrows : Semantic Analysis - pandas

Hi I'm currently doing a semantic tweet analysis and want to improve my code running time with Numpy Vectorization.
I tried enhancing my code for a while but was not successful in doing so.
Could I just enter the formula within the loop iteration to a function and apply it via Numpy.vectorize?
ss = SentimentIntensityAnalyzer()

# Score every tweet and write the results back one row at a time.
# `Series.items()` is used instead of `Series.iteritems()`, which was
# deprecated in pandas 1.5 and removed in pandas 2.0.
for index, row in tw_list["full_text"].items():
    score = ss.polarity_scores(row)
    neg = score["neg"]
    neu = score["neu"]
    pos = score["pos"]
    comp = score["compound"]

    # The label only compares the negative and positive scores;
    # a tie between them is reported as "neutral".
    if neg > pos:
        tw_list.loc[index, "sentiment"] = "negative"
    elif pos > neg:
        tw_list.loc[index, "sentiment"] = "positive"
    else:
        tw_list.loc[index, "sentiment"] = "neutral"

    tw_list.loc[index, "neg"] = neg
    tw_list.loc[index, "neu"] = neu
    tw_list.loc[index, "pos"] = pos
    tw_list.loc[index, "compound"] = comp

Instead of iterating over rows in dataframe, you can make use of apply function.
def get_sentiments(text):
    """Return ``(sentiment, neg, neu, pos, compound)`` for one piece of text.

    The scores come from ``ss.polarity_scores(text)``. ``sentiment`` is
    "negative" when the negative score is larger than the positive one,
    "positive" in the opposite case, and "neutral" when they are equal.
    """
    score = ss.polarity_scores(text)
    neg = score["neg"]
    neu = score["neu"]
    pos = score["pos"]
    comp = score["compound"]
    if neg > pos:
        sentiment = "negative"
    elif pos > neg:
        sentiment = "positive"
    else:
        sentiment = "neutral"
    return sentiment, neg, neu, pos, comp


# `result_type` is an option of DataFrame.apply, not Series.apply: on a Series
# the keyword is forwarded to `get_sentiments` and raises a TypeError.
# Applying row-wise on the frame with result_type="expand" turns each returned
# tuple into one column per element. The last column is named "compound" so
# the result matches the columns written by the original loop.
tw_list[["sentiment", "neg", "neu", "pos", "compound"]] = tw_list.apply(
    lambda row: get_sentiments(row["full_text"]),
    axis=1,
    result_type="expand",
)
This should give an improvement in performance.

Related

Calculating gradients of a custom loss function with GradientTape

I am trying custom training of a network using the GradientTape method.
This training is unsupervised.
The details of the network and the cost function are as follows.
My Network is,
def CreateNetwork(inplayer, hidlayer, outlayer, seed):
    """Build a sequential model with one tanh hidden layer and a linear output.

    Hidden layer: ``hidlayer`` units on ``inplayer`` inputs; kernel and bias
    are drawn from a zero-mean normal with stddev ``1/sqrt(inplayer)``.
    Output layer: ``outlayer`` units; kernel drawn from a zero-mean normal
    with stddev ``1/sqrt(hidlayer)``, bias initialised to zeros.
    Every random initialiser receives the same ``seed``.
    """
    model = keras.Sequential()

    hidden_std = 1 / np.sqrt(inplayer)
    hidden_layer = Dense(
        hidlayer,
        input_dim=inplayer,
        kernel_initializer=initializers.RandomNormal(mean=0.0, stddev=hidden_std, seed=seed),
        bias_initializer=initializers.RandomNormal(mean=0.0, stddev=hidden_std, seed=seed),
        activation='tanh',
    )
    model.add(hidden_layer)

    output_std = 1 / np.sqrt(hidlayer)
    output_layer = Dense(
        outlayer,
        kernel_initializer=initializers.RandomNormal(mean=0.0, stddev=output_std, seed=seed),
        bias_initializer=initializers.Zeros(),
        activation='linear',
    )
    model.add(output_layer)

    return model
and my custom cost function is defined as,
def H_tilda(J, U, nsamples, nsites, configs, out_matrix):
    """Return the real part of the sample-averaged ratio used as the cost.

    For every sample ``k`` the (hopping + on-site) contribution of every
    sample ``i`` is weighted by ``phi(out_matrix[i].numpy())`` and summed; the
    sum is divided by ``phi(out_matrix[k].numpy())``. The ratios are averaged
    over the ``nsamples`` samples.

    ``J`` and ``U`` are accepted but not used in the body. ``phi`` is defined
    outside this function. All arithmetic is done with NumPy on the values
    obtained through ``.numpy()``.
    """
    total = 0.0
    for k in range(nsamples):
        config = configs[k, :]
        out_n = out_matrix[k, :]
        weighted_sum = 0.0
        for i in range(nsamples):
            n = configs[i, :]
            out_nprime = out_matrix[i, :]

            # ---- Hopping term -------------------------------------------
            hop = 0.0
            for j in range(nsites):
                # Neighbours of site j; the first and last site wrap around.
                if j == 0:
                    neighbours = (nsites - 1, j + 1)
                elif j == (nsites - 1):
                    neighbours = (j - 1, 0)
                else:
                    neighbours = (j - 1, j + 1)

                amplitude = 0
                for src in neighbours:
                    # An empty neighbour contributes nothing.
                    if n[src] != 0:
                        # Move one unit from the neighbour onto site j and
                        # keep the term only if that reproduces `config`.
                        moved = np.copy(n)
                        moved[src] = moved[src] - 1
                        moved[j] = moved[j] + 1
                        delta = 1 if (config == moved).all() else 0
                        amplitude = amplitude + np.sqrt(n[src]) * delta

                hop = hop + np.sqrt(n[j] + 1) * amplitude

            # ---- On-site term (only when both configurations coincide) ---
            if (config == n).all():
                ons = np.sum(np.dot(np.square(n), n - 1))
            else:
                ons = 0.0

            weighted_sum = weighted_sum + ((hop + ons) * phi(out_nprime.numpy()))

        total = total + weighted_sum / phi(out_n.numpy())

    return np.real(total / nsamples)
I want to do custom training using the GradientTape method, for which I used the following lines:
optimizer = optimizers.SGD(learning_rate=1e-3)

# With watch_accessed_variables=False the tape records nothing on its own, so
# the model weights are watched explicitly. Watching
# tf.convert_to_tensor(configs) only tracked a temporary tensor that the
# forward pass below never receives.
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(model.trainable_weights)
    out_matrix = model(configs)
    print(out_matrix)
    eival = H_tilda(J, U, nsamples, nsites, configs, out_matrix)
    print(eival)

# NOTE(review): H_tilda converts its inputs with .numpy() and returns a NumPy
# value, so `eival` is not connected to the operations recorded on the tape.
# The gradients below remain None until H_tilda is written with TensorFlow
# operations acting directly on `out_matrix`.
gradients = tape.gradient(tf.convert_to_tensor(eival), model.trainable_weights)
print(gradients)
But the gradient I am getting is NONE,
output: [None, None, None, None]

tensorflow multi slice not reshape

I have a 3D shape (a chair) stored as a (64,64,64) array. When I reshape it with a TensorFlow operation to (8,32,32,32), run my deep learning operation, and then reshape the result back to (64,64,64) with tf.reshape, the output looks very bad: there is no recognisable shape at all, only a strange unknown form (it definitely does not look like a chair).
However, if I use a function I wrote to slice the volume into 32x32x32 blocks and stack them as (8,32,32,32), and use that as the input to my DL model, then recombining the (8,32,32,32) output with a combine function I wrote (which reverses the slice function) gives a good-looking shape.
The issue is that both functions, slice and combine, use NumPy rather than TensorFlow. I have to train the model end-to-end, so I need equivalent slice and combine functions in TensorFlow, please.
def slice(self, size, obj):
    """Split a cubic array into ``size``-sided blocks.

    Blocks are returned as a list in block order ``(i, j, k)``: the third
    axis varies fastest, then the second, then the first. The edge length is
    read from ``obj.shape[0]`` and used for all three axes.

    The block bounds are computed directly from the loop position. The
    previous old/new index bookkeeping only reset correctly when the edge
    length was an exact multiple of ``size`` and otherwise produced empty
    slices. When the edge is not a multiple of ``size`` the trailing
    remainder along each axis is left out, so every block has the full size.
    """
    edge = obj.shape[0]
    # Upper bounds of the blocks along one axis: size, 2*size, ...
    stops = range(size, edge + 1, size)
    return [
        obj[i - size:i, j - size:j, k - size:k]
        for i in stops
        for j in stops
        for k in stops
    ]
def combine(self, lst, shape, size):
    """Reassemble ``size``-sided blocks into a ``shape``-sided cubic array.

    ``lst`` is read in block order ``(i, j, k)`` with the third axis varying
    fastest, then the second, then the first. The result is a float array of
    zeros of shape ``(shape, shape, shape)`` with each block written into its
    position.

    The block bounds are computed directly from the loop position. The
    previous old/new index bookkeeping only reset correctly when ``shape``
    was an exact multiple of ``size`` and otherwise targeted empty slices.
    When ``shape`` is not a multiple of ``size`` the trailing remainder along
    each axis stays zero.
    """
    obj = np.zeros((shape, shape, shape))
    # Upper bounds of the blocks along one axis: size, 2*size, ...
    stops = range(size, shape + 1, size)
    counter = 0
    for i in stops:
        for j in stops:
            for k in stops:
                obj[i - size:i, j - size:j, k - size:k] = lst[counter]
                counter += 1
    return obj
In other words, I want a TensorFlow operation that mimics
the following function:
def combine(self, lst, shape, size):
    """Reassemble ``size``-sided blocks into a ``shape``-sided cubic array.

    ``lst`` is read in block order ``(i, j, k)`` with the third axis varying
    fastest, then the second, then the first. The result is a float array of
    zeros of shape ``(shape, shape, shape)`` with each block written into its
    position.

    The block bounds are computed directly from the loop position. The
    previous old/new index bookkeeping only reset correctly when ``shape``
    was an exact multiple of ``size`` and otherwise targeted empty slices.
    When ``shape`` is not a multiple of ``size`` the trailing remainder along
    each axis stays zero.
    """
    obj = np.zeros((shape, shape, shape))
    # Upper bounds of the blocks along one axis: size, 2*size, ...
    stops = range(size, shape + 1, size)
    counter = 0
    for i in stops:
        for j in stops:
            for k in stops:
                obj[i - size:i, j - size:j, k - size:k] = lst[counter]
                counter += 1
    return obj

How to speed up a simple linear algebra optimization problem in Julia?

I implemented the LSDD changepoint detection method described in [1] in Julia, to see if I could make it faster than the existing Python implementation [2], which is based on a grid search that looks for the optimal parameters.
I obtain the desired results but despite my best efforts, my grid search version of it takes about the same time to compute as the python one, which is still way too long for real applications.
I also tried using the Optimize package which only makes things worse (2 or 3 times slower).
Here is the grid search that I implemented :
using Random
using LinearAlgebra
function squared_distance(X::Array{Float64,1},C::Array{Float64,1})
    # Pairwise squared distances between the entries of X and C:
    # entry (i,j) is X[i]^2 + C[j]^2 - 2*X[i]*C[j].
    nx, nc = length(X), length(C)
    sqd = zeros(nx, nc)
    for j in 1:nc, i in 1:nx
        sqd[i,j] = X[i]^2 + C[j]^2 - 2*X[i]*C[j]
    end
    return sqd
end
# lsdd(x, y; folds = 5, sigma_list = nothing, lambda_list = nothing)
#
# Grid search over sigma_list x lambda_list scored by `folds`-fold
# cross-validation, followed by a final fit with the selected pair.
# Returns the value L2 computed from that final fit.
#
# At most 300 randomly drawn samples of vcat(x, y) are used as kernel centres.
# When sigma_list / lambda_list are `nothing`, the default grids below are used.
# Relies on squared_distance, set_H, set_htr, set_hte and set_theta from this file.
function lsdd(x::Array{Float64,1},y::Array{Float64,1}; folds = 5, sigma_list = nothing , lambda_list = nothing)
    lx,ly = length(x), length(y)
    b = min(lx+ly,300)
    C = shuffle(vcat(x,y))[1:b]
    CC_dist2 = squared_distance(C,C)
    xC_dist2, yC_dist2 = squared_distance(x,C), squared_distance(y,C)
    Tx,Ty = lx - div(lx,folds), ly - div(ly,folds)

    #Define the training and testing data sets
    # Fold labels must be exactly 1:folds because the folds are looked up with
    # i in 1:folds below. cld gives labels 1..folds; floor(i*folds/l) gave
    # 0..folds, leaving label 0 untested and label `folds` with one element.
    cv_split1, cv_split2 = cld.(collect(1:lx)*folds, lx), cld.(collect(1:ly)*folds, ly)
    cv_index1, cv_index2 = shuffle(cv_split1), shuffle(cv_split2)
    tr_idx1,tr_idx2 = [findall(f->f!=i,cv_index1) for i in 1:folds], [findall(f->f!=i,cv_index2) for i in 1:folds]
    te_idx1,te_idx2 = [findall(f->f==i,cv_index1) for i in 1:folds], [findall(f->f==i,cv_index2) for i in 1:folds]
    xTr_dist, yTr_dist = [xC_dist2[i,:] for i in tr_idx1], [yC_dist2[i,:] for i in tr_idx2]
    xTe_dist, yTe_dist = [xC_dist2[i,:] for i in te_idx1], [yC_dist2[i,:] for i in te_idx2]

    if sigma_list === nothing
        sigma_list = [0.25, 0.5, 0.75, 1, 1.2, 1.5, 2, 2.5, 2.2, 3, 5]
    end
    if lambda_list === nothing
        lambda_list = [1.00000000e-03, 3.16227766e-03, 1.00000000e-02, 3.16227766e-02,
                       1.00000000e-01, 3.16227766e-01, 1.00000000e+00, 3.16227766e+00,
                       1.00000000e+01]
    end

    #memory preallocation
    score_cv = zeros(length(sigma_list),length(lambda_list))
    H = zeros(b,b)
    hx_tr, hy_tr = [zeros(b,1) for i in 1:folds], [zeros(b,1) for i in 1:folds]
    hx_te, hy_te = [zeros(1,b) for i in 1:folds], [zeros(1,b) for i in 1:folds]
    theta = zeros(b)

    for (sigma_idx,sigma) in enumerate(sigma_list)
        #the expression of H is different for higher dimension
        set_H(H,CC_dist2,sigma,b)
        # NOTE(review): Tx, Ty, lx-Tx and ly-Ty are used as the fold sizes;
        # they are exact only when lx and ly are multiples of `folds`.
        set_htr(hx_tr,xTr_dist,sigma,Tx)
        set_htr(hy_tr,yTr_dist,sigma,Ty)
        set_hte(hx_te,xTe_dist,sigma,lx-Tx)
        set_hte(hy_te,yTe_dist,sigma,ly-Ty)
        for i in 1:folds
            h_tr = hx_tr[i] - hy_tr[i]
            h_te = hx_te[i] - hy_te[i]
            for (lambda_idx,lambda) in enumerate(lambda_list)
                # The solver overwrites its right-hand side, so it gets a copy
                # (h_tr is reused for every lambda), and the solution is taken
                # from the return value rather than from `theta`.
                sol = vec(set_theta(theta,H,lambda,copy(h_tr),b))
                score_cv[sigma_idx,lambda_idx] += dot(sol,H*sol) - 2*dot(sol,h_te)
            end
        end
    end

    #retrieve the value of the optimal parameters
    # score_cv is indexed (sigma_idx, lambda_idx): component 1 selects sigma,
    # component 2 selects lambda.
    best = findmin(score_cv)[2]
    sigma_chosen = sigma_list[best[1]]
    lambda_chosen = lambda_list[best[2]]

    #calculating the new "optimal" solution
    H = sqrt((sigma_chosen^2)*pi)*exp.(-CC_dist2/(4*sigma_chosen^2))
    H_lambda = H + lambda_chosen*Matrix{Float64}(I, b, b)
    h = (1/lx)*sum(exp.(-xC_dist2/(2*sigma_chosen^2)),dims = 1) - (1/ly)*sum(exp.(-yC_dist2/(2*sigma_chosen^2)),dims = 1)
    theta_final = H_lambda\transpose(h)
    L2 = 2*dot(theta_final,h) - dot(theta_final,H*theta_final)
    return L2
end
# Fill the leading b-by-b entries of H in place with
# sqrt(sigma^2*pi) * exp(-dist[i,j] / (4*sigma^2)).
# `b` accepts any Integer: the caller computes it with min(lx+ly, 300), which
# is an Int64 and was rejected by the former Int16 annotation.
function set_H(H::Array{Float64,2},dist::Array{Float64,2},sigma::Float64,b::Integer)
    scale = sqrt((sigma^2)*pi)
    denom = 4*sigma^2
    for j in 1:b, i in 1:b
        H[i,j] = scale*exp(-dist[i,j]/denom)
    end
    return nothing
end
# Solve (H + lambda*I) * sol = h and store the solution in `theta` in place.
#
# LAPACK.posv! overwrites both of its array arguments, so it works on a fresh
# system matrix and on a copy of `h`; the caller's `h` is left untouched.
# The former `theta = h` only rebound the local name and never changed the
# caller's vector; the values are now copied into `theta`.
# Returns the solution with the same shape as `h`.
function set_theta(theta::Array{Float64,1},H::Array{Float64,2},lambda::Float64,h::Array{Float64,2},b::Integer)
    Hl = H + lambda*Matrix{Float64}(I, b, b)
    rhs = copy(h)
    LAPACK.posv!('L', Hl, rhs)
    copyto!(theta, rhs)
    return rhs
end
# For every fold, write (1/T) * (column sums of exp(-dist / (2*sigma^2))) into
# the preallocated array h[fold], in place.
#
# `h` and `dists` are vectors holding one array per fold, which is how the
# body indexes them (h[CVidx][idx]) and how lsdd builds them. The former
# annotations (a flat Float64 vector, a single matrix, T::Int16) could not
# match those arguments.
function set_htr(h::AbstractVector{<:AbstractArray{Float64}},dists::AbstractVector{<:AbstractArray{Float64}},sigma::Real,T::Integer)
    for (CVidx,dist) in enumerate(dists)
        col_means = (1/T)*sum(exp.(-dist/(2*sigma^2)),dims = 1)
        copyto!(h[CVidx], col_means)
    end
    return nothing
end
# For every fold, write (1/T) * (column sums of exp(-dist / (2*sigma^2))) into
# the preallocated array h[fold], in place.
#
# `h` and `dists` are vectors holding one array per fold, which is how the
# body indexes them (h[CVidx][idx]) and how lsdd builds them; `sigma` is a
# single number. The former annotations (a flat Float64 vector, a single
# matrix, an array-valued sigma, T::Int16) could not match those arguments.
function set_hte(h::AbstractVector{<:AbstractArray{Float64}},dists::AbstractVector{<:AbstractArray{Float64}},sigma::Real,T::Integer)
    for (CVidx,dist) in enumerate(dists)
        col_means = (1/T)*sum(exp.(-dist/(2*sigma^2)),dims = 1)
        copyto!(h[CVidx], col_means)
    end
    return nothing
end
function set_h(h,h1,h2,b)
    # Store h1[i] - h2[i] in h[i] for the first b positions, in place.
    foreach(1:b) do i
        h[i] = h1[i] - h2[i]
    end
end
The set_H, set_h and set_theta functions are there because I read somewhere that modifying preallocated memory in place with a function was faster, but it did not make a great difference.
To test it, I use two random distribution as input data :
# Two random samples: x drawn from rand, y drawn from rand and scaled by 1.5.
x = rand(500)
y = 1.5*rand(500)
lsdd(x,y) #returns a value around 0.3
Now here is the version of the code where I try to use Optimizer :
# Cross-validation score for one (sigma, lambda) pair.
#
# Returns (score_cv, (CC_dist2, xC_dist2, yC_dist2)): the score summed over
# `folds` random train/test splits, and the squared-distance matrices built
# from the randomly drawn kernel centres (at most 300).
#
# Changes with respect to the earlier version:
# - `folds` accepts any Integer (callers pass the Int64 default 4, which the
#   Int8 annotation rejected);
# - the test indices start at Tx+1 / Ty+1 so that they no longer share an
#   element with the training indices 1:Tx / 1:Ty;
# - the test vectors are averages (divided by the test size), like the
#   training vectors, instead of being multiplied by the test size;
# - H + lambda*I does not depend on the fold and is factorised once.
function Theta(sigma::Float64,lambda::Float64,x::Array{Float64,1},y::Array{Float64,1},folds::Integer)
    lx,ly = length(x), length(y)
    b = min(lx+ly,300)
    C = shuffle(vcat(x,y))[1:b]
    CC_dist2 = squared_distance(C,C)
    xC_dist2, yC_dist2 = squared_distance(x,C), squared_distance(y,C)

    #every fold draws its own random split, so the test subsets of different
    #folds are not mutually exclusive
    Tx,Ty = lx - div(lx,folds), ly - div(ly,folds)
    shuffled_x, shuffled_y = [shuffle(1:lx) for i in 1:folds], [shuffle(1:ly) for i in 1:folds]
    tr_idx1,tr_idx2 = [i[1:Tx] for i in shuffled_x], [i[1:Ty] for i in shuffled_y]
    te_idx1,te_idx2 = [i[Tx+1:end] for i in shuffled_x], [i[Ty+1:end] for i in shuffled_y]
    xTr_dist, yTr_dist = [xC_dist2[i,:] for i in tr_idx1], [yC_dist2[i,:] for i in tr_idx2]
    xTe_dist, yTe_dist = [xC_dist2[i,:] for i in te_idx1], [yC_dist2[i,:] for i in te_idx2]

    H = sqrt((sigma^2)*pi)*exp.(-CC_dist2/(4*sigma^2))
    # Lower-triangle Cholesky factorisation, shared by all folds.
    F = cholesky(Symmetric(H + lambda*Matrix{Float64}(I, b, b), :L))

    hx_tr, hy_tr = [transpose((1/Tx)*sum(exp.(-dist/(2*sigma^2)),dims = 1)) for dist in xTr_dist], [transpose((1/Ty)*sum(exp.(-dist/(2*sigma^2)),dims = 1)) for dist in yTr_dist]
    hx_te, hy_te = [(1/(lx-Tx))*sum(exp.(-dist/(2*sigma^2)),dims = 1) for dist in xTe_dist], [(1/(ly-Ty))*sum(exp.(-dist/(2*sigma^2)),dims = 1) for dist in yTe_dist]

    score_cv = 0.0
    for i in 1:folds
        h_tr, h_te = hx_tr[i] - hy_tr[i], hx_te[i] - hy_te[i]
        theta = F \ Matrix(h_tr)
        score_cv += dot(theta,H*theta) - 2*dot(theta,h_te)
    end
    return score_cv,(CC_dist2,xC_dist2,yC_dist2)
end
# Objective for the optimizer: unpack params = [sigma, lambda] and return the
# cross-validation score computed by Theta.
# `folds` accepts any Integer; the caller passes the Int64 default 4, which the
# former Int8 annotation rejected.
function cost(params::Array{Float64,1},x::Array{Float64,1},y::Array{Float64,1},folds::Integer)
    s,l = params[1],params[2]
    return Theta(s,l,x,y,folds)[1]
end
"""
Performs the optimization
"""
function lsdd3(x::Array{Float64,1},y::Array{Float64,1}; folds = 4)
    # Starting point [sigma, lambda] handed to the optimizer.
    start = [1,0.1]
    b = min(length(x)+length(y),300)
    lx,ly = length(x),length(y)

    # Minimise the cross-validation score with BFGS, capped at 5 iterations and
    # 5 objective calls. Other attempts, kept for reference:
    #result = optimize(params -> cost(params,x,y,folds),fill(0.0,2),fill(50.0,2),start, Fminbox(LBFGS(linesearch=LineSearches.BackTracking())); autodiff = :forward)
    #bboptimize(rosenbrock2d; SearchRange = [(-5.0, 5.0), (-2.0, 2.0)])
    #result = optimize(cost,[0,0],[Inf,Inf],start, Fminbox(AcceleratedGradientDescent()))
    result = optimize(params -> cost(params,x,y,folds),start, BFGS(),Optim.Options(f_calls_limit = 5, iterations = 5))
    sigma_chosen,lambda_chosen = Optim.minimizer(result)

    # Theta is called once more only for the distance matrices it returns as
    # its second output.
    CC_dist2, xC_dist2, yC_dist2 = Theta(sigma_chosen,lambda_chosen,x,y,folds)[2]

    # Final fit with the selected parameters. The unused intermediate `f` of
    # the earlier version is no longer computed.
    H = sqrt((sigma_chosen^2)*pi)*exp.(-CC_dist2/(4*sigma_chosen^2))
    h = (1/lx)*sum(exp.(-xC_dist2/(2*sigma_chosen^2)),dims = 1) - (1/ly)*sum(exp.(-yC_dist2/(2*sigma_chosen^2)),dims = 1)
    theta_final = (H + lambda_chosen*Matrix{Float64}(I, b, b))\transpose(h)
    L2 = 2*dot(theta_final,h) - dot(theta_final,H*theta_final)
    return L2
end
No matter, which kind of option I use in the optimizer, I always end up with something too slow. Maybe the grid search is the best option, but I don't know how to make it faster... Does anyone have an idea how I could proceed further ?
[1] : http://www.mcduplessis.com/wp-content/uploads/2016/05/Journal-IEICE-2014-CLSDD-1.pdf
[2] : http://www.ms.k.u-tokyo.ac.jp/software.html

Probabilistic Record Linkage in Pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
# Two single-column frames ("A") holding the strings to be linked.
X = pd.DataFrame(["One", "Two", "Three"], columns=['A'])
Y = pd.DataFrame(["One", "To", "Free"], columns=['A'])
Method A
I have not yet fully understood the theory but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International journal of epidemiology, 45(3), pp.954-964.
Here is my attempt to implement it in Pandas:
# m: probability used for "matches are true matches"
m = 0.95
# u: the smaller frame length divided by the number of candidate pairs
u = min(len(X), len(Y)) / (len(X) * len(Y))

# Priors: the prior match probability is set equal to u
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr  # Prior odds of a match

# Combine the dataframes: a constant helper column joins every row of X with
# every row of Y; the helper column is dropped again afterwards.
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key').drop(columns='key')
X = X.drop(columns='key')
Y = Y.drop(columns='key')
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2``.

    The table is filled one row at a time, one row per character of the
    longer string, keeping only the previous row in memory.
    """
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    previous = range(len(shorter) + 1)
    for row, ch_long in enumerate(longer, start=1):
        current = [row]
        for col, ch_short in enumerate(shorter):
            if ch_short == ch_long:
                cost = previous[col]
            else:
                cost = 1 + min((previous[col], previous[col + 1], current[-1]))
            current.append(cost)
        previous = current
    return previous[-1]
# Each step wraps a scalar helper with np.vectorize and stores its result as a
# new float column of Z.
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])


# Max string length
def Max_string_length(X, Y):
    """Return the length of the longer of the two strings."""
    return max(len(X), len(Y))


M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])


# Agreement weight
def Agreement_weight(D, L):
    """Return 1 - D/L: 1 for identical strings, smaller as the distance grows."""
    return 1 - (D / L)


A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])


# Likelihood ratio
def Likelihood_ratio(C):
    """Interpolate linearly between (1-m)/(1-u) at C = 0 and m/u at C = 1."""
    return (m / u) - ((m / u) - ((1 - m) / (1 - u))) * (1 - C)


L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])


# Match weight
def Match_weight(G):
    """Return the base-2 logarithm of the likelihood ratio.

    The earlier expression log(G) * log(2) multiplied by log(2) instead of
    dividing by it, so column "R" was not a base-2 logarithm.
    """
    return math.log2(G)


M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])


# Posterior odds
def Posterior_odds(R):
    """Return 2**R times the prior odds, i.e. the inverse of Match_weight."""
    return 2 ** R * O_Pr


P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])


# Probability
def Probability(O):
    """Convert odds to a probability: O / (1 + O)."""
    return O / (1 + O)


Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. Here is a sensitivity check on m, showing that it doesn't make a lot of difference:
Method B
These assumptions won't apply to all applications but in some cases each row of X should match a row of Y. In that case:
The probabilities should sum to 1
If there are many credible candidates to match to then that should reduce the probability of getting the right one
then:
# Keep the original row label of X so candidate pairs can be grouped by it.
X["I"] = X.index

# Combine the dataframes: a constant helper column joins every row of X with
# every row of Y; the helper column is dropped again afterwards.
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key').drop(columns='key')
X = X.drop(columns='key')
Y = Y.drop(columns='key')
# Levenshtein distance
def Levenshtein_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2``.

    The table is filled one row at a time, one row per character of the
    longer string, keeping only the previous row in memory.
    """
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    previous = range(len(shorter) + 1)
    for row, ch_long in enumerate(longer, start=1):
        current = [row]
        for col, ch_short in enumerate(shorter):
            if ch_short == ch_long:
                cost = previous[col]
            else:
                cost = 1 + min((previous[col], previous[col + 1], current[-1]))
            current.append(cost)
        previous = current
    return previous[-1]
# Each step wraps a scalar helper with np.vectorize and stores its result as a
# new float column of Z.
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])


# Max string length
def Max_string_length(X, Y):
    """Return the length of the longer of the two strings."""
    return max(len(X), len(Y))


M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])


# Agreement weight
def Agreement_weight(D, L):
    """Return 1 - D/L: 1 for identical strings, smaller as the distance grows."""
    return 1 - (D / L)


A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])

# Normalised Agreement Weight: divide each pair's weight by the total weight
# of all candidate pairs that share the same X row ("I").
# The aggregation is named by the string "sum"; passing the builtin `sum`
# is deprecated in recent pandas releases.
T = Z.groupby('I').agg({'C': "sum"})
D = pd.DataFrame(T)
D.columns = ['T']
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
Comparing it against Method A:
Method C
This combines method A with method B:
# Normalised Probability: divide each pair's probability by the total
# probability of all candidate pairs that share the same X row ("I").
# NOTE(review): this needs Z to carry both the "P" column computed in Method A
# and the "I" column added in Method B - confirm that Method A's steps were
# re-run on the Z built in Method B.
# The aggregation is named by the string "sum"; passing the builtin `sum`
# is deprecated in recent pandas releases.
U = Z.groupby('I').agg({'P': "sum"})
E = pd.DataFrame(U)
E.columns = ['U']
K = Z.set_index('I').join(E)
K['P1'] = J['P1']
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take account of uncertainty whereas method C (P2) does.

ValueError: too many values to unpack (expected 4)

I am getting "ValueError: too many values to unpack (expected 4)" with the below code. Please help me!!
I am trying to lemmatize and cut off common words and then add to library so I can identify most common words and find the relationship between words.
def build_dataset(words, vocabulary_size, min_count=50, max_count=5000):
    """Build the vocabulary structures from an iterable of text lines.

    Every line is lower-cased, tokenised and lemmatised. Words whose total
    frequency lies strictly between ``min_count`` and ``max_count`` are kept.

    Returns a 4-tuple ``(data, count, dictionary, reverse_dictionary)``:

    * ``data``: for every kept word, its index in ``dictionary`` (0 if the
      word did not make it into the vocabulary);
    * ``count``: ``['UNK', n_unknown]`` followed by up to
      ``vocabulary_size - 1`` ``(word, frequency)`` pairs, most frequent first;
    * ``dictionary``: word -> index;
    * ``reverse_dictionary``: index -> word.

    The earlier version returned the kept-word list right after filtering,
    which left the rest of the body unreachable and made the 4-way unpacking
    at the call site fail with "too many values to unpack (expected 4)".
    """
    lexicon = []
    for l in words:
        lexicon.extend(word_tokenize(l.lower()))
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]

    w_counts = Counter(lexicon)
    word = [w for w in w_counts if max_count > w_counts[w] > min_count]
    print(len(word))

    # Rank the kept words by their real frequency. Counting the de-duplicated
    # `word` list itself would give every word a count of 1.
    count = [['UNK', -1]]
    kept_counts = Counter({w: w_counts[w] for w in word})
    count.extend(kept_counts.most_common(vocabulary_size - 1))

    dictionary = {}
    for l2, _ in count:
        dictionary[l2] = len(dictionary)

    data = []
    unk_count = 0
    for l2 in word:
        if l2 in dictionary:
            index = dictionary[l2]
        else:
            # Kept word that fell outside the vocabulary: map it to 'UNK'.
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)