ValueError: too many values to unpack (expected 4) - tensorflow

I am getting "ValueError: too many values to unpack (expected 4)" with the code below. Please help me!
I am trying to lemmatize the words, filter out the very common and very rare ones, and add the rest to a lexicon so I can identify the most common words and find the relationships between words.
def build_dataset(words, vocabulary_size):
    lexicon = []
    for l in words:
        all_words = word_tokenize(l.lower())
        lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    word = []
    for w in w_counts:
        if 5000 > w_counts[w] > 50:
            word.append(w)
    print(len(word))
    return word  # note: the function exits here, so everything below never runs
    count = [['UNK', -1]]
    count.extend(collections.Counter(word).most_common(vocabulary_size - 1))
    dictionary = dict()
    for l2, _ in count:
        dictionary[l2] = len(dictionary)
    data = list()
    unk_count = 0
    for l2 in word:
        if l2 in dictionary:
            index = dictionary[l2]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
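For reference, a minimal sketch (with a made-up word list) of how this ValueError arises: unpacking a sequence of more than four items into four names fails in exactly this way, which is what happens when the first return word hands back a plain word list.
# hypothetical lexicon with more than 4 entries
word = ['the', 'run', 'walk', 'dog', 'cat']
data, count, dictionary, reverse_dictionary = word
# ValueError: too many values to unpack (expected 4)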

Related

Vectorized alternative to iterrows: Semantic Analysis

Hi, I'm currently doing a semantic tweet analysis and want to improve my code's running time with NumPy vectorization.
I have tried to speed the code up for a while but have not been successful.
Could I just move the formula inside the loop into a function and apply it via numpy.vectorize?
ss = SentimentIntensityAnalyzer()
for index, row in tw_list["full_text"].iteritems():
    score = ss.polarity_scores(row)
    neg = score["neg"]
    neu = score["neu"]
    pos = score["pos"]
    comp = score["compound"]
    if neg > pos:
        tw_list.loc[index, "sentiment"] = "negative"
    elif pos > neg:
        tw_list.loc[index, "sentiment"] = "positive"
    else:
        tw_list.loc[index, "sentiment"] = "neutral"
    tw_list.loc[index, "neg"] = neg
    tw_list.loc[index, "neu"] = neu
    tw_list.loc[index, "pos"] = pos
    tw_list.loc[index, "compound"] = comp
Instead of iterating over the rows of the DataFrame, you can use the apply function.
def get_sentiments(text):
    score = ss.polarity_scores(text)
    neg = score["neg"]
    neu = score["neu"]
    pos = score["pos"]
    comp = score["compound"]
    if neg > pos:
        sentiment = "negative"
    elif pos > neg:
        sentiment = "positive"
    else:
        sentiment = "neutral"
    return sentiment, neg, neu, pos, comp

# result_type is an argument of DataFrame.apply, not Series.apply, so expand
# the returned tuples into columns explicitly:
tw_list[["sentiment", "neg", "neu", "pos", "comp"]] = pd.DataFrame(
    tw_list["full_text"].apply(get_sentiments).tolist(),
    columns=["sentiment", "neg", "neu", "pos", "comp"],
    index=tw_list.index,
)
This should give an improvement in performance.
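If the goal is mainly to avoid the repeated .loc writes, another option is to score everything first and build the columns in one go. This is only a sketch; it assumes tw_list is a pandas DataFrame and reuses the ss analyzer from the question.
import numpy as np
import pandas as pd

# Score every tweet once and expand the dicts returned by polarity_scores
scores = pd.DataFrame(tw_list["full_text"].apply(ss.polarity_scores).tolist(),
                      index=tw_list.index)
tw_list[["neg", "neu", "pos", "compound"]] = scores[["neg", "neu", "pos", "compound"]]
# Vectorised label selection instead of an if/elif/else per row
tw_list["sentiment"] = np.select(
    [scores["neg"] > scores["pos"], scores["pos"] > scores["neg"]],
    ["negative", "positive"],
    default="neutral",
)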

Fuzzy matching and iteration through DataFrame

I have these two DataFrames:
dico = {'Name': ['Arthur', 'Henri', 'Lisiane', 'Patrice'],
        "Age": ["20", "18", "62", "73"],
        "Studies": ['Economics', 'Maths', 'Psychology', 'Medical']
        }
dico2 = {'Surname': ['Henri2', 'Arthur1', 'Patrice4', 'Lisiane3']}
dico = pd.DataFrame.from_dict(dico)
dico2 = pd.DataFrame.from_dict(dico2)
I want to fuzzy match the Surname strings to the corresponding Names to get an output as follows:
Name Surname Age Studies
0 Arthur Arthur1 20 Economics
1 Henri Henri2 18 Maths
2 Lisiane Lisiane3 62 Psychology
3 Patrice Patrice4 73 Medical
and here is my code so far:
dico['Surname'] = []
for i in dico2:
    lst = [0, 0, 0]
    for j in dico:
        if lst[0] < fuzz.ratio(i, j):
            lst[0] = fuzz.ratio(i, j)
            lst[1] = i
            lst[2] = j
    dico['Surname'].append(i)
but I get a ValueError: Length of values (0) does not match length of index (4), and I don't understand why. Thanks!
dico = {'Name': ['Arthur', 'Henri', 'Lisiane', 'Patrice'],
        "Age": ["20", "18", "62", "73"],
        "Studies": ['Economics', 'Maths', 'Psychology', 'Medical']
        }
dico2 = {'Surname': ['Henri2', 'Arthur1', 'Patrice4', 'Lisiane3']}
dico = pd.DataFrame.from_dict(dico)
dico2 = pd.DataFrame.from_dict(dico2)
temp = pd.DataFrame()
for x in range(0, len(dico.Name)):
    name_str = dico.Name[x]
    temp = pd.concat([temp, dico2[dico2.Surname.str.contains(name_str)].Surname])
temp.columns = ['Surname']
temp = temp.reset_index(drop=True)
dico = pd.concat([dico, temp], axis=1)
Solution
map_list = []
for name in dico['Name']:
    best_ratio = None
    for idx, surname in enumerate(dico2['Surname']):
        if best_ratio is None:
            best_ratio = fuzz.ratio(name, surname)
            best_idx = 0
        else:
            ratio = fuzz.ratio(name, surname)
            if ratio > best_ratio:
                best_ratio = ratio
                best_idx = idx
    map_list.append(dico2['Surname'][best_idx])  # obtain surname

dico['Surname'] = pd.Series(map_list)            # add column
dico = dico[["Name", "Surname", "Age", "Studies"]]  # reorder columns
Result: this reproduces the desired output shown in the question.
The error originates from
dico['Surname'] = []
dico['Surname'] is length 4, while [] is length 0. You can instead collect your surnames in a list and then add the surnames to the dataframe in one go after the loop.
You also need to tell the outer loop to iterate over dico2['Surname'] instead of the entire dataframe.
surnames = []
for i in dico2['Surname']:
    lst = [0, 0, 0]
    for j in dico:
        if lst[0] < fuzz.ratio(i, j):
            lst[0] = fuzz.ratio(i, j)
            lst[1] = i
            lst[2] = j
    surnames.append(i)
dico['Surname'] = surnames
EDIT: this only fixes the error in question. Also see maxbachmann's advice on not calling fuzz.ratio twice.
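As a further sketch in that direction (assuming the RapidFuzz package is installed), process.extractOne scores all candidates in a single call, so fuzz.ratio is not evaluated twice per pair; the variable names reuse the question's dico and dico2:
from rapidfuzz import fuzz, process

surnames = []
for name in dico['Name']:
    # extractOne returns the best-scoring candidate for this name
    best = process.extractOne(name, dico2['Surname'].tolist(), scorer=fuzz.ratio)
    surnames.append(best[0])  # the matched surname string
dico['Surname'] = surnames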

How to multiply two sparse matrices by the sparse method with Python?

I tried to multiply two sparse matrices, but I had trouble deleting the extra rows that were all zeros. I used numpy.delete(my_matrix, [n], axis=0) and got this error:
index 4 is out of bounds for axis 0 with size 3
def mult_mat(mat1, mat2):
    col = mat1[0][1]
    row = mat2[0][0]
    row_mat1, row_mat2 = np.shape(mat1)[0], np.shape(mat2)[0]
    if col != row:
        return "Multiplication is not possible because the number" \
               " of columns in the first matrix is opposite of the" \
               " number of rows in the second matrix"
    my_matrix = np.array([[0] * 3] * (mat1[0][2] * mat2[0][2]))
    n = 0
    for r in range(1, row_mat1):
        for h in range(1, row_mat2):
            if mat1[r][1] == mat2[h][0]:
                my_matrix[n][0], my_matrix[n][1], my_matrix[n][2] = mat1[r][0], mat2[h][1], mat1[r][2] * mat2[h][2]
                n += 1
    row_my_matrix = np.shape(my_matrix)[0]
    for n in range(row_my_matrix):
        if my_matrix[n][0] == 0 & my_matrix[n][1] == 0 & my_matrix[n][2] == 0:
            my_matrix = np.delete(my_matrix, [n], axis=0)
    return my_matrix
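The out-of-bounds index comes from shrinking my_matrix with np.delete while the loop still runs over the original number of rows. A minimal sketch (with a made-up array) of dropping the all-zero rows with a boolean mask instead:
import numpy as np

my_matrix = np.array([[1, 2, 3],
                      [0, 0, 0],
                      [4, 5, 6],
                      [0, 0, 0]])
# Keep only the rows that are not entirely zero
my_matrix = my_matrix[~np.all(my_matrix == 0, axis=1)]
print(my_matrix)  # [[1 2 3]
                  #  [4 5 6]]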

Probabilistic Record Linkage in Pandas

I have two dataframes (X & Y). I would like to link them together and to predict the probability that each potential match is correct.
X = pd.DataFrame({'A': ["One", "Two", "Three"]})
Y = pd.DataFrame({'A': ["One", "To", "Free"]})
Method A
I have not yet fully understood the theory but there is an approach presented in:
Sayers, A., Ben-Shlomo, Y., Blom, A.W. and Steele, F., 2015. Probabilistic record linkage. International journal of epidemiology, 45(3), pp.954-964.
Here is my attempt to implement it in Pandas:
# Probability that Matches are True Matches
m = 0.95
# Probability that non-Matches are True non-Matches
u = min(len(X), len(Y)) / (len(X) * len(Y))

# Priors
M_Pr = u
U_Pr = 1 - M_Pr
O_Pr = M_Pr / U_Pr  # Prior odds of a match

# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key', axis=1)
X = X.drop('key', axis=1)
Y = Y.drop('key', axis=1)

# Levenshtein distance
def Levenshtein_distance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])

# Max string length
def Max_string_length(X, Y):
    return max(len(X), len(Y))

M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])

# Agreement weight
def Agreement_weight(D, L):
    return 1 - (D / L)

A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])

# Likelihood ratio
def Likelihood_ratio(C):
    return (m/u) - ((m/u) - ((1-m) / (1-u))) * (1-C)

L_R = np.vectorize(Likelihood_ratio, otypes=[float])
Z["G"] = L_R(Z['C'])

# Match weight
def Match_weight(G):
    return math.log(G) * math.log(2)

M_W = np.vectorize(Match_weight, otypes=[float])
Z["R"] = M_W(Z['G'])

# Posterior odds
def Posterior_odds(R):
    return math.exp(R / math.log(2)) * O_Pr

P_O = np.vectorize(Posterior_odds, otypes=[float])
Z["O"] = P_O(Z['R'])

# Probability
def Probability(O):
    return O / (1 + O)

Pro = np.vectorize(Probability, otypes=[float])
Z["P"] = Pro(Z['O'])
I have verified that this gives the same results as in the paper. A sensitivity check on m showed that it doesn't make a lot of difference.
Method B
These assumptions won't apply to all applications, but in some cases each row of X should match a row of Y. In that case:
- the probabilities should sum to 1
- if there are many credible candidates to match to, that should reduce the probability of getting the right one
Then:
X["I"] = X.index
# Combine the dataframes
X['key'] = 1
Y['key'] = 1
Z = pd.merge(X, Y, on='key')
Z = Z.drop('key',axis=1)
X = X.drop('key',axis=1)
Y = Y.drop('key',axis=1)
# Levenshtein distance
def Levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
L_D = np.vectorize(Levenshtein_distance, otypes=[float])
Z["D"] = L_D(Z['A_x'], Z['A_y'])
# Max string length
def Max_string_length(X, Y):
return max(len(X), len(Y))
M_L = np.vectorize(Max_string_length, otypes=[float])
Z["L"] = M_L(Z['A_x'], Z['A_y'])
# Agreement weight
def Agreement_weight(D, L):
return 1 - ( D / L )
A_W = np.vectorize(Agreement_weight, otypes=[float])
Z["C"] = A_W(Z['D'], Z['L'])
# Normalised Agreement Weight
T = Z .groupby('I') .agg({'C' : sum})
D = pd.DataFrame(T)
D.columns = ['T']
J = Z.set_index('I').join(D)
J['P1'] = J['C'] / J['T']
I then compared it against Method A.
Method C
This combines method A with method B:
# Normalised Probability
U = Z.groupby('I').agg({'P': sum})
E = pd.DataFrame(U)
E.columns = ['U']
K = Z.set_index('I').join(E)
K['P1'] = J['P1']
K['P2'] = K['P'] / K['U']
We can see that method B (P1) doesn't take account of uncertainty whereas method C (P2) does.

Append array to itself by repeated function calls

I'm trying to append to an array by repeatedly calling a function. When I put the append command in a loop directly, this works fine, but not when the loop calls the function that's supposed to do the appending.
import numpy as np

test_value = 555
i = 0
j = 0
test_array = np.empty([0, 3])

def test(test_value, i, j, test_array):
    test_temp = []
    test_temp.append(i)
    test_temp.append(j)
    test_temp.append(test_value)
    test_temp_1 = test_temp
    test_temp_2 = np.array(test_temp_1)
    test_temp_2 = np.reshape(test_temp_2, (1, 3))
    test_array = np.append(test_array, test_temp_2, axis=0)
    return test_array

for i in range(0, 10):
    i = i + 1
    j = j + 2
    test(test_value, i, j, test_array)

print("test array", test_array)
Ideally, test_array would get a new row added on each pass through the loop, but the final print shows that test_array stays empty.
Cheers
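One likely reason, shown here as a sketch that reuses the test function above: np.append builds and returns a new array rather than modifying test_array in place, so the caller has to keep the value the function returns.
test_array = np.empty([0, 3])
j = 0
for i in range(0, 10):
    j = j + 2
    # Rebind test_array to the array returned by test(); np.append never
    # changes its input argument in place.
    test_array = test(test_value, i + 1, j, test_array)
print("test array", test_array)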