Finding top 3 dominant topics for LDA topic model - pandas

I am creating a data table via this LDA modeling tutorial (https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/), and instead of just finding the single most dominant topic, I want to expand it to find the top 3 most dominant topics, along with each of their percent contributions and topic keywords.
To do that, is it best to create 2 additional functions that build 3 separate dataframes and append each of the results? Or is there a simpler way to modify the format_topics_sentences function to pull the top 3 topics from the enumerated bag-of-words corpus?
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)
[table output]

I had a similar requirement in a recent project; hopefully this helps you out. You will need to add the topic keywords to the code below (see the sketch after it for one way to do that):
topics_df1 = pd.DataFrame()
topics_df2 = pd.DataFrame()
topics_df3 = pd.DataFrame()
for i, row_list in enumerate(lda_model[corpus]):
    row = row_list[0] if lda_model.per_word_topics else row_list
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    for j, (topic_num, prop_topic) in enumerate(row):
        if len(row) >= 3:
            if j == 0:
                topics_df1 = topics_df1.append(pd.Series([int(topic_num), prop_topic]), ignore_index=True)
            elif j == 1:
                topics_df2 = topics_df2.append(pd.Series([int(topic_num), prop_topic]), ignore_index=True)
            elif j == 2:
                topics_df3 = topics_df3.append(pd.Series([int(topic_num), prop_topic]), ignore_index=True)
            else:
                break
        elif len(row) == 2:
            if j == 0:
                topics_df1 = topics_df1.append(pd.Series([int(topic_num), prop_topic]), ignore_index=True)
            elif j == 1:
                topics_df2 = topics_df2.append(pd.Series([int(topic_num), prop_topic]), ignore_index=True)
                topics_df3 = topics_df3.append(pd.Series(['-', '-']), ignore_index=True)
        elif len(row) == 1:
            topics_df1 = topics_df1.append(pd.Series([int(topic_num), prop_topic]), ignore_index=True)
            topics_df2 = topics_df2.append(pd.Series(['-', '-']), ignore_index=True)
            topics_df3 = topics_df3.append(pd.Series(['-', '-']), ignore_index=True)
topics_df1.rename(columns={0:'1st Topic', 1:'1st Topic Contribution'}, inplace=True)
topics_df2.rename(columns={0:'2nd Topic', 1:'2nd Topic Contribution'}, inplace=True)
topics_df3.rename(columns={0:'3rd Topic', 1:'3rd Topic Contribution'}, inplace=True)
topics_comb = pd.concat([topics_df1, topics_df2, topics_df3], axis=1, sort=False)
#Join topics dataframe to original data
new_df = pd.concat([data_ready, topics_comb], axis=1, sort=False)
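If you would rather keep a single function (and fold the topic keywords in directly), here is a rough sketch of how format_topics_sentences could be modified to collect the top three topics, their contributions and their keywords in one pass. The column names are only illustrative, and documents with fewer than three topics simply leave the remaining columns empty (NaN):

def format_topics_sentences_top3(ldamodel=None, corpus=None, texts=None):
    records = []
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda x: x[1], reverse=True)
        record = {}
        # Keep up to the three highest-contributing topics for this document
        for j, (topic_num, prop_topic) in enumerate(row[:3]):
            keywords = ", ".join(word for word, prop in ldamodel.show_topic(topic_num))
            record[f'Topic_{j+1}'] = int(topic_num)
            record[f'Topic_{j+1}_Contrib'] = round(prop_topic, 4)
            record[f'Topic_{j+1}_Keywords'] = keywords
        record['Text'] = texts[i]
        records.append(record)
    return pd.DataFrame(records)

df_top3 = format_topics_sentences_top3(ldamodel=lda_model, corpus=corpus, texts=data_ready)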


Error code 'could not convert string to float: 'PG-13'. How to fix it?

I am building a recommendation engine from a dataset from Kaggle.
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support

df = pd.read_csv("netflix.csv")
df = df.drop(["ratingdescription"], axis=1)
df = pd.get_dummies(df, columns=["rating_level"])
df = df.dropna()
df = df[['title', 'rating', 'release_year', 'user_rating_score', 'user_rating_size']]
df['title'] = df['title'].astype('category')
df['title'] = df['title'].cat.codes

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df.drop(['title'], axis=1))

def recommend(title, df, model_knn):
    query_index = df.loc[df['title'] == title].index.values[0]
    distances, indices = model_knn.kneighbors(df.loc[df['title'] == title].drop(['title'], axis=1), n_neighbors=6)
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == query_index:
            continue
        else:
            recommended_title = df.loc[df.index == indices.flatten()[i], 'title'].values[0]
            recommended_title = df.loc[df['title'] == recommended_title]['title'].cat.categories[recommended_title]
            print('Recommendation:', recommended_title)

def evaluate(title, df, model_knn):
    query_index = df.loc[df['title'] == title].index.values[0]
    distances, indices = model_knn.kneighbors(df.loc[df['title'] == title].drop(['title'], axis=1), n_neighbors=6)
    recommended_titles = []
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == query_index:
            continue
        else:
            recommended_title = df.loc[df.index == indices.flatten()[i], 'title'].values[0]
            recommended_titles.append(recommended_title)
    actual_titles = df.loc[df['rating'] == df.loc[df['title'] == title]['rating'].values[0], 'title']
    actual_titles = actual_titles.drop(query_index)
    actual_titles = [df.loc[df['title'] == title]['title'].cat.categories[title] for title in actual_titles]
    recommended_titles = [df.loc[df['title'] == title]['title'].cat.categories[title] for title in recommended_titles]
    precision, recall, _, _ = precision_recall_fscore_support(actual_titles, recommended_titles, average='macro')
    print('Precision:', precision)
    print('Recall:', recall)

recommend("The Shawshank Redemption", df, model_knn)
evaluate("The Shawshank Redemption", df, model_knn)
I have tried altering the code many times, but I either get this error or the error message "KeyError: 'rating_level'", which indicates that the column "rating_level" is not found in the dataframe **df**.
The error received is this:
ValueError: could not convert string to float: 'PG-13'
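For what it's worth, one likely cause suggested by the code above (a guess, not a confirmed fix): the 'rating' column still contains strings such as 'PG-13' when df.drop(['title'], axis=1) is passed to NearestNeighbors.fit, and scikit-learn estimators only accept numeric input. A minimal sketch of encoding it first, mirroring what is already done for 'title':

df['rating'] = df['rating'].astype('category').cat.codes  # turn 'PG-13' etc. into integer codes
model_knn.fit(df.drop(['title'], axis=1))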

Fetch data from several similarly named tables in DAE Databricks?

I'm using PySpark in DAE Databricks to get HES data.
At the moment I do this:
df_test = sqlContext.sql("select * from db_name_2122")
ICD10_codes = ['X85', 'X87']
df_test = df_test.filter((df_test.field1 == "something") &
                         (df_test.field.rlike('|'.join(ICD10_codes))))

df_test_2 = sqlContext.sql("select * from db_name_2021")
ICD10_codes = ['X85', 'X87']
df_test_2 = df_test_2.filter((df_test_2.field1 == "something") &
                             (df_test_2.field.rlike('|'.join(ICD10_codes))))
I have to do this for financial years 1112, 1213, 1314, ..., 2122. This is a lot of copy-pasting of similar code, and I know that is bad practice, both from experience of finding copy-paste errors and from what I've read.
What I want to do:
Select data where the same conditions are met in the same fields across 11 different financial-year tables within a DB, and pull it all into one table at the end, rather than what I'm doing now, which is 11 different but similar copy-and-paste chunks of code that are then appended together.
First, before the "appending" you can put all the dfs into the same list:
dfs = []
for i in range(11, 22):
    df = sqlContext.sql(f"select * from db_name_{i}{i+1}")
    ICD10_codes = ['X85', 'X87']
    df = df.filter((df.field1 == "something") &
                   (df.field.rlike('|'.join(ICD10_codes))))
    dfs.append(df)
And then do the "append". I am not sure what you mean by "append", but this is how you could do a unionByName:
df_final = dfs[0]
for df in dfs[1:]:
    df_final = df_final.unionByName(df)
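Equivalently, the same union can be written in one line with functools.reduce (just a sketch of an alternative to the explicit loop above):

from functools import reduce
df_final = reduce(lambda a, b: a.unionByName(b), dfs)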
Everything can be added into one loop:
ICD10_codes = ['X85','X87']
rng = range(11, 22)
for i in rng:
    df = sqlContext.sql(f"select * from db_name_{i}{i+1}")
    df = df.filter((df.field1 == "something") &
                   (df.field.rlike('|'.join(ICD10_codes))))
    df_final = df if i == rng[0] else df_final.unionByName(df)
If your table names are more complex, you can put them directly into the list:
tables = ['db_name_1112_text1', 'db_name_1213_text56']
ICD10_codes = ['X85','X87']
for x in tables:
    df = sqlContext.sql(f"select * from {x}")
    df = df.filter((df.field1 == "something") &
                   (df.field.rlike('|'.join(ICD10_codes))))
    df_final = df if x == tables[0] else df_final.unionByName(df)

Function giving error when run on the same dataframe more than once

The function below gives an error when run on the same data frame more than once. It works fine the first time, but when run again on the same df it gives me this error:
IndexError: single positional indexer is out-of-bounds
def update_data(df):
    df.drop(df.columns[[-1, -2, -3]], axis=1, inplace=True)
    df.loc['Total'] = df.sum()
    df.iloc[-1, 0] = 'Group'
    df = df.set_index(list(df)[0])
    for i in range(1, 21):
        df.iloc[-1, i] = 100 + (100 * (
            (df.iloc[-1, i] - df.iloc[-1, 0]) / abs(df.iloc[-1, 0])))
    df.iloc[-1, 0] = 100
    xax = list(df.columns.values)
    yax = df.values[-1].tolist()
    d = {'period': xax, 'level': yax}
    index_level = pd.DataFrame(d)
    index_level['level'] = index_level['level'].round(3)
    return index_level
Using inplace=True inside a function changes the input data frame. Of course it doesn't work the second time: your function assumes the data is in a particular format when it starts, and after the first call that assumption is broken. A small demonstration:
df = pd.DataFrame([{'x': 0}])
def change(df):
    df.drop(columns=['x'], inplace=True)
    return len(df)
change(df)
Out[346]: 1
df
Out[347]:
Empty DataFrame
Columns: []
Index: [0]
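A minimal sketch of one way to make update_data safe to call repeatedly (assuming the caller does not actually need its DataFrame mutated): take a copy at the top and drop the inplace flag, so every call starts from the same input:

def update_data(df):
    df = df.copy()                                   # work on a copy; the caller's df stays intact
    df = df.drop(df.columns[[-1, -2, -3]], axis=1)   # same drop, but not inplace
    df.loc['Total'] = df.sum()
    # ... rest of the function unchanged ...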

More efficient fillna(numpy)

I need an array (NumPy) version of a function similar to pandas.fillna. From answers collected on this forum I put together the following function, but it is still about 3 times slower than pandas.fillna. I want to know if there is a better way to optimize it, thank you.
import numpy as np

def fillna(self, axis=None, mask=None, value=None, method='pad'):
    """ array fillna
    Parameters
    ----------
    self : 1d/2d
    axis : axis (0 or 1)
    mask : custom mask, or built with np.isfinite(x)
    value : int
    method : 'back', 'pad', 'mean'
    --------
    """
    x = np.asarray(self)
    if mask is None: mask = np.isfinite(x)
    if (value is not None) | (method == 'mean'):
        out = x.copy()
        if x.ndim == 1:
            if method == 'mean':
                out[~mask] = np.nanmean(x)
            else: out[~mask] = value
        else:
            vask = ~mask * (np.nanmean(x, 1)[:, None] if axis == 1 else np.nanmean(x, 0))
            out[~mask] = vask[~mask]
    else:
        if axis is None: axis = 0
        if x.ndim == 1:
            if method == 'pad':
                idx = np.where(mask, np.arange(mask.shape[0]), 0)
                np.maximum.accumulate(idx, axis=0, out=idx)
                return x[idx]
            elif method == 'back':
                idx = np.where(mask[::-1], np.arange(mask.shape[0]), 0)
                np.maximum.accumulate(idx, axis=0, out=idx)
                return x[mask.shape[0] - idx[::-1] - 1]
            else: return x
        if axis == 1:
            if method == 'back': mask = mask[:, ::-1]
            idx = np.where(mask, np.arange(mask.shape[1]), 0)
        else:
            if method == 'back': mask = mask[::-1, :]
            idx = np.where(mask, np.arange(mask.shape[0])[:, None], 0)
        np.maximum.accumulate(idx, axis=axis, out=idx)
        if axis == 1:
            if method == 'back': idx = idx.shape[1] - idx[:, ::-1] - 1
            out = x[np.arange(idx.shape[0])[:, None], idx]
        else:
            if method == 'back': idx = idx.shape[0] - idx[::-1, :] - 1
            out = x[idx, np.arange(idx.shape[1])]
    return out
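For reference, a small usage example of the function above (my own illustration, not part of the original post):

a = np.array([np.nan, 1.0, np.nan, 3.0])
fillna(a, method='pad')   # -> array([nan, 1., 1., 3.]); the leading NaN has nothing to pad from
fillna(a, value=0.0)      # -> array([0., 1., 0., 3.])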

pandas: map color argument by multidict

I would like to map a color to each row in the dataframe as a function of two columns. It would be much easier with just one column as the argument, but how can I achieve this with two columns?
What I have done so far:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

a = np.random.rand(3, 10)
i = [[30, 10], [10, 30], [60, 60]]
names = ['a', 'b']
index = pd.MultiIndex.from_tuples(i, names=names)
df = pd.DataFrame(a, index=index).reset_index()

c1 = plt.cm.Greens(np.linspace(0.2, 0.8, 3))
c2 = plt.cm.Blues(np.linspace(0.2, 0.8, 3))
#c3 = plt.cm.Reds(np.linspace(0.2, 0.8, 3))
color = np.vstack((c1, c2))

a = df.a.sort_values().values
b = df.b.sort_values().values
mapping = dict()
for i in range(len(a)):
    mapping[a[i]] = {}
    for ii in range(len(b)):
        mapping[a[i]][b[ii]] = color[i + ii]
Maybe something similar to df['color'] = df.apply(lambda x: mapping[x.a][x.b])?
Looks like you answered your own question. apply can work across the rows by changing the axis argument to 1:
df['color'] = df.apply(lambda x: mapping[x.a][x.b], axis=1)
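If apply proves awkward because the mapped values are RGBA arrays, a plain list comprehension over the two columns is a simple alternative (a sketch, using the same mapping dict as above):

df['color'] = [mapping[row_a][row_b] for row_a, row_b in zip(df['a'], df['b'])]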