Convert topicmodels output to JSON - text-mining

I use the following function to convert topicmodels output to the JSON format that LDAvis expects.
topicmodels_json_ldavis <- function(fitted, corpus, doc_term){
  ## Required packages
  library(topicmodels)
  library(dplyr)
  library(stringi)
  library(tm)
  library(LDAvis)

  ## Find required quantities
  phi <- posterior(fitted)$terms %>% as.matrix
  theta <- posterior(fitted)$topics %>% as.matrix
  vocab <- colnames(phi)
  doc_length <- vector()
  for (i in 1:length(corpus)) {
    temp <- paste(corpus[[i]]$content, collapse = ' ')
    doc_length <- c(doc_length, stri_count(temp, regex = '\\S+'))
  }
  temp_frequency <- inspect(doc_term)
  freq_matrix <- data.frame(ST = colnames(temp_frequency),
                            Freq = colSums(temp_frequency))
  rm(temp_frequency)

  ## Convert to JSON
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                 vocab = vocab,
                                 doc.length = doc_length,
                                 term.frequency = freq_matrix$Freq)
  return(json_lda)
}
But I receive the following error:
Error in LDAvis::createJSON(phi = phi, theta = theta, vocab = vocab, doc.length = doc_length, : Length of doc.length not equal
to the number of rows in theta; both should be equal to the number of
documents in the data.
Here is my complete code:
data <- read.csv("textmining.csv")
corpus <- Corpus(DataframeSource(data.frame(data$reasonforleaving)))
# Remove punctuations and numbers because they are generally uninformative.
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Convert all words to lowercase.
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stopwords such as "a", "the", etc.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Use the SnowballC package to do stemming.
library(SnowballC)
corpus <- tm_map(corpus, stemDocument)
# remove extra words
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "still")
corpus <- tm_map(corpus, toSpace, "also")
# Remove excess white spaces between words.
corpus <- tm_map(corpus, stripWhitespace)
# Inspect the first document to see what it looks like.
corpus[[1]]$content
dtm <- DocumentTermMatrix(corpus)
# remove empty documents
library(slam)
dtm = dtm[row_sums(dtm)>0,]
# Use topicmodels package to conduct LDA analysis.
burnin <- 500
iter <- 1000
keep <- 30
k <- 5
result55 <- LDA(dtm, 5)
ldaoutput <- topicmodels_json_ldavis(result55, corpus, dtm)
Do you know why I receive the error?
Thanks

I had the same issue with the same code, and found this function here:
topicmodels2LDAvis <- function(x, ...){
  post <- topicmodels::posterior(x)
  if (ncol(post[["topics"]]) < 3) stop("The model must contain > 2 topics")
  mat <- x@wordassignments
  LDAvis::createJSON(
    phi = post[["terms"]],
    theta = post[["topics"]],
    vocab = colnames(post[["terms"]]),
    doc.length = slam::row_sums(mat, na.rm = TRUE),
    term.frequency = slam::col_sums(mat, na.rm = TRUE)
  )
}
It is much simpler to use; just pass your LDA result as the argument:
result55 <- LDA(dtm, 5)
serVis(topicmodels2LDAvis(result55))

Problem
Your problem is the loop over for (i in 1:length(corpus)) in
doc_length <- vector()
for (i in 1:length(corpus)) {
  temp <- paste(corpus[[i]]$content, collapse = ' ')
  doc_length <- c(doc_length, stri_count(temp, regex = '\\S+'))
}
Remember, you removed some "empty" documents from your DocumentTermMatrix with dtm = dtm[row_sums(dtm)>0,], but the loop still runs over the full corpus, so doc_length ends up longer than the number of rows in theta, which is exactly what the error message says.
Suggestion
You may want to keep a vector of the empty docs around as it will help you not only to generate the JSON but also to go back and forth between your empty and full document sets.
doc.length = colSums( as.matrix(tdm) > 0 )[!empty.docs]
My suggestion assumes you have the full tdm, with the empty documents still in place.
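For illustration, here is a minimal sketch of that bookkeeping; the names dtm_full and corpus_kept are placeholders, and it assumes dtm_full is the DocumentTermMatrix built before the empty documents were dropped:
library(slam)
# Mark the documents that contain no terms
empty.docs <- slam::row_sums(dtm_full) == 0
# Token counts per kept (non-empty) document, read straight off the matrix
doc_length <- slam::row_sums(dtm_full)[!empty.docs]
# The same flag lets you subset the corpus so it lines up with theta again
corpus_kept <- corpus[!empty.docs]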

Related

tidymodels: "following required column is missing from `new_data` in step..."

I'm creating and fitting a workflow for a lasso regression model in {tidymodels}. The model fits fine, but when I go to predict the test set I get an error saying "the following required column is missing from `new_data`". That column ("price") is in both the train and test sets. Is this a bug? What am I missing?
Any help would be greatly appreciated.
# split the data (target variable in house_sales_df is "price")
split <- initial_split(house_sales_df, prop = 0.8)
train <- split %>% training()
test <- split %>% testing()

# create and fit workflow
lasso_prep_recipe <-
  recipe(price ~ ., data = train) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric())

lasso_model <-
  linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("glmnet")

lasso_workflow <- workflow() %>%
  add_recipe(lasso_prep_recipe) %>%
  add_model(lasso_model)

lasso_fit <- lasso_workflow %>%
  fit(data = train)

# predict test set
predict(lasso_fit, new_data = test)
predict() results in this error:
Error in `step_normalize()`:
! The following required column is missing from `new_data` in step 'normalize_MXQEf': price.
Backtrace:
1. stats::predict(lasso_fit, new_data = test, type = "numeric")
2. workflows:::predict.workflow(lasso_fit, new_data = test, type = "numeric")
3. workflows:::forge_predictors(new_data, workflow)
5. hardhat:::forge.data.frame(new_data, blueprint = mold$blueprint)
7. hardhat:::run_forge.default_recipe_blueprint(...)
8. hardhat:::forge_recipe_default_process(...)
10. recipes:::bake.recipe(object = rec, new_data = new_data)
12. recipes:::bake.step_normalize(step, new_data = new_data)
13. recipes::check_new_data(names(object$means), object, new_data)
14. cli::cli_abort(...)
You are getting the error because all_numeric() in step_normalize() selects the outcome price, which isn't available at predict time. Use all_numeric_predictors() and you should be good.
# split the data (target variable in house_sales_df is "price")
split <- initial_split(house_sales_df, prop = 0.8)
train <- split %>% training()
test <- split %>% testing()

# create and fit workflow
lasso_prep_recipe <-
  recipe(price ~ ., data = train) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())

lasso_model <-
  linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("glmnet")

lasso_workflow <- workflow() %>%
  add_recipe(lasso_prep_recipe) %>%
  add_model(lasso_model)

lasso_fit <- lasso_workflow %>%
  fit(data = train)

# predict test set
predict(lasso_fit, new_data = test)
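If you then want to evaluate the fit on the test set, a common follow-up (a sketch, assuming the yardstick package that the tidymodels meta-package attaches) is to bind the predictions back onto test and compute a metric such as RMSE:
library(yardstick)
# Put the observed price next to the .pred column returned by predict()
test_results <- predict(lasso_fit, new_data = test) %>%
  bind_cols(test)
test_results %>%
  rmse(truth = price, estimate = .pred)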

For loop to read in multiple tables from SQLite database

I would like to create a for loop that reads in multiple tables from a SQLite database. I would like it to read the first 300 tables, but ideally I would like to get it to read 300 random tables from my database into R.
For each table read in, I would like it to go through the written code, save the graph at the end, and then start over with a new table. If possible I would like all of the tables to be on the same graph. I have written the code for a single table, but I am unsure how to proceed from here.
for (i in 1:300){
  # Reads the selected table in database
  ind1 <- dbReadTable(mydb, i)
  # Formats the SQL data to appropriate R data structure
  cols <- c("Mortality", "AnimalID", "Species", "Sex", "CurrentCohort",
            "BirthYear", "CaptureUnit", "CaptureSubunit",
            "CaptureArea", "ProjectName")
  ind1[cols] <- lapply(ind1[cols], factor) ## as.factor() could also be used
  ind1$DateAndTime <- as.POSIXct(ind1$DateAndTime, tz = "UTC",
                                 origin = '1970-01-01')
  # Converts the Longitude and Latitude to UTMs
  ind <- convert_utm(ind1)

  ind_steps <- ind %>%
    # It's always a good idea to *double check* that your data are sorted
    # properly before using lag() or lead() to get the previous/next value.
    arrange(AnimalID, DateAndTime) %>%
    # If we group_by() AnimalID, lead() will insert NAs in the proper
    # places when we get to the end of one individual's data and the beginning
    # of the next
    group_by(AnimalID) %>%
    # Now rename our base columns to reflect that they are the step's start point
    rename(x1 = utm_x,
           y1 = utm_y,
           t1 = DateAndTime) %>%
    # Attach the step's end point
    mutate(x2 = lead(x1),
           y2 = lead(y1),
           t2 = lead(t1)) %>%
    # Calculate differences in space and time
    mutate(dx = x2 - x1,
           dy = y2 - y1,
           DateAndTime = as.numeric(difftime(t2, t1, units = "hours"))) %>%
    # Calculate step length
    mutate(sl = sqrt(dx^2 + dy^2)) %>%
    # Calculate absolute angle
    mutate(abs_angle = (pi/2 - atan2(dy, dx)) %% (2*pi)) %>%
    # Calculate relative angle
    mutate(rel_diff = (abs_angle - lag(abs_angle)) %% (2*pi),
           rel_angle = ifelse(rel_diff > pi, rel_diff - 2*pi, rel_diff)) %>%
    # Drop this unnecessary column
    select(-rel_diff) %>%
    # Drop incomplete final step
    filter(!is.na(x2))

  ind_steps <- ind_steps %>%
    mutate(NSD = (x2 - x1[1])^2 + (y2 - y1[1])^2)

  # Plot NSD
  ind_steps %>%
    ggplot(aes(x = t2, y = NSD)) +
    geom_line() +
    theme_bw()
}
Any help would be greatly appreciated!
If there are, say, 1000 tables, you can use sample on the table names (from dbListTables) to pick 300 of them at random, create a list of length 300 to store the plots, and if you want to plot them together you can use cowplot::plot_grid.
# Pick 300 table names at random from the database
random_tables <- sample(dbListTables(mydb), 300)
plot_list <- vector('list', 300)

for (i in seq_along(random_tables)){
  # Reads the selected table in database
  ind1 <- dbReadTable(mydb, random_tables[i])
  #...Rest of the code
  #....
  #....
  # Plot NSD
  plot_list[[i]] <- ggplot(ind_steps, aes(x = t2, y = NSD)) +
    geom_line() + theme_bw()
}
cowplot::plot_grid(plotlist = plot_list, nrow = 30, ncol = 10)
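If the goal is one combined graph rather than a grid of 300 panels, a sketch of an alternative is to store each table's processed data instead of its plot and draw a single ggplot grouped by table. This assumes each iteration saved its ind_steps into a list, e.g. steps_list[[i]] <- mutate(ind_steps, table_id = random_tables[i]):
library(dplyr)
library(ggplot2)
# Stack the per-table data frames and draw one line per table
all_steps <- bind_rows(steps_list)
ggplot(all_steps, aes(x = t2, y = NSD, group = table_id, colour = factor(table_id))) +
  geom_line(show.legend = FALSE) +
  theme_bw()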

replicate stargazer output like for linear models with spatial models

I am trying to reproduce the nice stargazer output we get for lm models, but for a model that is not supported by stargazer.
Can the linear-model stargazer output be produced by hand? We can create a data frame from every model and then pass the created data frame to stargazer:
library(spdep)
library(spatialreg)   # lagsarlm
library(dplyr)
library(stargazer)

data(afcon, package = "spData")
afcon$Y = rnorm(42, 50, 20)
cns <- knearneigh(cbind(afcon$x, afcon$y), k = 7, longlat = T)
scnsn <- knn2nb(cns, row.names = NULL, sym = T)
W <- nb2listw(scnsn, zero.policy = TRUE)

ols <- lm(totcon ~ Y, data = afcon)
spatial.lag <- lagsarlm(totcon ~ Y, data = afcon, W)

summary(ols)
stargazer(ols, type = "text")
summary(spatial.lag)

data.frame(
  spatial.lag$coefficients,
  spatial.lag$rest.se
) %>%
  rename(coeffs = spatial.lag.coefficients,
         se = spatial.lag.rest.se) %>%
  stargazer(type = "text", summary = F)
When we do stargazer(ols) the output is very nice. I would like to reproduce the same output by hand for spatial.lag. Is there a way to do so, including the superscript significance stars, etc.?
You mean ^{*}? If so, it's not possible in stargazer. I've already tried it, so I recommend you check the xtable package like I did here.
I will show one approach that can be used: stargazer is really nice and you CAN create a table like the one above even with model objects that are not yet supported. For example, let's pretend that quantile regression models were not supported by stargazer (even though they are).
The trick is that you need to be able to obtain the coefficients and standard errors, e.g. as vectors. Then supply stargazer with a model object that is supported (e.g. lm) as a template, and mechanically specify which coefficients and standard errors should be used:
library(stargazer)
library(tidyverse)
library(quantreg)

df <- mtcars

model1 <- lm(hp ~ factor(gear) + qsec + disp, data = df)
quantreg <- rq(hp ~ factor(gear) + qsec + disp, data = df)
summary_qr <- summary(quantreg, se = "boot")

# Standard errors for the quantile regression, entered by hand
se_qr = c(211.78266, 29.17307, 58.61105, 9.70908, 0.12090)

stargazer(model1, model1,
          coef = list(NULL, summary_qr$coefficients[, 1]),
          se = list(NULL, se_qr),
          type = "text")
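The same trick should carry over to the spatial lag model from the question: keep the plain ols fit as the template and hand stargazer the coefficients and standard errors pulled out of spatial.lag (a sketch, assuming the lagsarlm object exposes coefficients and rest.se as in the question's code):
# ols serves only as a template; the reported numbers come from spatial.lag
lag_coefs <- spatial.lag$coefficients
lag_se    <- spatial.lag$rest.se
stargazer(ols, ols,
          coef = list(NULL, lag_coefs),
          se = list(NULL, lag_se),
          column.labels = c("OLS", "Spatial lag"),
          type = "text")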

Error while finding topics quantity on Latent Dirichlet Allocation model using ldatuning library

The code below produces the error shown at the end. I can tell it happens because there is at least one document without any terms, but I don't understand why, or how I can solve it.
library(text2vec)
library(stringr)
library(magrittr)   # for %>%

prep_fun = function(x) {
  x %>%
    str_to_lower %>%                            # make text lower case
    str_replace_all("[^[:alpha:]]", " ") %>%    # remove non-alphabetic symbols (punctuation, digits, #)
    str_replace_all("\\s+", " ") %>%            # collapse multiple spaces
    str_replace_all("\\W*\\b\\w\\b\\W*", " ")   # remove single-letter words
}

tok_fun <- function(x) {
  tokens <- word_tokenizer(x)
  textstem::lemmatize_words(tokens)
}

it_patentes <- itoken(data$Abstract,
                      preprocessor = prep_fun,
                      tokenizer = tok_fun,
                      ids = data$id,
                      progressbar = F)

vocab <- create_vocabulary(it_patentes, ngram = c(ngram_min = 1L, ngram_max = 3L),
                           stopwords = tm::stopwords("english"))
pruned_vocab <- prune_vocabulary(vocab, term_count_min = max(vocab$term_count)*.01,
                                 doc_proportion_min = 0.001)
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm <- create_dtm(it_patentes, vectorizer, type = "dgTMatrix", progressbar = FALSE)
> #Plot the metrics to get number of topics
> t1 <- Sys.time()
> tunes <- FindTopicsNumber(
+ dtm = dtm,
+ topics = c(2:25),
+ metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010"),
+ method = "Gibbs",
+ control = list(seed = 17),
+ mc.cores = 4L,
+ verbose = TRUE
+ )
fit models...Error in checkForRemoteErrors(val) :
4 nodes produced errors; first error: Each row of the input matrix needs to contain at least one non-zero entry
> print(difftime(Sys.time(), t1, units = 'sec'))
Time difference of 9.155343 secs
> FindTopicsNumber_plot(tunes)
Error in base::subset(values, select = 2:ncol(values)) :
object 'tunes' not found
Even though I know ldatuning is made for topicmodels, I don't think it should make a huge difference just to get a number of topics to start testing with, should it?
ldatuning expects the input dtm matrix in a different format (the format used by the topicmodels package). You need to convert your dtm (a sparse matrix from the Matrix package) into a format that ldatuning can understand.
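For illustration, a sketch of that conversion, which also drops the documents that ended up empty after pruning (the likely source of the "at least one non-zero entry" error). It repacks the 0-based triplet slots of the dgTMatrix into the slam/tm representation that topicmodels, and hence ldatuning, can work with:
library(Matrix)
library(slam)
library(tm)
# Drop documents with no remaining terms after vocabulary pruning
dtm2 <- dtm[Matrix::rowSums(dtm) > 0, ]
# Subsetting can return a column-compressed matrix, so force triplet form back
dtm2 <- as(dtm2, "TsparseMatrix")
# Repack into a simple_triplet_matrix and tag it as a term-frequency DTM
dtm_stm <- slam::simple_triplet_matrix(i = dtm2@i + 1, j = dtm2@j + 1, v = dtm2@x,
                                       nrow = nrow(dtm2), ncol = ncol(dtm2),
                                       dimnames = dimnames(dtm2))
dtm_tm <- tm::as.DocumentTermMatrix(dtm_stm, weighting = tm::weightTf)
# dtm_tm can now be passed to FindTopicsNumber()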

shiny sliderInput range minimum and maximum values

I need to adjust a histogram output in shiny with a range of values:
(where m is an arbitrary matrix)
#ui.R
sliderInput(inputId = "adjust", label = "Choose adjacency threshold",
            value = c(0.001, 0.9), min = 0.0001, max = 1),
plotOutput("hist")
#server.R
df <- reactive({
  idx <- m > min(input$adjust) & m < max(input$adjust)
  data.frame(
    id = row(m)[idx],
    value = m[idx])
})
output$hist <- renderPlot({hist(df()$value)})
However this doesn't seem to affect the histogram: it re-renders when I toggle the slider but it's the same each time. It also takes a very long time and just seems to take all of the values into account.
Does anyone know how to make this work?
When I try to print the slider's min/max value, nothing appears on the page:
#ui.R
verbatimTextOutput("x")
#server
output$x<-renderPrint({min(input$adjust)})
Hence I might be approaching this in completely the wrong way... does anyone know how to do this?
FULL EXAMPLE
library(shiny)
runApp(list(
  ui = fluidPage(
    sliderInput(inputId = "adjust", label = "Choose adjacency threshold",
                value = c(0.001, 0.9), min = 0.0001, max = 1),
    plotOutput("hist")
  ),
  server = function(input, output){
    adjacentmat <- reactive({adjacency(dat)})
    data <- reactive({
      adj_mat <- adjacentmat()
      adj_mat[adj_mat < input$adjust] <- 0
      m <- adj_mat
      idx <- m > min(input$adjust) & m < max(input$adjust)
      data.frame(
        source = row(m)[idx],
        target = col(m)[idx],
        corr = m[idx])
    })
    output$hist <- renderPlot({hist(data()$corr)})
  }
))
Generate the dat variable with the following code:
library('dplyr')
set.seed(1)
# generate a couple clusters
nodes_per_cluster <- 30
n <- 10
nvals <- nodes_per_cluster * n
# cluster 1 (increasing)
cluster1 <- matrix(rep((1:n)/4, nodes_per_cluster) +
                     rnorm(nvals, sd = 1),
                   nrow = nodes_per_cluster, byrow = TRUE)
# cluster 2 (decreasing)
cluster2 <- matrix(rep((n:1)/4, nodes_per_cluster) +
                     rnorm(nvals, sd = 1),
                   nrow = nodes_per_cluster, byrow = TRUE)
# noise cluster
noise <- matrix(sample(1:2, nvals, replace = TRUE) +
                  rnorm(nvals, sd = 1.5),
                nrow = nodes_per_cluster, byrow = TRUE)
dat <- rbind(cluster1, cluster2, noise)
colnames(dat) <- paste0('n', 1:n)
rownames(dat) <- c(paste0('cluster1_', 1:nodes_per_cluster),
                   paste0('cluster2_', 1:nodes_per_cluster),
                   paste0('noise_', 1:nodes_per_cluster))
This works for me:
library(shiny)
runApp(list(ui = fluidPage(
mainPanel(sliderInput("test", "Select values", value= c(.001,.9), min= 0.0001, max= 1)),
verbatimTextOutput("test2")
),
server = function(input, output, session) {
output$test2 <- renderPrint(min(input$test))
}))
I'm guessing your problem is somewhere in the code you haven't shown us. Can you give the code for the entire running example of your problem?
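As a side note, here is a self-contained sketch of the filter-and-histogram part of the question, with a random matrix m standing in for the adjacency matrix (everything else follows the question's own code):
library(shiny)
m <- matrix(runif(100 * 100), nrow = 100)   # stand-in for the adjacency matrix
runApp(list(
  ui = fluidPage(
    sliderInput(inputId = "adjust", label = "Choose adjacency threshold",
                value = c(0.001, 0.9), min = 0.0001, max = 1),
    plotOutput("hist")
  ),
  server = function(input, output){
    df <- reactive({
      # Keep only the matrix entries that fall inside the selected range
      idx <- m > min(input$adjust) & m < max(input$adjust)
      data.frame(id = row(m)[idx], value = m[idx])
    })
    output$hist <- renderPlot({hist(df()$value)})
  }
))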