Add new column to MonetDBLite table - sql

I am trying to use R + MonetDB as a large-data analysis stack and am having trouble creating a new column and populating it with data from my analysis. Here is a toy example:
library(MonetDBLite)
library(DBI)
data(mtcars)
db <- dbConnect(MonetDB.R::MonetDB(), embedded="./test.db")
# load mtcars into the database
dbWriteTable(conn=db, value = mtcars, name = "mtcars", overwrite=TRUE)
## Add a new column
dbSendQuery(db, "ALTER TABLE mtcars ADD v1 DOUBLE;")
## insert data into the new column
v1 <- mtcars["mpg"] * pi
dbSendQuery(db, "INSERT INTO mtcars (v1) VALUES (?)", bind.data=v1)
And the error message:
Error in .local(conn, statement, ...) :
Unable to execute statement 'INSERT INTO mtcars (v1) VALUES ('c(65.9734457253857, 65.9734457253857, 71.6283125018473, 67.23008278...'.
Server says 'ERROR: 49.6371639267187, 61.8893752757189, 47.1238898038469, 67.2300827868216)' to type dbl failed.
' [#conversion of string 'c(65.9734457253857, 65.9734457253857, 71.6283125018473, 67.2300827868216, 58.7477826221291, 56.8628270299753, 44.924774946334, 76.6548607475909, 71.6283125018473, 60.318578948924, 55.9203492338983, 51.5221195188726, 54.3495529071034, 47.7522083345649, 32.6725635973338, 32.6725635973338, 46.18141200777, 101.787601976309, 95.5044166691297, 106.499990956694, 67.5442420521806, 48.6946861306418, 47.7522083345649, 41.7831822927443, 60.318578948924, 85.7654794430014, 81.6814089933346, 95.5044166691297,
].
In addition: Warning message:
In if (is.na(value)) statement <- sub("?", "NULL", statement, fixed = TRUE) else if (valueClass %in% :
the condition has length > 1 and only the first element will be used
From this error I surmise that maybe bind.data can't be used with MonetDBLite?
Question:
How can I add a column(s) to a MonetDBLite table and populate it with data from my R session?

First of all, the INSERT command in your last statement is not what you want: INSERT adds new rows, whereas filling a column of existing rows requires an UPDATE statement.
That being said, I propose a solution where you can populate your MonetDBLite table directly from R:
library(MonetDBLite)
library(DBI)
data(mtcars)
db <- dbConnect(MonetDB(), embedded="./test.db")
# I added a rownbr to the dataset so it will be easier later
mtcars$rownbr <- 1:nrow(mtcars)
# load mtcars into the database
dbWriteTable(conn=db, value = mtcars, name = "mtcars", overwrite=TRUE)
## Add a new column
dbSendQuery(db, "ALTER TABLE mtcars ADD v1 DOUBLE;")
## insert data into the new column
v1 <- mtcars["mpg"] * pi
for (i in 1:nrow(mtcars)){
  myquery <- paste0("UPDATE mtcars SET v1 = ", v1$mpg[i], " WHERE rownbr = ", i, ";")
  dbSendQuery(db, myquery)
}
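To verify that the loop populated the column as intended, a quick check along these lines should work (a small sketch using the same session as above):
res <- dbGetQuery(db, "SELECT rownbr, v1 FROM mtcars ORDER BY rownbr;")
all.equal(res$v1, mtcars$mpg * pi)  # TRUE if every row was updated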

There seems to be an issue with parameterized SQL queries via dbBind with MonetDBLite (see https://github.com/hannesmuehleisen/MonetDBLite-R/issues/16). The code below works with SQLite:
library(RSQLite)
data(mtcars)
db <- dbConnect(SQLite(), ":memory:")
# load mtcars into the database
dbWriteTable(conn=db, value = mtcars, name = "mtcars", overwrite = TRUE, row.names=TRUE)
## Add a new column
dbSendQuery(db, "ALTER TABLE mtcars ADD v1 DOUBLE;")
## do computation with R
mtcars$v1 <- mtcars$mpg * pi
mtcars$row_names <- rownames(mtcars)
update_query <- dbSendQuery(db, 'update mtcars set "v1"=$v1 WHERE row_names=$row_names')
dbBind(update_query, mtcars[, c("v1", "row_names")]) # send the updated data
dbClearResult(update_query) # release the prepared statement
dbReadTable(db, "mtcars")
dbDisconnect(db)
However, with MonetDBLite, it is producing an error (and a warning) for the dbBind step:
> dbBind(update_query, mtcars[, c("v1", "row_names")]) # send the updated data
Error in vapply(params, function(x) { : values must be length 1,
but FUN(X[[1]]) result is length 32
In addition: Warning message:
In if (is.na(x)) "NULL" else if (is.numeric(x) || is.logical(x)) { :
the condition has length > 1 and only the first element will be used
A workaround I came up with is to use glue_data_sql() from the glue package to compose the queries "manually" (without needing to loop through rows):
library(MonetDBLite)
library(DBI)
data(mtcars)
db <- dbConnect(MonetDB(), embedded="./test.db")
dbWriteTable(conn=db, value = mtcars, name = "mtcars", overwrite = TRUE, row.names=TRUE)
dbSendQuery(db, "ALTER TABLE mtcars ADD v1 DOUBLE;")
library(glue)
mtcars$row_names <- rownames(mtcars)
mtcars$v1 <- mtcars$mpg * pi
update_query <- glue_data_sql(mtcars, "update mtcars set v1 = {v1} where row_names = {row_names};", .con=db)
lapply(update_query, dbSendQuery, conn=db)
# verify
dbReadTable(db, "mtcars")
dbDisconnect(db)
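As a side note (my addition, not tested against MonetDBLite): for statements that return no rows, DBI::dbExecute() can be used in place of dbSendQuery(), since it runs the statement and releases its result in one step:
invisible(lapply(update_query, function(q) dbExecute(db, q)))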
Yet another workaround that does the insert in one query (closer to @Zelazny7's original attempt):
library(MonetDBLite)
library(DBI)
data(mtcars)
db <- dbConnect(MonetDB(), embedded="./test.db")
dbSendQuery(db, "CREATE TABLE mtcars (
row_names VARCHAR(32),
v1 DOUBLE);")
library(glue)
mtcars$row_names <- rownames(mtcars)
mtcars$v1 <- mtcars$mpg * pi
insert_values <- glue_data(mtcars, "('{row_names}', {v1})")
insert_values <- glue_collapse(insert_values, sep=", ", last="")
insert_query <- glue("INSERT INTO mtcars (row_names, v1) VALUES {insert_values}")
dbSendQuery(db, insert_query)
dbReadTable(db, "mtcars")
dbDisconnect(db)
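If rebuilding the table is acceptable, the simplest route of all is to skip ALTER TABLE entirely: do the computation in R and let dbWriteTable() replace the table wholesale (a sketch, assuming the data are small enough to round-trip through R):
library(MonetDBLite)
library(DBI)
data(mtcars)
db <- dbConnect(MonetDB(), embedded="./test.db")
mtcars$v1 <- mtcars$mpg * pi   # compute the new column in R
dbWriteTable(conn = db, value = mtcars, name = "mtcars", overwrite = TRUE, row.names = TRUE)
dbReadTable(db, "mtcars")
dbDisconnect(db)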

Related

Optimization: splitting a column into a thousand columns in R or SQLite

I need to analyse data from a very large dataset. For that, I need to separate a character variable into more than a thousand columns.
The structure of this variable is :
number$number$number$ and so on for a thousand numbers
My data is stored in a .db file from SQLite. I then imported it into R using the package "RSQLite".
I tried splitting this column into multiple columns using dplyr :
#d is a data.table with my data
d2=d %>% separate(column_to_separate, paste0("S",c(1:number_of_final_columns)))
It works, but it is also taking forever. Does someone have a solution to split this column faster (either in R or using SQLite)?
Thanks.
You may use the tidyfast package (see here), which builds on data.table. In this test, it is approximately three times faster:
test <- data.frame(
  long.var = rep(paste0("V", 1:1000, "$", collapse = ""), 1000)
)
system.time({
  test |>
    tidyr::separate(long.var, into = paste0("N", 1:1001), sep = "\\$")
})
#>    user  system elapsed
#>   0.352   0.012   0.365
system.time({
  test |>
    tidyfast::dt_separate(long.var, into = paste0("N", 1:1001), sep = "\\$")
})
#>    user  system elapsed
#>   0.117   0.000   0.118
Created on 2023-02-03 with reprex v2.0.2
You can also try writing the column to a file as-is and then loading it with fread, which is generally rather fast.
library(data.table)
library(dplyr)
library(tidyr)
# Prepare example
x <- matrix(rnorm(1000*10000), ncol = 1000)
dta <- data.frame(value = apply(x, 1, function(x) paste0(x, collapse = "$")))
# Run benchmark
microbenchmark::microbenchmark({
  dta_2 <- dta %>%
    separate(col = value, sep = "\\$", into = paste0("col_", 1:1000))
},
{
  tmp_file <- tempfile()
  fwrite(dta, tmp_file)
  dta_3 <- fread(tmp_file, sep = "$", header = FALSE)
}, times = 3
)
Edit: I tested the speed and it seems faster than dt_separate from tidyfast, but it depends on the size of your dataset.
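Another option worth benchmarking (my addition, not from the answers above): data.table::tstrsplit() splits a character column into many columns by reference, avoiding both the tidyr overhead and the write/read round trip. A sketch using the names from the question:
library(data.table)
setDT(d)  # assuming d can be treated as a data.table, as in the question
d[, paste0("S", 1:number_of_final_columns) := tstrsplit(column_to_separate, "$", fixed = TRUE)]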

Create dynamic SQL query depending on user input in R Shiny App

I have a Shiny app where the user can filter a SQL database of movies. So far, you can only filter by country.
con <- dbConnect(RSQLite::SQLite(), 'Movies.db')
movies_data <- dbReadTable(con, 'Movies')
ui <- fluidPage(
  fluidRow(
    selectInput(
      inputId = "country",
      label = "Country:",
      choices = movies_data$journal,
      multi = T
    ),
    br(),
    fluidRow(width = "100%",
      dataTableOutput("table")
    )
  )
)
server <- function(input, output, session) {
  output$table <- renderDataTable({
    dbGetQuery(
      conn = con,
      statement = 'SELECT * FROM movies WHERE country IN ( ? )',
      params = list(input$country))
  })
}
shinyApp(ui = ui, server = server)
Now I want to give the user more filters, for example Actor or Genre. All filters are multi-select and optional. How can I build the statement dynamically? Would I use some switch statement for every possible combination (i.e. no filter on country but only action movies)? That seems a bit exhausting to me.
First off, you say the filter is optional but I see no way to disable it in your code. I'm assuming that deselecting all options is your way of disabling the filter, or at least that it's intended to work that way. If all options are selected for any filter, then the current approach should work fine, and will just show all films.
You can probably just construct the overall query piece by piece, and then paste it all together at the end.
Base query: 'SELECT * FROM movies'
Country filter: 'country IN (...)' built from input$country
Actor filter: 'actor IN (...)' built from input$actor
Genre filter: 'genre IN (...)' built from input$genre
Then you put it all together with paste.
To summarize: Base query. Then, if any of the filters are active, add a WHERE. Join all filters together, separating by AND. Pass the final query in as a direct string.
You can even put the filters into a list for easier parsing.
# Here, filterList is a list containing input$country, input$actor, input$genre
# and filterNames contains the corresponding names in the database
# e.g. filterList <- list("c1", list("a1", "a2"), "g1")
#      filterNames <- list("c", "a", "g")
baseQuery <- "SELECT * FROM movies"
# If any of the filters has a non-empty selection, this knows to apply the filters
filterCheck <- any(sapply(filterList, length) > 0)
# NOTE: If you have a different selection available for None
# just modify the sapply function accordingly
if (filterCheck)
{
  baseQuery <- paste(baseQuery, "WHERE")
  # This collapses multiselects for a filter into a single string with a comma separator
  filterList <- sapply(filterList, paste, collapse = ", ")
  # Now you construct the filters
  filterList <- sapply(1:length(filterList), function(x)
    paste0(filterNames[x], " IN (", filterList[x], ")"))
  # Paste the filters together
  filterList <- paste(filterList, collapse = " and ")
  baseQuery <- paste(baseQuery, filterList)
}
# Final output using the sample input above:
# "SELECT * FROM movies WHERE c IN (c1) and a IN (a1, a2) and g IN (g1)"
Now use baseQuery as the direct query statement
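One caveat (my addition, not part of the answer above): if the filtered columns are text, the interpolated values need quoting to be valid SQL. A sketch using DBI::dbQuoteString() just before the collapse step, with con being the connection from the question:
# quote each selected value through the connection before collapsing
filterList <- lapply(filterList, function(x) DBI::dbQuoteString(con, as.character(x)))
filterList <- sapply(filterList, paste, collapse = ", ")
The rest of the construction stays the same, and the finished baseQuery can then be passed to dbGetQuery(con, baseQuery) inside renderDataTable.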

Stored Procedure with R Script in SSMS 2016 Error

I have a stored procedure in SQL Server 2016, within which I'm executing an external R script. I'm having trouble figuring out why the procedure works for me, but whenever anyone else runs it, they get this error. After they have run it once, I can no longer run the procedure either, and this error is the default result:
Location: tmpilb.cpp:2532
Expression: fFalse
SPID: 76
Process ID: 2036
Description: Attempt to access expired blob handle (1)
Msg 21, Level 20, State 1, Line 0
Warning: Fatal error 3624 occurred at Jun 19 2017 3:07PM. Note the error
and time, and contact your system administrator.
Msg 596, Level 21, State 1, Line 0
Cannot continue the execution because the session is in the kill state.
Msg 0, Level 20, State 0, Line 0
A severe error occurred on the current command. The results, if any, should
be discarded.
The stored procedure executes R script that links to a separate database via RODBC, pulls data, performs manipulations, and sends the results back into SQL to populate a table.
EDIT: Including Stored Procedure code:
USE [Park_Analytics]
GO
/****** Object: StoredProcedure [dbo].[storedproc] Script Date: 6/19/2017 2:40:38 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
ALTER procedure [dbo].[storedproc]
@startdate date = NULL
AS
set @startdate = ISNULL(@startdate, DATEADD(wk, DATEDIFF(wk, 7, GETDATE()), -15))
declare @enddate datetime = dateadd(D, 1, @startdate)
create table #temp_results
(Park NVARCHAR(max) NOT NULL,
Turbine int NOT NULL,
avg_wtgp_kw float NULL,
avg_metwndspd_ms float NULL,
avg_metextmp_degc float NULL,
st_wtgp_kw float NULL,
st_metwndspd_ms float NULL,
st_metextmp_degc float NULL,
site_avg_wtgp_kw float NULL,
site_avg_metwndspd_ms float NULL,
site_avg_metextmp_degc float NULL,
site_st_wtgp_kw float NULL,
site_st_metwndspd_ms float NULL,
site_st_metextmp_degc float NULL,
Z_Score_avg_wtgp_kw float NULL,
Z_Score_avg_metwndspd_ms float NULL,
Z_Score_avg_metextmp_degc float NULL,
Alarm_Level_Z_Score_avg_wtgp_kw float NULL,
Alarm_Level_Z_Score_avg_metwndspd_ms float NULL,
Alarm_Level_Z_Score_avg_metextmp_degc float NULL)
insert into #temp_results
EXEC sp_execute_external_script
@language = N'R'
,@script = N'
library(RODBC)
# String to connect to global database
piglobal_str <- ''driver={SQL Server};server=Server.corp.org\\Servername;database=Assets;uid=generic_login;pwd=generic_pwd''
# Connect to the database
Assets <- odbcDriverConnect(piglobal_str)
# Write SQL Query to execute within R
QueryString <- ''SELECT
w.wpp_name as wpp,
pd.wt,
pd.datetime,
pd.datetime_local,
avg_wtgp_kw,
avg_metwndspd_ms,
avg_metextmp_degc
FROM [Assets].[dbo].[PI_Main10mindata] pd
left join [Assets].[dbo].PI_Main10mindata_temp pt on pt.wpp=pd.wpp and pt.datetime = pd.datetime and pt.wt=pd.wt
left join [Assets].[dbo].[pi_wpps] w on w.wpp_pi = pd.wpp
where pt.datetime between DATEADD(wk,DATEDIFF(wk,7,GETDATE()),-15)
and DATEADD(wk,DATEDIFF(wk,7,GETDATE()),-14)
order by pt.datetime''
# Set query results to a dataframe. as.is parameter is to keep all datatypes exactly the same as they are read.
pidata <- sqlQuery(Assets, QueryString,as.is=TRUE)
# Closing the RODBC Channel
odbcClose(Assets)
# Basic data processing. Setting the datatype to numeric.
pidata[,-which(names(pidata) %in% c("datetime","datetime_local","wpp"))] <-
sapply(pidata[,-which(names(pidata) %in% c("datetime","datetime_local","wpp"))], function(x) as.numeric(x))
# Filter on availability and on kW output
filtered <- pidata[pidata$avg_wtgst_int %in% c(100, 200) & pidata$avg_wtgp_kw > 500 & !(is.na(pidata$avg_wtgp_kw)),]
# Remove date columns and state int column. They wont be important from here on out until theyre put back into the SQL table.
df_turbinefilter <- filtered[,-which(names(filtered) %in% c("datetime","datetime_local","avg_wtgst_int"))]
# Taking out the "avg_" in front of all of the columns. Will make sense later when I rename columns after aggregations.
colnames(df_turbinefilter) <- gsub("avg_","",colnames(df_turbinefilter),perl = TRUE)
## PERFORM AGGREGATIONS ##
library(reshape2)
df_melt <- melt(df_turbinefilter, id= c("wpp","wt"))
agged <- dcast(df_melt, wpp + wt ~ variable, mean)
df_meltst <- melt(df_turbinefilter, id=c("wpp","wt"))
aggedst <- dcast(df_meltst, wpp + wt ~ variable, sd)
# Aggregated data joining
# list of data.frames
list_of_df <- list(agged, aggedst)
# names of data.frames
names(list_of_df) <- c("avg", "st")
# my sequence and names of data.frames in a list
my_seq <- seq_along(list_of_df)
my_list_names <- names(list_of_df)
# Renaming new columns. Including avg_ and st_ for average and standard deviation.
for (i in my_seq) {
names(list_of_df[[my_seq[i]]]) <-
paste(my_list_names[i], names(list_of_df[[my_seq[i]]]), sep = "_")
}
# Binding the columns from the agged table (to keep the nomenclature) with the other aggregate calcs
scored_data <- cbind(agged[,which(names(agged) %in% c("wpp","wt"))],
list_of_df$avg[,-which(names(list_of_df$avg) %in% c("avg_wpp","avg_wt"))],
list_of_df$st[,-which(names(list_of_df$st) %in% c("st_wpp","st_wt"))] )
## SITE AGGREGATIONS ##
df_sitefilter <- df_turbinefilter[,-which(names(df_turbinefilter) %in% c("wt"))]
df_meltsite <- melt(df_sitefilter, id="wpp")
aggedsite <- dcast(df_meltsite, wpp ~ variable, mean)
df_meltstsite <- melt(df_sitefilter, id="wpp")
aggedstsite <- dcast(df_meltstsite, wpp ~ variable, sd)
# Bug - have to create a separate vector
wpp <- aggedsite$wpp
# Aggegrated data joining
# list of data.frames
list_of_dfs <- list(aggedsite, aggedstsite)
# names of data.frames
names(list_of_dfs) <- c("site_avg", "site_st")
# my sequence and names of data.frames in a list
my_seqs <- seq_along(list_of_dfs)
my_list_namess <- names(list_of_dfs)
# Renaming new columns.
for (i in my_seqs) {
names(list_of_dfs[[my_seqs[i]]]) <-
paste(my_list_namess[i], names(list_of_dfs[[my_seqs[i]]]), sep = "_")
}
# Binding the columns from the aggedsite table (after setting wpp as an object) to the other aggregate tables
scored_data_site <- cbind(wpp,
list_of_dfs$site_avg[,-which(names(list_of_dfs$site_avg) %in% c("site_avg_wpp"))],
list_of_dfs$site_st[,-which(names(list_of_dfs$site_st) %in% c("site_st_wpp"))] )
## Main SIT Table: All turbine avgs, standard deviations, and site averages and standard deviations.
library(dplyr)
parent_df <- left_join(scored_data, scored_data_site, by = "wpp")
# Creating variables to keep the for-loop dynamic in the case that changes need to be made.
# Finding the number of columns associated with the tags. This will aid in calculating statistics.
tag_length <- length(df_turbinefilter[,-which(names(pidata) %in% c("wpp", "wt"))])
# Finding the first index column number where the condition is met. This is how the for loop will start.
tag_start <- which.max(!(names(parent_df) %in% c("wpp","wt")))
length_before_calcs <- length(parent_df)
## PERFORMING STATISTICAL CALCULATIONS AND ADDING NEW COLUMNS: WTG Avg-Site Avg/(Site St.Dev ^2/WTG St. Dev)
for (i in tag_start:tag_length) {
parent_df[,ncol(parent_df) + 1] <-
(parent_df[,i]-parent_df[,i+2*tag_length])/((parent_df[,i+3*(tag_length)])^2/parent_df[,i+(tag_length)])
names(parent_df)[ncol(parent_df)] <- paste0("Z_Score_",names(parent_df[i]))
}
## ADDING AN ALARM LEVEL COLUMN (0, 1, 2) DEPENDING ON THE CURRENT Z_SCORE
for (i in (length_before_calcs+1):ncol(parent_df)) {
parent_df[,ncol(parent_df) + 1] <-
ifelse(abs(parent_df[,i])<1, 0,ifelse(1<=abs(parent_df[,i]), 1, ifelse(abs(parent_df[,i]) >=2,2,0)))
names(parent_df)[ncol(parent_df)] <- paste0("Alarm_Level_", names(parent_df[i]))
}
OutputDataSet <- parent_df'
,@input_data_1 = N''
insert into SIT_WTG(
Date,
Park,
Turbine,
avg_wtgp_kw,
avg_metwndspd_ms,
avg_metextmp_degc,
st_wtgp_kw,
st_metwndspd_ms,
st_metextmp_degc,
site_avg_wtgp_kw,
site_avg_metwndspd_ms,
site_avg_metextmp_degc,
site_st_wtgp_kw,
site_st_metwndspd_ms,
site_st_metextmp_degc,
Z_Score_avg_wtgp_kw,
Z_Score_avg_metwndspd_ms,
Z_Score_avg_metextmp_degc,
Alarm_Level_Z_Score_avg_wtgp_kw,
Alarm_Level_Z_Score_avg_metwndspd_ms,
Alarm_Level_Z_Score_avg_metextmp_degc)
select @startdate as Date, *
from #temp_results t
drop table #temp_results
After doing quite a bit of research, I realized that this stored procedure was not being automatically recompiled. I added
WITH RECOMPILE AS
(in place of the plain AS) at the beginning of the stored procedure definition, and this solved my problem.

R: RSQLite randomly corrupted databasefiles occur, do I need to close connection?

I load several database files (SQLite) and subject them to a simple query:
library("RSQLite")
drv <- dbDriver ("SQLite")
get.wa2 <- function(file){
  con <- dbConnect(drv, dbname = file)
  table <- dbGetQuery(con, "Select data3 from data where data2 like 'xxx' ")
  return(table)
}
database.files<- dir(database.path)
database.files <- database.files[grep(".db$",database.files, perl = T)] ### select only database files
count.wa <- sapply(database.files,get.wa2)
I run into problems: my files are randomly getting corrupted or wiped, showing up with a file size of 0 bytes.
Am I doing something wrong, and should I be closing connections after each query? What is best practice here?
Try an additional logical qualifier vector:
database.files<- dir(database.path)
database.files <- database.files[grepl(".db$", database.files, perl = TRUE) &
                                 file.info(database.files)[, "size"] > 0]
?file.info
If the error occurs as a result of processing, then you need to look at ?try and the various error-handling capabilities that R provides.
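On the original question about connections: yes, it is good practice to pair every dbConnect() with a dbDisconnect(), ideally via on.exit() so the connection is released even when the query fails. A sketch of the query function rewritten that way (untested):
get.wa2 <- function(file){
  con <- dbConnect(drv, dbname = file)
  on.exit(dbDisconnect(con))   # always release the connection, even on error
  dbGetQuery(con, "Select data3 from data where data2 like 'xxx' ")
}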

Providing lookup list from R vector as SQL table for RODBC lookup

I have a list of IDs in an R vector.
IDlist <- c(23, 232, 434, 35445)
I would like to write an RODBC sqlQuery with a clause stating something like
WHERE idname IN IDlist
Do I have to read the whole table and then merge it with the IDlist vector within R? Or how can I provide these values to the RODBC statement, so that I recover only the records I'm interested in?
Note: As the list is quite long, pasting individual values into the SQL statement, as in the answer below, won't do it.
You could always construct the statement using paste
IDlist <- c(23, 232, 434, 35445)
paste("WHERE idname IN (", paste(IDlist, collapse = ", "), ")")
#[1] "WHERE idname IN ( 23, 232, 434, 35445 )"
Clearly you would need to add more to this to construct your exact statement
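For completeness, the full call could then look like this (mytable is a placeholder name, and con is an existing RODBC connection):
IDlist <- c(23, 232, 434, 35445)
query <- paste0("SELECT * FROM mytable WHERE idname IN (", paste(IDlist, collapse = ", "), ")")
results <- sqlQuery(con, query)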
I put together a solution to a similar problem by combining the tips here and here and running in batches. Approximate code follows (retyped from an isolated machine):
#assuming you have a list of IDs you want to match in vIDs and an RODBC connection in mycon
#queries that don't change
q_create_tmp <- "create table #tmptbl (ID int)"
q_get_records <- "select * from mastertbl as X join #tmptbl as Y on (X.ID = Y.ID)"
q_del_tmp <- "drop table #tmptbl"
#initialize counters and storage
start_row <- 1
batch_size <- 1000
allresults <- data.frame()
while (start_row <= length(vIDs)) {
  end_row <- min(length(vIDs), start_row + batch_size - 1)
  q_fill_tmp <- sprintf("insert into #tmptbl (ID) values %s",
                        paste(sprintf("(%d)", vIDs[start_row:end_row]), collapse = ","))
  q_all <- list(q_create_tmp, q_fill_tmp, q_get_records, q_del_tmp)
  sqlOutput <- lapply(q_all, function(x) sqlQuery(mycon, x))
  allresults <- rbind(allresults, sqlOutput[[3]])
  start_row <- end_row + 1
}