Can't execute MySQL query via R

I am having difficulty getting the desired result from a MySQL query in R. Here is my attempt.
First, the libraries I load:
library(RMySQL)
library(data.table)
library(dplyr)
library(Rcpp)
library(zoo)
library(gsubfn)
library(proto)
library(RSQLite)
library(DBI)
library(RMariaDB)
library(dbx)
Then I define a helper that runs a query over the connection:
getDf <- function (connect, sql)
{
  str(paste("EXECUTE: ", sql))
  query <- dbSendQuery(connect, sql)
  df <- dbFetch(query, n = -1)
  dbClearResult(query)
  str(paste("ROW FETCHED:", nrow(df)))
  df
}
Then I set the connection parameters and connect:
db_user <- 'k'
db_password <- 'F'
db_name <- 'yyy'
db_table <- 'mytable'
db_host <- 'xxx' # for local access
db_port <- 3306
readDB <- dbConnect(RMariaDB::MariaDB(), user = db_user, pass = db_password, dbname = db_name, host = db_host, port = db_port)
And here I try to run the prepared MySQL statements:
df333 <- getDf(readDB, paste("UPDATE incoming_aggregation_google ia
JOIN data_aggregation_google_median dm ON
(ia.agency_id = dm.agency_id)
AND (ia.search_category_id = dm.search_category_id)
AND (ia.offer_category_id = dm.offer_category_id)
AND (ia.flight_codes = dm.flight_codes)
AND (ia.search_type_category_id = dm.search_type_category_id)
SET ia.prediction_diff_percent_base_price = dm.median_diff_percent_base_price + (0.4 * RAND() - 0.2);
UPDATE incoming_aggregation_google_general ia
JOIN data_aggregation_google_general_median dm ON
(ia.agency_id = dm.agency_id)
AND (ia.offer_category_id = dm.offer_category_id)
AND (ia.search_type_category_id = dm.search_type_category_id)
AND (ia. service_discount_category_id = dm. service_discount_category_id)
SET ia.prediction_diff_percent_base_price = dm.median_diff_percent_base_price + (0.4 * RAND() - 0.2);
UPDATE incoming_aggregation_google_general da
JOIN data_aggregation_google_general_median i ON
(
i.agency_id=da.agency_id
AND i.offer_category_id=da.offer_category_id
AND i.service_discount_category_id=da.service_discount_category_id
AND i.search_type_category_id=da.search_type_category_id
)
set da.prediction_diff_percent_base_price = i.median_diff_percent_base_price;", db_table))
but I get this error:
Error: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'UPDATE incoming_aggregation_google_general ia
JOIN data_aggregation_google_gener' at line 9 [1064]
Please help me understand what I did wrong and how to fix it. If what I want to do cannot be done this way, is there a way to translate the essence of this query into R logic?
Thanks for any help.
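For what it's worth, the error message points at the start of the second UPDATE: dbSendQuery() sends a single statement, and by default the MySQL/MariaDB client protocol rejects a string containing several semicolon-separated statements. Note also that paste(..., db_table) appends the table name after the final semicolon. A minimal sketch of one workaround, splitting the script and running each piece with dbExecute() (UPDATE returns no rows, so there is nothing to fetch; the naive split assumes no semicolons occur inside string literals):
library(DBI)
library(RMariaDB)

# Run a multi-statement script one statement at a time.
# dbExecute() is for statements that return no result set;
# it returns the number of affected rows.
runUpdates <- function(connect, sql_script) {
  statements <- trimws(strsplit(sql_script, ";", fixed = TRUE)[[1]])
  statements <- statements[nzchar(statements)]
  for (s in statements) {
    str(paste("EXECUTE:", s))
    rows <- dbExecute(connect, s)
    str(paste("ROWS AFFECTED:", rows))
  }
}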

Related

R: SQL query for mean extraction

I'd like to aggregate the CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2 using a SQL query with the glue package. But when I try:
library(dplyr)
library(lubridate) # for ymd(), used below
library(rgdal)
library(rgeos)
library(DBI)
library(glue)
# get AOI
zip_path <- tempfile(fileext = ".zip")
download.file(
  "https://github.com/Leprechault/trash/raw/main/stands_example.zip",
  zip_path
)
unzip(zip_path, exdir = tempdir())
# Open the files
setwd(tempdir())
stands_ds <- read.csv("pred_target_stands.csv", sep=";") # Data set
stands_ds <- stands_ds %>%
  mutate(DATA_S2 = ymd(DATA_S2))
stands_ds$CLASS<-c(rep("A",129),rep("B",130))
stands_ds$CD<-abs(rnorm(length(stands_ds[,1]),mean=50))
# Create an in-memory SQLite database as a stand-in for a SQL server
bq_conn<- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbWriteTable(bq_conn, "stands_ds", stands_ds)
# Aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2
sqlInput_pred_FARM <- glue::glue_sql("SELECT * FROM stands_ds AVG(CD) GROUP BY CD_TALHAO = {x} AND ID_UNIQUE = {y} AND DATA_S2 = {z}",
x = "001G", y = "CERROCOROADO_001G",
z = "2021-04-02",.con=bq_conn)
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, as.character(sqlInput_pred_FARM, stringsAsFactors = T))
I always get Error: near "(": syntax error as output. Any help with this?
Sorry @KU99, but your solution doesn't return the mean values I expected. I then tried something new, creating an object for my mean operation, and despite some ugly results from REPLICATE(DATE()) it now works. The solution is:
# Aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2
sqlInput_pred_FARM <- glue::glue_sql("SELECT REPLICATE(CD_TALHAO,1) AS TALHAO, REPLICATE(ID_UNIQUE,1) AS ID, REPLICATE(DATE(DATA_S2),1) AS DATE, AVG(CD) AS CD FROM stands_ds GROUP BY CD_TALHAO = {x},ID_UNIQUE = {y}, DATA_S2 = {z} ORDER BY CD_TALHAO = {x},ID_UNIQUE = {y}, DATA_S2 = {z}",
x = "001G", y = "CERROCOROADO_001G",
z = "2021-04-02",.con=bq_conn)
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, as.character(sqlInput_pred_FARM, stringsAsFactors = T))
pred_attack_BQ_FARM
# TALHAO ID DATE CD
#1 001C CERROCOROADO_001C -4661-02-24 49.93823
#2 001G CERROCOROADO_001G -4661-02-24 50.12102
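A possible explanation for those -4661 dates (an assumption, not something verified here): ymd() creates Date objects, RSQLite stores a Date as its numeric day count since 1970-01-01, and SQLite's DATE() then reads that number as a Julian day far in the past. Writing the dates as plain text sidesteps both the odd output and the failing comparison with '2021-04-02':
# Store dates as ISO text so comparisons like DATA_S2 = '2021-04-02' work
stands_ds$DATA_S2 <- as.character(stands_ds$DATA_S2)
DBI::dbWriteTable(bq_conn, "stands_ds", stands_ds, overwrite = TRUE)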
Try the following:
statement <- "SELECT * FROM stands_ds AVG(CD)
GROUP BY CD_TALHAO = ? AND ID_UNIQUE = ? AND DATA_S2 = ?"
pars <- list("001G", "CERROCOROADO_001G","2021-04-02")
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, statement, params = pars)

How to send a data frame from R to SQL?

#setwd('Desktop/IE332')
install.packages("wakefield")
install.packages("RMySQL")
install.packages("randomNames")
install.packages('password')
install.packages('OpenRepGrid')
library(RMySQL)
library(password)
library(wakefield)
library(randomNames)
industriesData <- read.csv('Industries.csv')
skills <- read.csv('Skills.csv')
sp500 <- read.csv("http://www.princeton.edu/~otorres/sandp500.csv")
companies <- sample(sp500$Name, 100)
locations <- c('Northwest', 'Midwest', 'Northeast', 'South', 'Southwest',
               'Southeast', 'International') # Locations
gpas <- c(4,3.5,3,2.5,2)
n <- 100
locPrefs <- numeric(n)
studentSkills <- matrix(nrow=100,ncol=10)
studentInd <- matrix(nrow=100,ncol=5)
jobSkills <- matrix(nrow=100,ncol=5)
for(j in 1:n){ # Samples random skills assigned to students
  studentSkills[j,] <- sample(skills[,1],10,replace=FALSE)
  studentInd[j,] <- sample(industriesData[,1],5,replace=FALSE)
  jobSkills[j,] <- sample(skills[,1],5,replace=FALSE)
}
studentData <- data.frame(
  'first names' = randomNames(n, which.names = 'first'),
  'last names'  = randomNames(n, which.names = 'last'),
  'username'    = seq(1, n),
  'password'    = password(8, numbers = TRUE),
  'gpa'         = gpa(n, mean = 85.356, sd = 3.2, name = "GPA"),
  'visa'        = sample(c("N", "Y"), size = n, replace = TRUE, prob = c(.78, .22)),
  'loc pref'    = sample(locations, n, replace = TRUE),
  'skill'       = studentSkills,
  'Industry'    = studentInd
) # Student data
employerData <- data.frame(
  'company names' = companies,
  'pref gpa'      = sample(gpas, n, replace = TRUE),
  'sponser?'      = sample(c('N', 'Y'), size = n, replace = TRUE, prob = c(.78, .22)),
  'job id'        = sample(seq(100, 999), n, replace = FALSE),
  'pref skill'    = jobSkills,
  'industry'      = sample(industriesData[,1], n, replace = TRUE),
  'location'      = sample(locations, n, replace = TRUE)
) # Employer data
I am trying to send certain columns of studentData and employerData to tables in SQL. How would I go about doing that? I have a table named students into which I would like to upload the first and last names from the studentData data frame.
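A minimal sketch of one way to do this with DBI's dbWriteTable(), assuming the connection details and the students table's column names (first_name, last_name here) are placeholders to adapt; note that data.frame() turns names like 'first names' into first.names:
library(DBI)
library(RMySQL)

# Hypothetical connection details -- replace with your own
con <- dbConnect(RMySQL::MySQL(), user = "user", password = "pwd",
                 dbname = "mydb", host = "localhost")

# Keep only the columns to upload, renamed to match the SQL table
students <- data.frame(first_name = studentData$first.names,
                       last_name  = studentData$last.names)
dbWriteTable(con, name = "students", value = students,
             append = TRUE, row.names = FALSE)
dbDisconnect(con)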

dateRangeInput in reactive SQL query - date output is in wrong format

Has anyone faced this problem with dateRangeInput? I want to use it to filter my data; however, the output of dateRangeInput is in the wrong format, 2016-02-21, when I need 21.02.2016. I thought format = 'dd.mm.yyyy' would solve it, but I do not see any change.
My code:
library(ROracle)
library(shiny)
library(DT)
server <- shinyServer(
  function(input, output, session) {
    con <- dbConnect(dbDriver("Oracle"), "xx/K", username = "user", password = "pwd")
    tableList <- dbListTables(con, schema = "K")
    updateSelectizeInput(session, "tabnames", server = TRUE, choices = tableList)
    output$date_ui = renderUI({
      dateRangeInput('date',
                     label = 'Datum: dd.mm.yyyy',
                     start = Sys.Date() - 1, end = Sys.Date() + 1,
                     separator = " bis ",
                     format = 'dd.mm.yyyy', language = "de")
    })
    sqlOutput <- reactive({
      sqlInput <- paste("select rownum * from K.", input$tabnames, "where dati_create between to_date('", format(input$date[1]), "','dd.mm.yyyy') and to_date('", input$date[2], "','dd.mm.yyyy')")
      print(sqlInput) # I have printed it to see the format of the date
      dbGetQuery(con$cc, sqlInput, stringsAsFactors = T)
    })
    output$table <- DT::renderDataTable(sqlOutput(), server = TRUE, rownames = TRUE, filter = "top", options = list(pageLength = 10))
    session$onSessionEnded(function() { dbDisconnect(con) })
  }
)
ui_panel <-
  tabPanel("Test",
    sidebarLayout(
      sidebarPanel(),
      mainPanel(
        selectizeInput("tabnames", label = "server side", choices = NULL),
        uiOutput("date_ui"),
        tableOutput("out"),
        tableOutput("table")
      )
    )
  )
ui <- shinyUI(navbarPage("Test",ui_panel))
runApp(list(ui=ui,server=server))
After printing sqlInput:
[1] "select rownum * from K.xy where dati_create between to_date(' 2016-02-21 ','dd.mm.yyyy') and to_date(' 2016-02-23 ','dd.mm.yyyy')"
Error in .oci.GetQuery(conn, statement, data = data, prefetch = prefetch, :
ORA-01861: literal does not match format string
Does anyone have an idea how I can change the format of the date in dateRangeInput?
Thanks for any help!
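For what it's worth, the format argument of dateRangeInput() only controls how the dates are displayed in the widget; input$date always holds Date objects, which render as yyyy-mm-dd when pasted into a string. A sketch of the reactive with the dates formatted explicitly (paste0 also removes the stray spaces visible in the printed SQL; con$cc and rownum * in the original look like typos, so the sketch uses con and selects *):
sqlOutput <- reactive({
  req(input$tabnames, input$date)
  # format() turns the Date values into the dd.mm.yyyy strings to_date() expects
  from <- format(input$date[1], "%d.%m.%Y")
  to   <- format(input$date[2], "%d.%m.%Y")
  sqlInput <- paste0("select * from K.", input$tabnames,
                     " where dati_create between to_date('", from,
                     "', 'dd.mm.yyyy') and to_date('", to, "', 'dd.mm.yyyy')")
  print(sqlInput)
  dbGetQuery(con, sqlInput)
})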

python - how to join two separate queries based on common value

I have two queries which connect to different databases:
query = "select name ,ctry from xxxx where xxxx"
cursor.execute(query)
results1 = list(cursor.fetchall())
for row in results1:
query1 = "SELECT sessionname, country FROM xxx where and sessions.sessionname = '"+row[0] +"'"
cur.execute(query1)
results2.append(cur.fetchall())
How do I connect them if they have a common value (sessionname and name), and save the output to a file? The queries run against different databases (Oracle and PostgreSQL).
My code is here:
try:
    query = """select smat.s_name "SQLITE name" ,smed.m_ctry as "Country", smed.m_name "HDD Label" from smart.smed2smat ss, smart.smed smed, smart.smat smat where ss.M2S_SMAT=smat.s_id and ss.m2s_smed=smed.m_id and smed.m_name like '{0}%' order by smat.s_name""".format(line_name)
    cursor.execute(query)
    columns = [i[0] for i in cursor.description]
    results1 = cursor.fetchall()
    for row in results1:
        query1 = "SELECT sessions.sessionname, projects.country , projects.projectname FROM momatracks.sessions, momatracks.projects, momatracks.sessionsgeo where sessions.projectid = projects.id and sessionsgeo.sessionname = sessions.sessionname and sessions.sessionname = '" + row[0] + "' order by sessions.sessionname"
        cur.execute(query1)
        results2 = cur.fetchall()
        print "results1 -----> \n:", row
        tmp = []
        output_items = []
        for tmp in results2:
            print "---> \n", tmp
            try:
                stations_dict = dict([(item[0], item[1:]) for item in tmp])
                for item in row:
                    output_item = list(item) + stations_dict.get(item[0], [])
                    output_items.append(output_item)
            except Exception, f:
                print str(f)
    cursor.close()
    cur.close()
except Exception, g:
    print str(g)
except Exception, e:
    print str(e)
My results from row and tmp are:
row - WE246JP_2015_10_11__14_53_33', 'NLD', '031_025_SQLITE_NLD1510_03INDIA
and
tmp - WE246JP_2015_10_11__14_53_33', 'NLD', 'NLD15_N2C1-4_NL'
How do I properly connect them? I want the output to look like this:
output_items - WE246JP_2015_10_11__14_53_33', 'NLD', '031_025_SQLITE_NLD1510_03INDIA', 'NLD15_N2C1-4_NL'
At the moment I get this error:
can only concatenate list (not "str") to list
Also, the value of stations_dict looks like this, which is not what I intended:
'W': 'E246JP_2015_10_11__15_23_33', 'N': 'LD15_N2C1-4_NL3'
I know there is something wrong with my code, which is similar to a join. Can anyone explain this to me? I used the method below:
http://forums.devshed.com/python-programming-11/join-arrays-based-common-value-sql-left-join-943177.html
If the sessions are exactly the same in both databases then just zip the results:
query = """
select
smat.s_name "SQLITE name",
smed.m_ctry as "Country",
smed.m_name "HDD Label"
from
smart.smed2smat ss
inner join
smart.smed smed on ss.M2S_SMAT = smat.s_id
inner join
smart.smat smat on ss.m2s_smed = smed.m_id
where smed.m_name like '{0}%'
order by smat.s_name
""".format(line_name)
cursor.execute(query)
results1 = cursor.fetchall()
query1 = """
select
sessions.sessionname,
projects.country,
projects.projectname
from
momatracks.sessions,
inner join
momatracks.projects on sessions.projectid = projects.id
inner join
momatracks.sessionsgeo on sessionsgeo.sessionname = sessions.sessionname
where sessions.sessionname in {}
order by sessions.sessionname
""".format(tuple([row[0] for row in results1]))
cur.execute(query1)
results2 = cur.fetchall()
zipped = zip(results1, results2)
output_list = [(m[0][0], m[0][1], m[0][2], m[1][2]) for m in zipped]
If the sessions are different then make each result a dictionary to join.
I think you can use a subquery here. There's no way for me to test it, but I think it should look like this:
SELECT *
FROM (SELECT smat.s_name "SQLITE name" ,
smed.m_ctry as "Country",
smed.m_name "HDD Label"
FROM smart.smed2smat ss,
smart.smed smed,
smart.smat smat
WHERE ss.M2S_SMAT=smat.s_id
AND ss.m2s_smed=smed.m_id
AND smed.m_name like '{0}%'
ORDER BY smat.s_name) t1,
(SELECT sessions.sessionname,
projects.country ,
projects.projectname
FROM momatracks.sessions,
momatracks.projects,
momatracks.sessionsgeo
WHERE sessions.projectid = projects.id
AND sessionsgeo.sessionname = sessions.sessionname
AND sessions.sessionname = '"+row[0] +"'
ORDER BY sessions.sessionname) t2
WHERE t1."SQLITE name" = t2.sessionname ;

Appending to existing SQLite table when addition has fewer columns, without reading database into R

Is there some simple way, either on the SQL side or on the R side, to append a data.frame to an existing table that has more columns? The missing columns should just be filled in with NAs. Bonus points if it gracefully handles table 2 having more columns than table 1?
library(RSQLite)
# Create
db <- dbConnect( SQLite(), dbname="~/temp/test.sqlite" )
# Write test
set.seed(1)
n <- 1000
testDat <- data.frame(
  key = seq(n),
  x = runif(n),
  y = runif(n),
  g1 = sample(letters[1:10], n, replace = TRUE),
  g2 = rep(letters[1:10], each = n/10),
  g3 = factor(sample(letters[1:10], n, replace = TRUE))
)
if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
dbWriteTable( conn = db, name = "test", value = testDat, row.names=FALSE )
testDat2 <- data.frame( key=seq(n+1,n+100), x=runif(100) )
> dbWriteTable( conn = db, name="test", value = testDat2, row.names=FALSE, append=TRUE )
[1] FALSE
Warning message:
In value[[3L]](cond) :
RS-DBI driver: (error in statement: table test has 6 columns but 2 values were supplied)
I could envision a wrapper for this as well. The algorithm would look something like this (sketched in code below):
1. Read 1 row from the existing SQL table.
2. Get the column names from that read.
3. Add the non-included column names to the data.frame; fill them with missings.
4. dbWriteTable now that the data.frame has the same columns as the SQL table.
That's simplified by the fact that SQLite has flexible typing. But I'd rather not reinvent the wheel if it already exists.
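A minimal sketch of that wrapper, assuming DBI/RSQLite; appendPadded is a hypothetical name:
# Pad `value` with NA columns until it matches the target table, then append
appendPadded <- function(con, name, value) {
  template <- dbGetQuery(con, paste("SELECT * FROM", name, "LIMIT 1"))
  missing  <- setdiff(names(template), names(value))
  value[missing] <- NA
  dbWriteTable(con, name, value[names(template)],
               append = TRUE, row.names = FALSE)
}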
EDIT
Just a note to clarify: these datasets are large. The SQL database will be about 30GB, and the data.frame (actually a data.table for obvious reasons) is about 4GB. So solutions that require reading the SQL table into R are non-starters.
An alternative algorithm would be to do it in SQL:
1. Write the R data.frame to a temporary SQL table.
2. SQL magic to append that table onto the main SQL table.
3. Delete the temporary SQL table.
4. Profit.
One solution would be to read one table in pieces of, say, 1000 rows using dbSendQuery and add them to the other table (adding the required columns), as sketched after the loop below.
res <- dbSendQuery(con, "SELECT * from tests")
while(!dbHasCompleted(res)) {
  data <- fetch(res, n = 1000)
  ### Put the data in the other table
}
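One hedged way to fill in that placeholder, padding each chunk with NA columns before appending (target_table is a placeholder name):
# Inside the loop: add the target table's missing columns as NA,
# then append the chunk in the target's column order
target_cols <- dbListFields(con, "target_table")
data[setdiff(target_cols, names(data))] <- NA
dbWriteTable(con, "target_table", data[target_cols],
             append = TRUE, row.names = FALSE)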
There is also a way to do it in a single SQLite query. If you know which columns to add (fill with NULL) the SQL query would look like this:
INSERT INTO target_table SELECT col1,NULL,col2,col3,NULL,NULL,col4 FROM source_table
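Combining that with the temporary-table algorithm from the question's edit, a sketch that builds the NULL-filled SELECT list automatically (assumes the test table and db connection from the question; staging is a placeholder name):
# Stage the new rows, then let SQLite match up the columns
dbWriteTable(db, "staging", testDat2, row.names = FALSE, overwrite = TRUE)
target_cols <- dbListFields(db, "test")
select_list <- ifelse(target_cols %in% names(testDat2), target_cols, "NULL")
dbSendQuery(db, sprintf("INSERT INTO test (%s) SELECT %s FROM staging",
                        paste(target_cols, collapse = ", "),
                        paste(select_list, collapse = ", ")))
dbRemoveTable(db, "staging")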
rbind.fill from plyr offers a nice way to do this concatenation from within R:
require(plyr)
X <- rbind.fill(testDat, testDat2)
Not sure this quite answers your question though, since it looks like you want to do the append on the connection side.
Solution that also deals with factors:
#' Function to return column names from a SQLite database
#' @param conn An RSQLite connection to a database
#' @param name Character string giving the name of the table you want column names for
#' @export dbGetColnames
#' @return Character vector of column names
dbGetColnames <- function(conn, name) {
  # requires stringr (str_split) and a first() implementation such as dplyr::first
  x <- dbGetQuery( conn, paste0("SELECT sql FROM sqlite_master WHERE tbl_name = '",name,"' AND type = 'table'") )[1,1]
  x <- sub( "^.*\\((.+)\\).*$", "\\1", x )
  x <- str_split(x,",")[[1]]
  x <- gsub('[\t\n"]','', x)
  x <- gsub('^ *','', x)
  vapply( str_split( x ," " ), first, "" )
}
#' Write a table via RSQLite with factors stored in another table
#' Handles data.tables efficiently for large datasets
#' @param conn The connection object (created with e.g. dbConnect)
#' @param name The name of the table to write
#' @param value The data.frame to write to the database
#' @param factorName The base name of the tables to store the factor labels in in the SQLite database (e.g. if factorName is "_factor_" and the data.frame in value contains a factor column called "color" and the name is "mytable" then dbWriteFactorTable will create a table called mytable_factor_color which will store the levels information)
#' @param append a logical specifying whether to append to an existing table in the DBMS.
#' @param \dots Options to pass along to dbWriteTable (e.g. append=TRUE)
#' @return A boolean indicating whether the table write was successful
#' @export dbWriteFactorTable
#' @examples
#' library(RSQLite)
#' load_all( file.path(.db,"R-projects","taRifx") )
#' # Create
#' dbFilename <- tempfile()
#' db <- dbConnect( SQLite(), dbname=dbFilename )
#' # Write test
#' set.seed(1)
#' n <- 1000
#' testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' dbWriteTable( conn = db, name = "test", value = testDat, row.names=FALSE )
#' testDat2 <- data.frame( key=seq(n+1,n+100), x=runif(100) )
#' dbWriteTable( conn = db, name="test", value = testDat2, row.names=FALSE, append=TRUE )
#' # Read test
#' testRecovery <- dbGetQuery(db, "SELECT * FROM test")
#' testSelection <- dbGetQuery(db, "SELECT * FROM test WHERE g3=='h' OR g3=='e' ")
#' testSelection
#' # Test removing rows matching criteria
#' for(i in 1:10) dbWriteTable( conn = db, name = "test", value = testDat, row.names=FALSE, append=TRUE )
#' dbSendQuery( db, "DELETE FROM test WHERE g3=='a'" )
#' # Test factor conversion
#' testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' if(dbExistsTable(db,"test_factor_g3")) dbRemoveTable(db,"test_factor_g3")
#' dbWriteFactorTable( conn = db, name = "test", value = testDat, row.names=FALSE )
#' dbGetQuery(db, "SELECT * FROM test")
#' dbGetQuery(db, "SELECT * FROM test_factor_g3")
#' testDat$g3 <- factor( sample(letters[6:15],n,replace=TRUE) )
#' dbWriteFactorTable( conn = db, name = "test", value = testDat, row.names=FALSE, append=TRUE )
#' dbGetQuery(db, "SELECT * FROM test_factor_g3")
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' dbWriteFactorTable( conn = db, name = "test", value = as.data.table(testDat), row.names=FALSE )
#' dbReadFactorTable( conn = db, name = "test" )
#' dbReadFactorTable( conn = db, name = "test", query="WHERE g3=='a'" )
#' # -- Test merging of tables where the columns don't line up -- #
#' set.seed(1)
#' n <- 1000
#' testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' dbWriteFactorTable( conn = db, name = "test", value = testDat, row.names=FALSE )
#' dbGetQuery( db, "SELECT * FROM test" )
#' # Add a table with columns that are a subset of the SQL table
#' testDat2 <- data.frame( key=seq(n+1,n+100), y=runif(100) )
#' dbWriteFactorTable( conn = db, name="test", value = testDat2, row.names=FALSE, append=TRUE )
#' dbGetQuery( db, "SELECT * FROM test" )
#' # Add a table where the columns are a superset of the SQL table's
#' testDat3 <- data.frame( key=seq(n+101,n+200), x=runif(100), n=runif(100) )
#' dbWriteFactorTable( conn = db, name="test", value = testDat3, row.names=FALSE, append=TRUE )
#' dbGetQuery( db, "SELECT * FROM test" )
#' # Finish up
#' dbDisconnect(db) # close connection
#' unlink( dbFilename ) # delete tempfile
dbWriteFactorTable <- function( conn, name, value, factorName="_factor_", append=FALSE, ... ) {
  require(RSQLite)
  # Test inputs
  stopifnot(class(conn)=="SQLiteConnection")
  stopifnot(class(name)=="character")
  stopifnot("data.frame" %in% class(value))
  stopifnot(class(factorName)=="character")
  if( grepl("[.]",factorName) ) stop("factorName must use valid characters for SQLite")
  if( "data.table" %in% class(value) ) {
    dt <- TRUE # Is value a data.table, if so use more efficient methods
  } else {
    dt <- FALSE
  }
  # Convert factors to character
  factorCols <- names( Filter( function(x) x=="factor", vapply( value, class, "" ) ) )
  if( length(factorCols) > 0 ) {
    for( cl in which( colnames(value) %in% factorCols ) ) {
      cn <- colnames(value)[cl]
      factorTable <- data.frame( levels=levels(value[[ cn ]]) )
      factorTable$levelKey <- seq(nrow(factorTable))
      fctNm <- paste0(name,factorName,cn)
      fctTableExists <- dbExistsTable( conn = conn, name = fctNm )
      # Write out the factor table
      if( append & fctTableExists ) {
        oldFactorTable <- dbGetQuery( conn = conn, paste("SELECT levelKey, levels FROM",fctNm) )
        levelExists <- factorTable$levels %in% oldFactorTable$levels
        if(!all(levelExists)) {
          startLevelKey <- max( oldFactorTable$levelKey ) + 1
          addLevels <- factorTable$levels[!levelExists]
          newFactorTable <- data.frame(
            levels = addLevels,
            levelKey = seq( startLevelKey, startLevelKey + length(addLevels) - 1 )
          )
          dbWriteTable( conn = conn, name = fctNm, value = newFactorTable, row.names = FALSE, append = TRUE )
        } # If all levels exist, don't update the table -- go straight to converting the factor to character
      } else {
        if(fctTableExists) {
          warning(paste("Append set to FALSE but the factor table named",fctNm,"exists. Deleting."))
          dbRemoveTable( conn=conn, name = fctNm )
        }
        dbWriteTable( conn = conn, name = fctNm, value = factorTable, row.names = FALSE )
      }
      # Convert variable cl to character in the main data.frame (value) that we'll write to the main SQL table
      if( dt ) set( x=value, j=cl, value=as.character(value[[ cn ]]) )
    }
    if( !dt ) value <- japply( value, which( colnames(value) %in% factorCols ), as.character )
  } else {
    #warning("No factor columns detected.")
  }
  if( append ) {
    # If we're appending, check that the number of columns of the new table is equal to the number of columns of the old table
    # Only run this code if we're appending, because otherwise the table won't exist
    sqlColnames <- dbGetColnames( conn, name )
    colnamesSubset <- !all( sqlColnames %in% colnames(value) )
    colnamesSuperset <- !all( colnames(value) %in% sqlColnames )
    if( colnamesSuperset ) {
      addCols <- colnames(value)[ !colnames(value) %in% sqlColnames ]
      for( ac in addCols ) {
        warning(paste("Adding column",ac,"to SQL table"))
        dbSendQuery( conn,
          paste(
            "ALTER TABLE",
            name,
            "ADD COLUMN",
            ac,
            "DEFAULT NULL"
          )
        )
      }
    } # If it's a superset but not a subset, then we're done (allow it to return back to the second if where it just writes value directly)
    if( colnamesSubset ) {
      # Write our database to a temporary table
      tempTableName <- "temp_dbWriteFactorTable"
      if(dbExistsTable(conn,tempTableName)) dbRemoveTable(conn,tempTableName)
      dbWriteTable( conn = conn, name=tempTableName, value = value, row.names=FALSE, append=FALSE )
      # Add any columns to input data.frame that are in target table, then merge
      sqlColnames <- dbGetColnames( conn, name ) # Reset these now that we've possibly tinkered with them in the superset section
      dfColnames <- sqlColnames
      dfColnames[ !sqlColnames %in% colnames(value) ] <- "null"
      status <- dbSendQuery( conn,
        paste(
          "INSERT INTO", name,
          "(",paste(sqlColnames,collapse=","),")",
          "SELECT",
          paste( dfColnames, collapse="," ),
          "FROM",
          tempTableName
        )
      )
      # Remove temporary table
      dbRemoveTable(conn,tempTableName)
    }
  }
  if( !append || (append & !colnamesSubset) ) { # Either we're not appending, or the columns in the input and target tables exactly match (possibly after we added columns with the superset code)
    status <- dbWriteTable( conn = conn, name = name, value = value, append=append, ... )
  }
  return( status )
}
#' Read a table via RSQLite with factors stored in another table
#' @param conn The connection object (created with e.g. dbConnect)
#' @param name The name of the table to read
#' @param query A character string containing SQL statements to be appended onto the query (e.g. "WHERE x==3")
#' @param dt Whether to return a data.table vs. a plain-old data.frame
#' @param factorName The base name of the tables to store the factor labels in in the SQLite database (e.g. if factorName is "_factor_" and the data.frame in value contains a factor column called "color" and the name is "mytable" then dbWriteFactorTable will expect there to be a table called mytable_factor_color which holds the levels information)
#' @param \dots Options to pass along to dbGetQuery
#' @return A data.table or data.frame
#' @export dbReadFactorTable
dbReadFactorTable <- function( conn, name, query="", dt=TRUE, factorName="_factor_", ... ) {
  require(RSQLite)
  # Test inputs
  stopifnot(class(conn)=="SQLiteConnection")
  stopifnot(class(name)=="character")
  stopifnot(class(factorName)=="character")
  if( grepl("[.]",factorName) ) stop("factorName must use valid characters for SQLite")
  # Read main table
  if( dt ) {
    value <- as.data.table( dbGetQuery( conn, paste("SELECT * FROM",name,query), ... ) )
  } else {
    value <- dbGetQuery( conn, paste("SELECT * FROM",name,query), ... )
  }
  # Convert factors to character
  factorCols <- sub( paste0("^.*",name,factorName,"(.+)$"), "\\1",
    Filter( Negate(is.na),
      str_extract( dbListTables( conn ), paste0(".*",name,factorName,".*") )
    )
  )
  if( length(factorCols) > 0 ) {
    for( cn in factorCols ) {
      fctNm <- paste0(name,factorName,cn)
      factorTable <- dbGetQuery( conn, paste0("SELECT * FROM ",fctNm) )
      factorLevels <- factorTable$levels[ order( factorTable$levelKey ) ] # sort by levelKey so we maintain a consistent reference category (SQL databases don't guarantee the row order remains the same)
      if( dt ) {
        cl <- which( colnames(value) %in% cn )
        set( x=value, j=cl, value=factor( value[[ cn ]], levels=factorLevels ) )
      } else {
        value[[ cn ]] <- factor( value[[ cn ]], levels=factorLevels )
      }
    }
  } else {
    #warning("No factor columns detected.")
  }
  value
}
This will show up in taRifx at some point, I suspect. The part that I added to solve this question is the if(colnamesSubset) block.