Query if value in list using R and PostgreSQL

I have a dataframe like this
df1
ID value
1 c(YD11,DD22,EW23)
2 YD34
3 c(YD44,EW23)
4
And I want to query another database to find out how many of its rows contain these values. This will eventually be done in a loop over all rows of df1, but for now I just want to know how to do it for one row.
Let's say the database looks like this:
sql_database
value data
YD11 2222
WW20 4040
EW23 2114
YD44 3300
XH29 2040
So if I just looked at row 1, I would get:
dbGetQuery(con,
           sprintf("SELECT * FROM sql_database WHERE value IN %i",
                   df1$value[1])) %>%
  nrow()
OUTPUT:
2
And the other rows would be :
Row 2: 0
Row 3: 2
Row 4: 0
I don't need the loop written for me, but since my code doesn't work I would like to know how to query all rows of a table whose value appears in an R vector.

You do not need a for loop for this.
library(tidyverse)
library(DBI)
library(dbplyr)
df1 <- tibble(
  id = 1:4,
  value = list(c("YD11", "DD22", "EW23"), "YD34", c("YD44", "EW23"), NA)
)
# create an in-memory database table
df2 <- tibble(
  value = c("YD11", "WW20", "EW23", "YD44", "XH29"),
  data = c(2222, 4040, 2114, 3300, 2040)
)
con <- dbConnect(RSQLite::SQLite(), ":memory:")
# Add auxiliary schema
tmp <- tempfile()
DBI::dbExecute(con, paste0("ATTACH '", tmp, "' AS some_schema"))
copy_to(con, df2, in_schema("some_schema", "some_sql_table"), temporary = FALSE)
# counting rows
df1 %>%
  unnest(cols = c(value)) %>%
  left_join(
    tbl(con, dbplyr::in_schema("some_schema", "some_sql_table")) %>% collect(),
    by = "value"
  ) %>%
  mutate(data = if_else(is.na(data), 0, 1)) %>%
  group_by(id) %>%
  summarise(n = sum(data))
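If you do want to run the check one row at a time against the original sql_database table (as the sprintf() attempt intended), a parameterized IN list avoids building the query by string formatting. This is only a sketch, assuming con is the DBI connection from the question and that the glue package is available:
library(DBI)
library(glue)
# values from row 1 of df1
vals <- c("YD11", "DD22", "EW23")
# glue_sql() quotes each value and collapses the vector into an IN list
query <- glue_sql(
  "SELECT COUNT(*) AS n FROM sql_database WHERE value IN ({vals*})",
  .con = con
)
dbGetQuery(con, query)$n
# expected: 2 (YD11 and EW23 are in the table, DD22 is not)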

Related

How to add a vector to a table in backend using dbplyr (R)

I created a table from a data source using tbl(). I need to add a column containing 1:nrow() to my dataset; I tried different methods but didn't succeed. My code is below:
nrow_df1 <- df1 %>% summarise(n = n()) %>% pull(n)
df1 <- df1 %>% mutate(ID = 1:nrow_df1, step = 1)
It doesn't add the ID column to my dataset; it only adds the step column.
Converting with as.data.frame() works, but it is very slow.
Do you have any ideas? Thanks in advance.
For this case, you can use row_number().
library(dbplyr)
library(DBI)
# simulate a fake database
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "mtcars", mtcars)
# add the row number as an ID column
tbl(con, "mtcars") %>%
  mutate(ID = row_number())
dbDisconnect(con)
I found the answer. It is to use row_number(), but as.numeric() is also needed to convert the output from integer64 to numeric:
df1 <- df1 %>% mutate(ID = as.numeric(row_number(column_name)), step = 1)  # column_name: whichever column you order by
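For a self-contained illustration of that combination, here is a minimal sketch against the in-memory SQLite table from the answer above; mpg stands in for whichever column you order by:
library(dplyr)
library(dbplyr)
library(DBI)
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "mtcars", mtcars)
tbl(con, "mtcars") %>%
  mutate(ID = as.numeric(row_number(mpg)), step = 1) %>%  # ROW_NUMBER() OVER (ORDER BY mpg), cast to numeric
  select(ID, step, mpg) %>%
  collect()
dbDisconnect(con)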

Dropping containing NA rows with dbplyr

Here is how I ran some SQL queries with dbplyr:
library(tidyverse)
library(dbplyr)
library(DBI)
library(RPostgres)
library(bit64)
library(tidyr)
drv <- dbDriver('Postgres')
con <- dbConnect(drv,dbname='mydb',port=5432,user='postgres')
table1 <- tbl(con,'table1')
table2 <- tbl(con,'table2')
table3 <- tbl(con,'table3')
table1 %>%
  mutate(year = as.integer64(year)) %>%
  left_join(table2, by = c('id' = 'id')) %>%
  left_join(table3, by = c('year' = 'year'))
I want to drop the rows that contain NA and then collect my final table, but I couldn't find anything helpful that works with dbplyr queries.
I tried piping drop_na() from tidyr and some base functions (complete.cases(), etc.). Can you suggest anything that would achieve this? Appending an SQL clause (like WHERE foo IS NOT NULL) to the dbplyr query is also welcome.
Thanks in advance.
Try using !is.na(col_name) as part of a filter:
library(dplyr)
library(dbplyr)
df = data.frame(my_num = c(1,2,3))
df = tbl_lazy(df, con = simulate_mssql())
output = df %>% filter(!is.na(my_num))
Calling show_query(output) to check the generated sql gives:
<SQL>
SELECT *
FROM `df`
WHERE (NOT(((`my_num`) IS NULL)))
The extra brackets are part of how dbplyr does its translation.
If you want to do this for multiple columns, try the following approach based on this answer:
library(rlang)
library(dplyr)
library(dbplyr)
df = data.frame(c1 = c(1,2,3), c2 = c(9,8,7))
df = tbl_lazy(df, con = simulate_mssql())
colnames = c("c1","c2")
conditions = paste0("!is.na(",colnames,")")
output = df %>%
  filter(!!!parse_exprs(conditions))
Calling show_query(output) shows both columns appear in the generated query:
<SQL>
SELECT *
FROM `df`
WHERE ((NOT(((`c1`) IS NULL))) AND (NOT(((`c2`) IS NULL))))
Well, actually I still haven't found a satisfying solution. What I wanted to do is drop the rows containing NA from within R, without writing an SQL query, and I don't think dbplyr supports this directly yet.
So I wrote a small and simple piece of code to do it:
main_query <- table1 %>%
  mutate(year = as.integer64(year)) %>%
  left_join(table2, by = c('id' = 'id')) %>%
  left_join(table3, by = c('year' = 'year'))
col_names <- main_query %>% colnames()
query1 <- main_query %>% sql_render() %>% paste('WHERE')
query2 <- ''
for (i in col_names) {
  if (i == tail(col_names, 1)) {
    query2 <- paste(query2, i, 'IS NOT NULL')
  } else {
    query2 <- paste(query2, i, 'IS NOT NULL AND')
  }
}
desiredTable <- dbGetQuery(con, paste(query1, query2))
Yeah, I know it doesn't seem magical but maybe someone can make use of it.
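The string building can also be avoided by reusing the parse_exprs() approach from the accepted answer directly on the joined query, so no raw SQL has to be assembled; a minimal sketch, assuming main_query is the lazy table built above:
library(rlang)
col_names <- colnames(main_query)
conditions <- paste0("!is.na(", col_names, ")")
desired_table <- main_query %>%
  filter(!!!parse_exprs(conditions)) %>%  # renders as ... WHERE col1 IS NOT NULL AND col2 IS NOT NULL ...
  collect()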

How to update a SQL table with an R dataframe?

This seems like it should be simple, but as a complete beginner to SQL I've spent a long time trying to figure it out without any luck.
Say I have a dataframe in R like this:
df1 <- data.frame(value = c(1, 2, 3, 4),
                  ID = c("foo", "bar", "baz", "waffle"))
# value ID
# 1 1 foo
# 2 2 bar
# 3 3 baz
# 4 4 waffle
I can load it into a SQLite database easily:
table_name <- "mytable"
library("RSQLite")
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, table_name, df1, row.names = FALSE)
dbListTables(con)
# [1] "mytable"
Now say I create another dataframe that has a mix of:
- entries which are exactly the same as in the first df
- entries which are present in the first df but have updated values
- entries which are not present in the first df
df2 <- data.frame(value = c(1, 3, 5, 6),
                  ID = c("foo", "bar", "abc", "zzz"))
# value ID
# 1 1 foo # present in df1
# 2 3 bar # updated from df1
# 3 5 abc # not in df1
# 4 6 zzz # not in df1
Now, I want to update my SQLite table with this new dataframe:
- currently existing entries which are exactly the same should be skipped
- currently existing entries which are different should be updated
- new entries should be appended
My best guess is that the code required would be structured like this:
if (!dbExistsTable(con, table_name)) {
  # write df2 to the table, if the table doesn't already exist
  dbWriteTable(con, table_name, df2, row.names = FALSE)
} else {
  # update the entries in the table with the entries in df2
  for (i in seq(nrow(df2))) {
    irow <- df2[i, ]
    # check if irow is already in the table
    # if it's already in the table, update the table's irow entry if it's different
    # otherwise append irow to the table
    # or break() if irow is already present and identical
  }
}
But what code would be used to actually check whether the row (irow) is already present in the SQLite table, and then update it? It seems like it might be some usage of dbBind(), but I haven't been able to find a working example of how to use it for this purpose, and the docs are not clear. Is this kind of method going to be efficient for millions of entries and an arbitrary number of columns?
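One way to get all three behaviours without checking rows one by one is SQLite's INSERT ... ON CONFLICT ... DO UPDATE (an "upsert", available since SQLite 3.24), bound to the whole dataframe through dbBind(). The following is only a sketch, under the assumption that ID can serve as the table's primary key; the table and column names follow the example above:
library(DBI)
library(RSQLite)
con <- dbConnect(RSQLite::SQLite(), ":memory:")
# create the table with ID as primary key so ON CONFLICT has a constraint to target
dbExecute(con, "CREATE TABLE mytable (value REAL, ID TEXT PRIMARY KEY)")
dbAppendTable(con, "mytable", df1)
# upsert df2: new IDs are inserted, existing IDs get their value overwritten
# (identical rows are rewritten with the same value, which amounts to skipping them)
stmt <- dbSendStatement(con,
  "INSERT INTO mytable (value, ID) VALUES (?, ?)
   ON CONFLICT (ID) DO UPDATE SET value = excluded.value")
dbBind(stmt, params = list(df2$value, as.character(df2$ID)))
dbClearResult(stmt)
dbGetQuery(con, "SELECT * FROM mytable")
# expected: foo/1, bar/3, baz/3, waffle/4, abc/5, zzz/6
Because the whole dataframe is bound in one statement there are no per-row round trips; for very large updates, wrapping the call in dbWithTransaction() should help further.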

Pass SQL functions in dplyr filter function on database

I'm using dplyr's automatic SQL backend to query a subtable from a database table, e.g.
my_tbl <- tbl(my_db, "my_table")
where my_table in the database looks like
batch_name value
batch_A_1 1
batch_A_2 2
batch_A_2 3
batch_B_1 8
batch_B_2 9
...
I just want the data from batch_A_#, regardless of the number.
If I were writing this in SQL, I could use
SELECT * FROM my_table WHERE batch_name LIKE 'batch_A_%'
If I were writing this in R, I could use a few ways to get this: grepl(), %in%, or str_detect()
# option 1
subtable <- my_tbl %>%
  select(batch_name, value) %>%
  filter(grepl('batch_A_', batch_name, fixed = TRUE))
# option 2
subtable <- my_tbl %>%
  select(batch_name, value) %>%
  filter(str_detect(batch_name, 'batch_A_'))
Both of these give the following Postgres error: HINT: No function matches the given name and argument types. You might need to add explicit type casts.
So, how do I pass in SQL string functions or matching functions to help make the generated dplyr SQL query able to use a more flexible range of functions in filter?
(FYI the %in% function does work, but requires listing out all possible values. This would be okay combined with paste to make a list, but does not work in a more general regex case)
A "dplyr-only" solution would be this
tbl(my_con, "my_table") %>%
filter(batch_name %like% "batch_A_%") %>%
collect()
Full reprex:
suppressPackageStartupMessages({
library(dplyr)
library(dbplyr)
library(RPostgreSQL)
})
my_con <-
dbConnect(
PostgreSQL(),
user = "my_user",
password = "my_password",
host = "my_host",
dbname = "my_db"
)
my_table <- tribble(
~batch_name, ~value,
"batch_A_1", 1,
"batch_A_2", 2,
"batch_A_2", 3,
"batch_B_1", 8,
"batch_B_2", 9
)
copy_to(my_con, my_table)
tbl(my_con, "my_table") %>%
filter(batch_name %like% "batch_A_%") %>%
collect()
#> # A tibble: 3 x 2
#> batch_name value
#> * <chr> <dbl>
#> 1 batch_A_1 1
#> 2 batch_A_2 2
#> 3 batch_A_2 3
dbDisconnect(my_con)
#> [1] TRUE
This works because any functions that dplyr doesn't know how to
translate will be passed along as is; see
?dbplyr::translate_sql.
Hat-tip to #PaulRougieux for his recent comment
here
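If you want to check the translation without a live connection, dbplyr's simulated backends (as used in the drop_na answer above) work here too; a small sketch, where the empty data frame only exists to carry the column names:
library(dplyr)
library(dbplyr)
lazy_tbl <- tbl_lazy(
  data.frame(batch_name = character(), value = numeric()),
  con = simulate_postgres()
)
# the rendered SQL should contain: WHERE batch_name LIKE 'batch_A_%'
lazy_tbl %>%
  filter(batch_name %like% "batch_A_%") %>%
  show_query()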
Using dplyr
Get the table from the database as a dataframe and use it for further data analysis.
library("dplyr")
my_db <- src_postgres(dbname = "database-name",
host = "localhost",
port = 5432,
user = "username",
password = "password")
df <- tbl(my_db, "my_table")
df %>% filter(batch_name == "batch_A_1")
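Note that filter(batch_name == "batch_A_1") only returns that exact batch. To get every batch_A_ batch with this approach, one option (a sketch, assuming the table is small enough to pull into memory) is to collect first and filter with grepl(), which then runs in R and needs no SQL translation:
df %>%
  collect() %>%  # pull the table into R
  filter(grepl("batch_A_", batch_name, fixed = TRUE))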
Using DBI and RPostgreSQL
Get the table by sending an SQL query:
library("DBI")
library("RPostgreSQL")
m <- dbDriver("PostgreSQL")
con <- dbConnect(drv = m,
dbname = "database-name",
host = "localhost",
port = 5432,
user = "username",
password = "password")
df <- dbGetQuery(con, "SELECT * FROM my_table WHERE batch_name LIKE 'batch_A_%'")
library("dplyr")
df %>% filter(batch_name == "batch_A_1")

Match fields within one data frame with column names in another data frame

I have two data frames. In the last column ("Bill") of the first data frame, I want to apply a function (fixed price + Quantity * price/qty). To apply the function, R should match the values in the first column of df1 to the column names of df2.
I have solved the problem by creating a function and several ifelse statements, but I would like a statement that automatically matches the values in df1 with the column names in df2. My data set contains more than 2 million rows, and I would need to apply the same logic to build other similar functions. It would be nice to use something that does not require a loop or take too long to process.
### Set up your data frames like so ###
Code <- c("a1", "a2", "c3", "a1")
Name <- c("Dan", "David", "Anna", "Lisa")
Quantity <- c(30, 12, 10, 10)
df1 <- as.data.frame(cbind("Code" = Code, "Name" = Name, "Quantity" = Quantity), stringsAsFactors = F)
df1$Quantity <- as.numeric(df1$Quantity)
fixed_price <- c(12, 5, 23)
price_per_qty <- c(1, 4, 7)
df2 <- as.data.frame(rbind("fixed_price" = fixed_price, "price_per_qty" = price_per_qty))
colnames(df2) <- c("a1", "a2", "c3")
### Combine dataframe 1 and 2 into a single dataframe ###
# Code below pulls individual columns from df2 based on the
# index provided by the "Code" column in df1, transposes them
# so they'll line up with df1, then column binds them to df1
df3 <- cbind(df1, t(df2[,df1$Code]))
# the bill is calculated simply enough
bill <- df3[4] + df3[3] * df3[5]
colnames(bill) <- "bill"
# Finally, output the results as you wanted
cbind(df3, bill)
My answer is fairly similar to graggsd's, but here is what worked for me. I merged the two data frames on the key column "Code" into combined_data, then used a function (which I think is what you defined above) and passed the respective columns through it.
df2 <- t(data.frame(c(12, 1), c(5, 4), c(23, 7)))
rownames(df2) <- c("a1", "a2", "c3")
test <- rownames(df2)
df2 <- cbind.data.frame(df2, test)
colnames(df2) <- c("fixed price", "price/qty", "Code")
df1 <- data.frame(c("a1", "a2", "c3", "a1"),
                  c("Dan", "David", "Anna", "Lisa"),
                  c(30, 12, 10, 10))
colnames(df1) <- c("Code", "Name", "Quantity")
combined_data <- dplyr::inner_join(df1, df2, by = "Code")
f1 <- function(x, y, z) {
  x + y * z
}
bill <- f1(combined_data[, 4], combined_data[, 3], combined_data[, 5])
finalDataSet <- cbind.data.frame(combined_data, bill)
The final data set:
Code Name Quantity fixed price price/qty bill
1 a1 Dan 30 12 1 42
2 a2 David 12 5 4 53
3 c3 Anna 10 23 7 93
4 a1 Lisa 10 12 1 22
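If the join overhead matters at 2 million rows, another loop-free option is direct name-based indexing of the price table; a minimal sketch using df1 and df2 as set up in the first answer (numeric Quantity, price rows named fixed_price and price_per_qty):
# convert the price table to a matrix so rows and columns can be indexed by name
price_mat <- as.matrix(df2)
# pick the matching price columns for every row's Code in one vectorized step
df1$Bill <- price_mat["fixed_price", df1$Code] +
  df1$Quantity * price_mat["price_per_qty", df1$Code]
df1
#   Code  Name Quantity Bill
# 1   a1   Dan       30   42
# 2   a2 David       12   53
# 3   c3  Anna       10   93
# 4   a1  Lisa       10   22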