How to send data frame from r to SQL? - sql

#setwd('Desktop/IE332')
install.packages("wakefield")
install.packages("RMySQL")
install.packages("randomNames")
install.packages('password')
install.packages('OpenRepGrid')
library(RMySQL)
library(password)
library(wakefield)
library(randomNames)
industriesData <- read.csv('Industries.csv')
skills <- read.csv('Skills.csv')
sp500 <- read.csv("http://www.princeton.edu/~otorres/sandp500.csv")
companies <- sample(sp500$Name, 100)
locations <- c('Northwest', 'Midwest', 'Northeast', 'South', 'Southwest', 'Southeast',
'International') # Locations
gpas <- c(4,3.5,3,2.5,2)
n <- 100
locPrefs <- numeric(n)
studentSkills <- matrix(nrow=100,ncol=10)
studentInd <- matrix(nrow=100,ncol=5)
jobSkills <- matrix(nrow=100,ncol=5)
for(j in 1:n){ # Samples random skills assigned to students
studentSkills[j,] <- sample(skills[,1],10,replace=FALSE)
studentInd[j,] <- sample(industriesData[,1],5,replace=FALSE)
jobSkills[j,] <- sample(skills[,1],5,replace=FALSE)
}
studentData <- data.frame('first names'=randomNames(n, which.names = 'first'),'last
names'=randomNames(n, which.names = 'last'),'username'=seq(1,
n),'password'=password(8,numbers=TRUE),'gpa'=gpa(n, mean = 85.356, sd = 3.2, name =
"GPA"),'visa'=sample(c("N","Y"), size = n, replace = TRUE, prob = c(.78, .22)), 'loc
pref'=sample(locations,n,replace = TRUE), 'skill'=studentSkills, 'Industry'=studentInd) # Student data
employerData <- data.frame('company names'=companies, 'pref
gpa'=sample(gpas,n,replace=TRUE), 'sponser?'=sample(c('N','Y'), size=n, replace = TRUE, prob
= c(.78, .22)), 'job id'=sample(seq(100,999),n,replace=FALSE),'pref skill'=jobSkills,
'industry'=sample(industriesData[,1],n,replace=TRUE),'location'=sample(locations,n,replace =
TRUE)) # Employer data
I am trying to send certain columns of the studentData and employerData to tables in SQL, how would i go about doing that? I have a table named students where I would like to upload the first and last names of the studentsData data frame into this SQL table.

Related

Portfolio Optimization Using Quadprog Gives the Same Result for Every time even after changing variables

I have a task to construct the efficient frontier using 25 portfolios (monthly data). I tired writing a quadprog code for calculating minimum variance portfolio weights for a given expected rate of return. However, regardless of the expected return, the solver values give me the same set weights and variance, which the global minimum variance portfolio. I found the answer using an analytical solution. Attached are the codes:
basedf <- read.csv("test.csv", header = TRUE, sep = ",")
data <- basedf[,2:26]
ret <- as.data.frame(colMeans(data))
variance <- diag(var(data))
covmat <-as.matrix(var(data))
###minimum variance portfolio calculation
Q <- 2*cov(data)
A <- rbind(rep(1,25))
a <- 1
result <- solve.QP(Dmat = Q,
dvec = rep(0,25),
Amat = t(A),
bvec = a,
meq = 1)
w <-result$solution
w
var <- result$value
var
sum(w)
this is another set of codes giving the me same value::
mvp <- function(e,ep){
Dmat <- 2*cov(e)
dvec <- rep(0, ncol(e))
Amat <- cbind(rep(1, ncol(e)), colMeans(e))
bvec <- c(1, ep)
result <- solve.QP(Dmat = Dmat, dvec = dvec, Amat = Amat, bvec = bvec, meq=1)
wp <- result$solution
varP <- result$value
ret_values <- list(wp, varP)
names(ret_values) <- c("wp", "VarP")
return(ret_values)
}
z <- mvp(data, -.005)
z$wp
sum(z$wp)
z$VarP
ef <- function(e, min_e, max_e){
list_e <- seq(min_e,max_e, length=50)
loop <- sapply(list_e, function(x) mvp(e, x)$VarP)
effF <- as.data.frame(cbind(list_e,loop))
minvar <- min(effF$loop)
L <- effF$loop==minvar
minret <- effF[L,]$list_e
minpoint <- as.data.frame(cbind(minret,minvar))
minvarwp <- mvp(e, min_e)$wp
rlist <- list(effF, minpoint, minvarwp)
names(rlist) <- c( "eFF", "minPoint", "wp")
return(rlist)
}
in the efficient frontier, all the 50 portfolios have same level of variance. can anyone tell me whats wrong with solver equation??? thanks.
I tried quadprog but couldnt solve it.

R: SQL query for mean extraction

I'd like to aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2 using a SQL query with glue package. But when I try:
library(dplyr)
library(rgdal)
library(rgeos)
library(DBI)
library(glue)
# get AOI
download.file(
"https://github.com/Leprechault/trash/raw/main/stands_example.zip",
zip_path <- tempfile(fileext = ".zip")
)
unzip(zip_path, exdir = tempdir())
# Open the files
setwd(tempdir())
stands_ds <- read.csv("pred_target_stands.csv", sep=";") # Data set
stands_ds <- stands_ds %>%
mutate(DATA_S2 = ymd(DATA_S2))
stands_ds$CLASS<-c(rep("A",129),rep("B",130))
stands_ds$CD<-abs(rnorm(length(stands_ds[,1]),mean=50))
# Crete like a SQL server condition
bq_conn<- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbWriteTable(bq_conn, "stands_ds", stands_ds)
# Aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2
sqlInput_pred_FARM <- glue::glue_sql("SELECT * FROM stands_ds AVG(CD) GROUP BY CD_TALHAO = {x} AND ID_UNIQUE = {y} AND DATA_S2 = {z}",
x = "001G", y = "CERROCOROADO_001G",
z = "2021-04-02",.con=bq_conn)
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, as.character(sqlInput_pred_FARM, stringsAsFactors = T))
I always have Error: near "(": syntax error as output. Please, any help with it?
Sorry #KU99 but your solution doesn't return the mean values as I expected. Now, I try something new like an object creation for my mean operation and despite some ugly results with REPLICATE(DATE()), now works. The solution is:
# Aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2
sqlInput_pred_FARM <- glue::glue_sql("SELECT REPLICATE(CD_TALHAO,1) AS TALHAO, REPLICATE(ID_UNIQUE,1) AS ID, REPLICATE(DATE(DATA_S2),1) AS DATE, AVG(CD) AS CD FROM stands_ds GROUP BY CD_TALHAO = {x},ID_UNIQUE = {y}, DATA_S2 = {z} ORDER BY CD_TALHAO = {x},ID_UNIQUE = {y}, DATA_S2 = {z}",
x = "001G", y = "CERROCOROADO_001G",
z = "2021-04-02",.con=bq_conn)
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, as.character(sqlInput_pred_FARM, stringsAsFactors = T))
pred_attack_BQ_FARM
# TALHAO ID DATE CD
#1 001C CERROCOROADO_001C -4661-02-24 49.93823
#2 001G CERROCOROADO_001G -4661-02-24 50.12102
Try the following:
statement <- "SELECT * FROM stands_ds AVG(CD)
GROUP BY CD_TALHAO = ? AND ID_UNIQUE = ? AND DATA_S2 = ?"
pars <- list("001G", "CERROCOROADO_001G","2021-04-02")
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, statement, params = pars)

Interactive Plots in R

Using the plotly library, I made the following plot in R:
library(dplyr)
library(ggplot2)
library(plotly)
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,5,5))
df <- df %>% mutate(var3 = ifelse(var1 <= 5 & var2 <= 5, "a", ifelse(var1 <= 10 & var2 <= 10, "b", "c")))
plot = df %>%
ggplot() + geom_point(aes(x=var1, y= var2, color= var3))
ggplotly(plot)
This is a simple scatter plot - two random variables are generated, and then the colors of the points are decided by some criteria (e.g. if var1 and var2 are between certain ranges).
From here, I could also summary statistics:
df$var3 = as.factor(df$var3)
summary = df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
# A tibble: 3 x 4
var3 Mean_var1 Mean_var2 count
* <fct> <dbl> <dbl> <int>
1 a -1.70 0.946 158
2 b 4.68 4.94 260
3 c 15.8 6.49 582
My question: is it possible to add some buttons to this plot which would allow the user to color the points based on custom choices? E.g. something like this :
Now, the user can type in any range they want - and the color of the points change, and the some summary statistics are generated.
Can someone please show me how to do this in R?
I had this idea - first I would create this massive table that would create all possible range combinations of "var1" and "var2":
vec1 <- c(-20:40,1)
vec2 <- c(-20:40,1)
a <- expand.grid(vec1, vec2)
for (i in seq_along(vec1)) {
for (j in seq_along(vec2)) {
df <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & j <= 10, "b", "c")))
}
}
Then, depending on which ranges the user wants - an SQL style statement isolate the rows from this massive table corresponding to those ranges :
custom_df = df[df$var1 > -20 & df$var1 <10 & df$var1 > -20 & df$var2 <10 , ]
Then, an individual grap would be made for "custom_df" and summary statistics would also be recorded for "custom_df":
summary = custom_df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
But I am not sure how to neatly and efficiently do this in R.
Can someone please show me how to do this?
Thanks
I have built a small shiny app to perform most of your requirements. Based on your pre-defined large dataframe df, user can define the following:
Choose the minimum and maximum value for variables var1 and var2.
Choose criteria to define the variable var3, which is used to display different colors of data points. This is a range now.
Save plot as a HTML file.
Summary stats displayed as a table.
You can define further options to provide the user the option to choose color and so on. For that perhaps you should google on how to use scale_color_manual().
Update: Added user option to choose red and green color based on var1 and var2 range values.
library(shiny)
library(plotly)
library(dplyr)
library(DT)
### define a large df
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,15,15))
ui <- fluidPage(
titlePanel(p("My First Test App", style = "color:red")),
sidebarLayout(
sidebarPanel(
p("Choose Variable limits"),
# Horizontal line ----
tags$hr(),
uiOutput("var1a"), uiOutput("var1b"),
uiOutput("var2a"), uiOutput("var2b"),
uiOutput("criteria")
),
mainPanel(
DTOutput("summary"), br(),
plotlyOutput("plot"),
br(), br(), br(),
uiOutput("saveplotbtn")
)
)
)
server <- function(input, output, session){
output$var1a <- renderUI({
tagList(
numericInput("var11", "Variable 1 min",
min = min(df$var1), max = max(df$var1), value = min(df$var1))
)
})
output$var1b <- renderUI({
if (is.null(input$var11)){
low1 <- min(df$var1)
}else low1 <- max(min(df$var1),input$var11) ## cannot be lower than var 1 minimum
tagList(
numericInput("var12", "Variable 1 max", min = low1, max = max(df$var1), value = max(df$var1))
)
})
output$var2a <- renderUI({
tagList(
numericInput("var21", "Variable 2 min",
min = min(df$var2), max = max(df$var2), value = min(df$var2))
)
})
output$var2b <- renderUI({
if (is.null(input$var21)){
low2 <- min(df$var2)
}else low2 <- max(min(df$var2),input$var21) ## cannot be lower than var 2 minimum
tagList(
numericInput("var22", "Variable 2 max", min = low2, max = max(df$var2), value = max(df$var2))
)
})
output$criteria <- renderUI({
req(input$var11,input$var12,input$var21,input$var22)
tagList(
sliderInput("crit11", "Variable 1 red color range:",
min = -10, max = 0, value = c(-10,0)),
sliderInput("crit12", "Variable 2 red color range:",
min = -25, max = 0, value = c(-25,0)),
sliderInput("crit21", "Variable 1 green color range:",
min = 0.1, max = 10, value = c(0.1,10)),
sliderInput("crit22", "Variable 2 green color range:",
min = 0.1, max = 20, value = c(0.1,20))
)
})
dat <- reactive({
req(input$crit11,input$crit12,input$crit21,input$crit22)
df <- df %>% filter(between(var1, input$var11, input$var12)) %>%
filter(between(var2, input$var21, input$var22))
# df1 <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & var2 <= j , "b", "c")))
df1 <- df %>% mutate(var3 = ifelse(between(var1, input$crit11[1], input$crit11[2]) & between(var2, input$crit12[1], input$crit12[2]), "a",
ifelse(between(var1, input$crit21[1], input$crit21[2]) & between(var2, input$crit22[1], input$crit22[2]), "b", "c")))
})
summari <- reactive({
req(dat())
df1 <- dat()
df1$var3 = as.factor(df1$var3)
summary = df1 %>%
group_by(var3) %>%
dplyr::summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
})
output$summary <- renderDT(summari())
rv <- reactiveValues()
observe({
req(dat())
p <- ggplot(data=dat()) + geom_point(aes(x=var1, y= var2, color= var3))
pp <- ggplotly(p)
rv$plot <- pp
})
output$plot <- renderPlotly({
rv$plot
})
output$saveplotbtn <- renderUI({
div(style="display: block; padding: 5px 350px 5px 50px;",
downloadBttn("saveHTML",
HTML("HTML"),
style = "fill",
color = "default",
size = "lg",
block = TRUE,
no_outline = TRUE
) )
})
output$saveHTML <- downloadHandler(
filename = function() {
paste("myplot", Sys.Date(), ".html", sep = "")
},
content = function(file) {
htmlwidgets::saveWidget(as_widget(rv$plot), file, selfcontained = TRUE) ## self-contained
}
)
}
shinyApp(ui, server)

Imputing binary missing data with hmi and mice -- Error: C stack usage 7969776 is too close to the limit

I'm running HMI on two level data (students in courses) with missing data at the student level. The code throws the following error (Error: C stack usage 7969776 is too close to the limit) when I include a binary value (gender) with missing data.
Below is a reproducible example.
library(MCMCglmm)
library(hmi)
library(mice)
df <- data.frame(post = rtnorm(100,60,20,0,100),
pre = rtnorm(100,40,20,0,100),
gender = rbinom(n=100, size=1, prob=0.20),
course = rep( c("A","B","C","D"), 100*c(0.1,0.2,0.65,0.05) ))
df$post[1:round((0.3)*length(df$post),digits = 0)] <- NA
df$gender[round((0.2)*length(df$post),digits = 0):round((0.5)*length(df$post),
digits = 0)] <- NA
hmi_test <- hmi(df, model_formula = post ~ 1 + pre + gender + (1|course ),
M = 2, maxit = 5, list_of_types = NULL, nitt = 3000, burnin = 1000)
list_of_types_maker(df)

Colors strips settings in faced-wrap ggplot

To a 3 year old post
ggplot2: facet_wrap strip color based on variable in data set
Baptiste has given the following solution:
d <- data.frame(fruit = rep(c("apple", "orange", "plum", "banana", "pear", "grape")),
farm = rep(c(0,1,3,6,9,12), each=6),
weight = rnorm(36, 10000, 2500),
size=rep(c("small", "large")))
p1 = ggplot(data = d, aes(x = farm, y = weight)) +
geom_jitter(position = position_jitter(width = 0.3),
aes(color = factor(farm)), size = 2.5, alpha = 1) +
facet_wrap(~fruit)
dummy <- ggplot(data = d, aes(x = farm, y = weight))+ facet_wrap(~fruit) +
geom_rect(aes(fill=size), xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) +
theme_minimal()
library(gtable)
g1 <- ggplotGrob(p1)
g2 <- ggplotGrob(dummy)
gtable_select <- function (x, ...)
{
matches <- c(...)
x$layout <- x$layout[matches, , drop = FALSE]
x$grobs <- x$grobs[matches]
x
}
panels <- grepl(pattern="panel", g2$layout$name)
strips <- grepl(pattern="strip-t", g2$layout$name)
g2$layout$t[panels] <- g2$layout$t[panels] - 1
g2$layout$b[panels] <- g2$layout$b[panels] - 1
new_strips <- gtable_select(g2, panels | strips)
library(grid)
grid.newpage()
grid.draw(new_strips)
gtable_stack <- function(g1, g2){
g1$grobs <- c(g1$grobs, g2$grobs)
g1$layout <- transform(g1$layout, z= z-max(z), name="g2")
g1$layout <- rbind(g1$layout, g2$layout)
g1
}
## ideally you'd remove the old strips, for now they're just covered
new_plot <- gtable_stack(g1, new_strips)
grid.newpage()
grid.draw(new_plot)
(I have just updated the "strip-t" pattern and opened the grid library as it was suggested in the old post)
I repost this because it's an old brillant stuff and I want to use it myself for a presentation.
I'm a beginner in ggplot and this could also help me for various scripts.
Here are my questions :
- How is it possible to choose the color and not to give the same blue and red please? In my script, I have 3 colors to set, and I hope it can be less agressive. Is it possible to do it ?
- Another question, is it possible to integrate this in the legend, i.e to know what are this colors refering ?
Many thanks
you can change the strip colours with the fill scale in the dummy plot. Combining the legends is a bit tricky, but here's a starting point.
library(ggplot2)
library(gtable)
library(gridExtra)
library(grid)
gtable_stack <- function(g1, g2){
g1$grobs <- c(g1$grobs, g2$grobs)
g1$layout <- transform(g1$layout, z= z-max(z), name="g2")
g1$layout <- rbind(g1$layout, g2$layout)
g1
}
gtable_select <- function (x, ...)
{
matches <- c(...)
x$layout <- x$layout[matches, , drop = FALSE]
x$grobs <- x$grobs[matches]
x
}
d <- data.frame(fruit = rep(c("apple", "orange", "plum", "banana", "pear", "grape")),
farm = rep(c(0,1,3,6,9,12), each=6),
weight = rnorm(36, 10000, 2500),
size=rep(c("small", "large")))
p1 = ggplot(data = d, aes(x = farm, y = weight)) +
geom_jitter(position = position_jitter(width = 0.3),
aes(color = factor(farm)), size = 2.5, alpha = 1) +
facet_wrap(~fruit)
dummy <- ggplot(data = d, aes(x = farm, y = weight))+ facet_wrap(~fruit) +
geom_rect(aes(fill=size), xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) +
theme_minimal() + scale_fill_brewer(palette = "Pastel2")
g1 <- ggplotGrob(p1)
g2 <- ggplotGrob(dummy)
# extract legends
leg <- g1$grobs[[grep("guide-box", g1$layout$name)]]
dummy_leg <- g2$grobs[[grep("guide-box", g2$layout$name)]]
combined_leg <- rbind.gtable(leg, dummy_leg)
g1$grobs[[grep("guide-box", g1$layout$name)]] <- combined_leg
# move dummy panels one cell up
panels <- grepl(pattern="panel", g2$layout$name)
strips <- grepl(pattern="strip-t", g2$layout$name)
g2$layout$t[panels] <- g2$layout$t[panels] - 1
g2$layout$b[panels] <- g2$layout$b[panels] - 1
new_strips <- gtable_select(g2, panels | strips)
# stack new strips on top of gtable
# ideally you'd remove the old strips, for now they're just covered
new_plot <- gtable_stack(g1, new_strips)
grid.newpage()
grid.draw(new_plot)