#setwd('Desktop/IE332')
install.packages("wakefield")
install.packages("RMySQL")
install.packages("randomNames")
install.packages('password')
install.packages('OpenRepGrid')
library(RMySQL)
library(password)
library(wakefield)
library(randomNames)
industriesData <- read.csv('Industries.csv')
skills <- read.csv('Skills.csv')
sp500 <- read.csv("http://www.princeton.edu/~otorres/sandp500.csv")
companies <- sample(sp500$Name, 100)
locations <- c('Northwest', 'Midwest', 'Northeast', 'South', 'Southwest', 'Southeast',
'International') # Locations
gpas <- c(4,3.5,3,2.5,2)
n <- 100
locPrefs <- numeric(n)
studentSkills <- matrix(nrow=100,ncol=10)
studentInd <- matrix(nrow=100,ncol=5)
jobSkills <- matrix(nrow=100,ncol=5)
for(j in 1:n){ # Samples random skills assigned to students
studentSkills[j,] <- sample(skills[,1],10,replace=FALSE)
studentInd[j,] <- sample(industriesData[,1],5,replace=FALSE)
jobSkills[j,] <- sample(skills[,1],5,replace=FALSE)
}
studentData <- data.frame('first names'=randomNames(n, which.names = 'first'),'last
names'=randomNames(n, which.names = 'last'),'username'=seq(1,
n),'password'=password(8,numbers=TRUE),'gpa'=gpa(n, mean = 85.356, sd = 3.2, name =
"GPA"),'visa'=sample(c("N","Y"), size = n, replace = TRUE, prob = c(.78, .22)), 'loc
pref'=sample(locations,n,replace = TRUE), 'skill'=studentSkills, 'Industry'=studentInd) # Student data
employerData <- data.frame('company names'=companies, 'pref
gpa'=sample(gpas,n,replace=TRUE), 'sponser?'=sample(c('N','Y'), size=n, replace = TRUE, prob
= c(.78, .22)), 'job id'=sample(seq(100,999),n,replace=FALSE),'pref skill'=jobSkills,
'industry'=sample(industriesData[,1],n,replace=TRUE),'location'=sample(locations,n,replace =
TRUE)) # Employer data
I am trying to send certain columns of the studentData and employerData to tables in SQL, how would i go about doing that? I have a table named students where I would like to upload the first and last names of the studentsData data frame into this SQL table.
Related
I have a task to construct the efficient frontier using 25 portfolios (monthly data). I tired writing a quadprog code for calculating minimum variance portfolio weights for a given expected rate of return. However, regardless of the expected return, the solver values give me the same set weights and variance, which the global minimum variance portfolio. I found the answer using an analytical solution. Attached are the codes:
basedf <- read.csv("test.csv", header = TRUE, sep = ",")
data <- basedf[,2:26]
ret <- as.data.frame(colMeans(data))
variance <- diag(var(data))
covmat <-as.matrix(var(data))
###minimum variance portfolio calculation
Q <- 2*cov(data)
A <- rbind(rep(1,25))
a <- 1
result <- solve.QP(Dmat = Q,
dvec = rep(0,25),
Amat = t(A),
bvec = a,
meq = 1)
w <-result$solution
w
var <- result$value
var
sum(w)
this is another set of codes giving the me same value::
mvp <- function(e,ep){
Dmat <- 2*cov(e)
dvec <- rep(0, ncol(e))
Amat <- cbind(rep(1, ncol(e)), colMeans(e))
bvec <- c(1, ep)
result <- solve.QP(Dmat = Dmat, dvec = dvec, Amat = Amat, bvec = bvec, meq=1)
wp <- result$solution
varP <- result$value
ret_values <- list(wp, varP)
names(ret_values) <- c("wp", "VarP")
return(ret_values)
}
z <- mvp(data, -.005)
z$wp
sum(z$wp)
z$VarP
ef <- function(e, min_e, max_e){
list_e <- seq(min_e,max_e, length=50)
loop <- sapply(list_e, function(x) mvp(e, x)$VarP)
effF <- as.data.frame(cbind(list_e,loop))
minvar <- min(effF$loop)
L <- effF$loop==minvar
minret <- effF[L,]$list_e
minpoint <- as.data.frame(cbind(minret,minvar))
minvarwp <- mvp(e, min_e)$wp
rlist <- list(effF, minpoint, minvarwp)
names(rlist) <- c( "eFF", "minPoint", "wp")
return(rlist)
}
in the efficient frontier, all the 50 portfolios have same level of variance. can anyone tell me whats wrong with solver equation??? thanks.
I tried quadprog but couldnt solve it.
I'd like to aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2 using a SQL query with glue package. But when I try:
library(dplyr)
library(rgdal)
library(rgeos)
library(DBI)
library(glue)
# get AOI
download.file(
"https://github.com/Leprechault/trash/raw/main/stands_example.zip",
zip_path <- tempfile(fileext = ".zip")
)
unzip(zip_path, exdir = tempdir())
# Open the files
setwd(tempdir())
stands_ds <- read.csv("pred_target_stands.csv", sep=";") # Data set
stands_ds <- stands_ds %>%
mutate(DATA_S2 = ymd(DATA_S2))
stands_ds$CLASS<-c(rep("A",129),rep("B",130))
stands_ds$CD<-abs(rnorm(length(stands_ds[,1]),mean=50))
# Crete like a SQL server condition
bq_conn<- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbWriteTable(bq_conn, "stands_ds", stands_ds)
# Aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2
sqlInput_pred_FARM <- glue::glue_sql("SELECT * FROM stands_ds AVG(CD) GROUP BY CD_TALHAO = {x} AND ID_UNIQUE = {y} AND DATA_S2 = {z}",
x = "001G", y = "CERROCOROADO_001G",
z = "2021-04-02",.con=bq_conn)
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, as.character(sqlInput_pred_FARM, stringsAsFactors = T))
I always have Error: near "(": syntax error as output. Please, any help with it?
Sorry #KU99 but your solution doesn't return the mean values as I expected. Now, I try something new like an object creation for my mean operation and despite some ugly results with REPLICATE(DATE()), now works. The solution is:
# Aggregate CD mean by CD_TALHAO, ID_UNIQUE and DATA_S2
sqlInput_pred_FARM <- glue::glue_sql("SELECT REPLICATE(CD_TALHAO,1) AS TALHAO, REPLICATE(ID_UNIQUE,1) AS ID, REPLICATE(DATE(DATA_S2),1) AS DATE, AVG(CD) AS CD FROM stands_ds GROUP BY CD_TALHAO = {x},ID_UNIQUE = {y}, DATA_S2 = {z} ORDER BY CD_TALHAO = {x},ID_UNIQUE = {y}, DATA_S2 = {z}",
x = "001G", y = "CERROCOROADO_001G",
z = "2021-04-02",.con=bq_conn)
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, as.character(sqlInput_pred_FARM, stringsAsFactors = T))
pred_attack_BQ_FARM
# TALHAO ID DATE CD
#1 001C CERROCOROADO_001C -4661-02-24 49.93823
#2 001G CERROCOROADO_001G -4661-02-24 50.12102
Try the following:
statement <- "SELECT * FROM stands_ds AVG(CD)
GROUP BY CD_TALHAO = ? AND ID_UNIQUE = ? AND DATA_S2 = ?"
pars <- list("001G", "CERROCOROADO_001G","2021-04-02")
pred_attack_BQ_FARM <- dbGetQuery(bq_conn, statement, params = pars)
Using the plotly library, I made the following plot in R:
library(dplyr)
library(ggplot2)
library(plotly)
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,5,5))
df <- df %>% mutate(var3 = ifelse(var1 <= 5 & var2 <= 5, "a", ifelse(var1 <= 10 & var2 <= 10, "b", "c")))
plot = df %>%
ggplot() + geom_point(aes(x=var1, y= var2, color= var3))
ggplotly(plot)
This is a simple scatter plot - two random variables are generated, and then the colors of the points are decided by some criteria (e.g. if var1 and var2 are between certain ranges).
From here, I could also summary statistics:
df$var3 = as.factor(df$var3)
summary = df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
# A tibble: 3 x 4
var3 Mean_var1 Mean_var2 count
* <fct> <dbl> <dbl> <int>
1 a -1.70 0.946 158
2 b 4.68 4.94 260
3 c 15.8 6.49 582
My question: is it possible to add some buttons to this plot which would allow the user to color the points based on custom choices? E.g. something like this :
Now, the user can type in any range they want - and the color of the points change, and the some summary statistics are generated.
Can someone please show me how to do this in R?
I had this idea - first I would create this massive table that would create all possible range combinations of "var1" and "var2":
vec1 <- c(-20:40,1)
vec2 <- c(-20:40,1)
a <- expand.grid(vec1, vec2)
for (i in seq_along(vec1)) {
for (j in seq_along(vec2)) {
df <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & j <= 10, "b", "c")))
}
}
Then, depending on which ranges the user wants - an SQL style statement isolate the rows from this massive table corresponding to those ranges :
custom_df = df[df$var1 > -20 & df$var1 <10 & df$var1 > -20 & df$var2 <10 , ]
Then, an individual grap would be made for "custom_df" and summary statistics would also be recorded for "custom_df":
summary = custom_df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
But I am not sure how to neatly and efficiently do this in R.
Can someone please show me how to do this?
Thanks
I have built a small shiny app to perform most of your requirements. Based on your pre-defined large dataframe df, user can define the following:
Choose the minimum and maximum value for variables var1 and var2.
Choose criteria to define the variable var3, which is used to display different colors of data points. This is a range now.
Save plot as a HTML file.
Summary stats displayed as a table.
You can define further options to provide the user the option to choose color and so on. For that perhaps you should google on how to use scale_color_manual().
Update: Added user option to choose red and green color based on var1 and var2 range values.
library(shiny)
library(plotly)
library(dplyr)
library(DT)
### define a large df
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,15,15))
ui <- fluidPage(
titlePanel(p("My First Test App", style = "color:red")),
sidebarLayout(
sidebarPanel(
p("Choose Variable limits"),
# Horizontal line ----
tags$hr(),
uiOutput("var1a"), uiOutput("var1b"),
uiOutput("var2a"), uiOutput("var2b"),
uiOutput("criteria")
),
mainPanel(
DTOutput("summary"), br(),
plotlyOutput("plot"),
br(), br(), br(),
uiOutput("saveplotbtn")
)
)
)
server <- function(input, output, session){
output$var1a <- renderUI({
tagList(
numericInput("var11", "Variable 1 min",
min = min(df$var1), max = max(df$var1), value = min(df$var1))
)
})
output$var1b <- renderUI({
if (is.null(input$var11)){
low1 <- min(df$var1)
}else low1 <- max(min(df$var1),input$var11) ## cannot be lower than var 1 minimum
tagList(
numericInput("var12", "Variable 1 max", min = low1, max = max(df$var1), value = max(df$var1))
)
})
output$var2a <- renderUI({
tagList(
numericInput("var21", "Variable 2 min",
min = min(df$var2), max = max(df$var2), value = min(df$var2))
)
})
output$var2b <- renderUI({
if (is.null(input$var21)){
low2 <- min(df$var2)
}else low2 <- max(min(df$var2),input$var21) ## cannot be lower than var 2 minimum
tagList(
numericInput("var22", "Variable 2 max", min = low2, max = max(df$var2), value = max(df$var2))
)
})
output$criteria <- renderUI({
req(input$var11,input$var12,input$var21,input$var22)
tagList(
sliderInput("crit11", "Variable 1 red color range:",
min = -10, max = 0, value = c(-10,0)),
sliderInput("crit12", "Variable 2 red color range:",
min = -25, max = 0, value = c(-25,0)),
sliderInput("crit21", "Variable 1 green color range:",
min = 0.1, max = 10, value = c(0.1,10)),
sliderInput("crit22", "Variable 2 green color range:",
min = 0.1, max = 20, value = c(0.1,20))
)
})
dat <- reactive({
req(input$crit11,input$crit12,input$crit21,input$crit22)
df <- df %>% filter(between(var1, input$var11, input$var12)) %>%
filter(between(var2, input$var21, input$var22))
# df1 <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & var2 <= j , "b", "c")))
df1 <- df %>% mutate(var3 = ifelse(between(var1, input$crit11[1], input$crit11[2]) & between(var2, input$crit12[1], input$crit12[2]), "a",
ifelse(between(var1, input$crit21[1], input$crit21[2]) & between(var2, input$crit22[1], input$crit22[2]), "b", "c")))
})
summari <- reactive({
req(dat())
df1 <- dat()
df1$var3 = as.factor(df1$var3)
summary = df1 %>%
group_by(var3) %>%
dplyr::summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
})
output$summary <- renderDT(summari())
rv <- reactiveValues()
observe({
req(dat())
p <- ggplot(data=dat()) + geom_point(aes(x=var1, y= var2, color= var3))
pp <- ggplotly(p)
rv$plot <- pp
})
output$plot <- renderPlotly({
rv$plot
})
output$saveplotbtn <- renderUI({
div(style="display: block; padding: 5px 350px 5px 50px;",
downloadBttn("saveHTML",
HTML("HTML"),
style = "fill",
color = "default",
size = "lg",
block = TRUE,
no_outline = TRUE
) )
})
output$saveHTML <- downloadHandler(
filename = function() {
paste("myplot", Sys.Date(), ".html", sep = "")
},
content = function(file) {
htmlwidgets::saveWidget(as_widget(rv$plot), file, selfcontained = TRUE) ## self-contained
}
)
}
shinyApp(ui, server)
I'm running HMI on two level data (students in courses) with missing data at the student level. The code throws the following error (Error: C stack usage 7969776 is too close to the limit) when I include a binary value (gender) with missing data.
Below is a reproducible example.
library(MCMCglmm)
library(hmi)
library(mice)
df <- data.frame(post = rtnorm(100,60,20,0,100),
pre = rtnorm(100,40,20,0,100),
gender = rbinom(n=100, size=1, prob=0.20),
course = rep( c("A","B","C","D"), 100*c(0.1,0.2,0.65,0.05) ))
df$post[1:round((0.3)*length(df$post),digits = 0)] <- NA
df$gender[round((0.2)*length(df$post),digits = 0):round((0.5)*length(df$post),
digits = 0)] <- NA
hmi_test <- hmi(df, model_formula = post ~ 1 + pre + gender + (1|course ),
M = 2, maxit = 5, list_of_types = NULL, nitt = 3000, burnin = 1000)
list_of_types_maker(df)
To a 3 year old post
ggplot2: facet_wrap strip color based on variable in data set
Baptiste has given the following solution:
d <- data.frame(fruit = rep(c("apple", "orange", "plum", "banana", "pear", "grape")),
farm = rep(c(0,1,3,6,9,12), each=6),
weight = rnorm(36, 10000, 2500),
size=rep(c("small", "large")))
p1 = ggplot(data = d, aes(x = farm, y = weight)) +
geom_jitter(position = position_jitter(width = 0.3),
aes(color = factor(farm)), size = 2.5, alpha = 1) +
facet_wrap(~fruit)
dummy <- ggplot(data = d, aes(x = farm, y = weight))+ facet_wrap(~fruit) +
geom_rect(aes(fill=size), xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) +
theme_minimal()
library(gtable)
g1 <- ggplotGrob(p1)
g2 <- ggplotGrob(dummy)
gtable_select <- function (x, ...)
{
matches <- c(...)
x$layout <- x$layout[matches, , drop = FALSE]
x$grobs <- x$grobs[matches]
x
}
panels <- grepl(pattern="panel", g2$layout$name)
strips <- grepl(pattern="strip-t", g2$layout$name)
g2$layout$t[panels] <- g2$layout$t[panels] - 1
g2$layout$b[panels] <- g2$layout$b[panels] - 1
new_strips <- gtable_select(g2, panels | strips)
library(grid)
grid.newpage()
grid.draw(new_strips)
gtable_stack <- function(g1, g2){
g1$grobs <- c(g1$grobs, g2$grobs)
g1$layout <- transform(g1$layout, z= z-max(z), name="g2")
g1$layout <- rbind(g1$layout, g2$layout)
g1
}
## ideally you'd remove the old strips, for now they're just covered
new_plot <- gtable_stack(g1, new_strips)
grid.newpage()
grid.draw(new_plot)
(I have just updated the "strip-t" pattern and opened the grid library as it was suggested in the old post)
I repost this because it's an old brillant stuff and I want to use it myself for a presentation.
I'm a beginner in ggplot and this could also help me for various scripts.
Here are my questions :
- How is it possible to choose the color and not to give the same blue and red please? In my script, I have 3 colors to set, and I hope it can be less agressive. Is it possible to do it ?
- Another question, is it possible to integrate this in the legend, i.e to know what are this colors refering ?
Many thanks
you can change the strip colours with the fill scale in the dummy plot. Combining the legends is a bit tricky, but here's a starting point.
library(ggplot2)
library(gtable)
library(gridExtra)
library(grid)
gtable_stack <- function(g1, g2){
g1$grobs <- c(g1$grobs, g2$grobs)
g1$layout <- transform(g1$layout, z= z-max(z), name="g2")
g1$layout <- rbind(g1$layout, g2$layout)
g1
}
gtable_select <- function (x, ...)
{
matches <- c(...)
x$layout <- x$layout[matches, , drop = FALSE]
x$grobs <- x$grobs[matches]
x
}
d <- data.frame(fruit = rep(c("apple", "orange", "plum", "banana", "pear", "grape")),
farm = rep(c(0,1,3,6,9,12), each=6),
weight = rnorm(36, 10000, 2500),
size=rep(c("small", "large")))
p1 = ggplot(data = d, aes(x = farm, y = weight)) +
geom_jitter(position = position_jitter(width = 0.3),
aes(color = factor(farm)), size = 2.5, alpha = 1) +
facet_wrap(~fruit)
dummy <- ggplot(data = d, aes(x = farm, y = weight))+ facet_wrap(~fruit) +
geom_rect(aes(fill=size), xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) +
theme_minimal() + scale_fill_brewer(palette = "Pastel2")
g1 <- ggplotGrob(p1)
g2 <- ggplotGrob(dummy)
# extract legends
leg <- g1$grobs[[grep("guide-box", g1$layout$name)]]
dummy_leg <- g2$grobs[[grep("guide-box", g2$layout$name)]]
combined_leg <- rbind.gtable(leg, dummy_leg)
g1$grobs[[grep("guide-box", g1$layout$name)]] <- combined_leg
# move dummy panels one cell up
panels <- grepl(pattern="panel", g2$layout$name)
strips <- grepl(pattern="strip-t", g2$layout$name)
g2$layout$t[panels] <- g2$layout$t[panels] - 1
g2$layout$b[panels] <- g2$layout$b[panels] - 1
new_strips <- gtable_select(g2, panels | strips)
# stack new strips on top of gtable
# ideally you'd remove the old strips, for now they're just covered
new_plot <- gtable_stack(g1, new_strips)
grid.newpage()
grid.draw(new_plot)