I'm scraping data from the web to do my own little side project. My goal is to go through every single page (divided by month and year) and take each day and how many calls were made. I start with a blank data frame, and append the dates and number of calls from each month as I go on.
The problem is... when I bind the rows together, the dates turn into numbers instead. How do I fix this? (If I run the code for just one column, the dates are in the proper form.)
The issue will be in the last part of the code, but I provided the other stuff for reference.
#Scrapes dates from the police call map
#ex: "Tuesday, Sept. 30"
get_date <- function(url,year){
read_html(url) %>%
html_nodes('.bull li a') %>%
html_text() %>%
str_trim() %>%
unlist() %>%
#adds the current year to the end
paste(year)
}
#Scrapes number of calls for each day.
#ex: "262 calls for service"
get_num_calls <- function(url){
read_html(url) %>%
html_nodes('.bull b') %>%
html_text() %>%
str_trim() %>%
unlist() %>%
#Parse number is here to eliminate the "calls for service" text
parse_number()
}
#This function converts date to proper POSIXct format.
convert <- function(date,month,year){
if (month %in% abbr) {
parse_date_time(date, orders = "%A, %b. %d %Y")
} else if (month %in% full) {
parse_date_time(date, orders = "%A, %B. %d %Y")
} else {
#September is not in an accepted format, so it needs to be changed.
date <- sub("Sept.", "Sep.", date)
parse_date_time(date, orders = "%A, %b. %d %Y")
}
}
#years and months to cycle through the URLs
years <- as.character(2009:2018)
months <- c("jan", "feb","mar","apr", "may", "jun",
"jul", "aug", "sep", "oct", "nov", "dec")
#months that are abbreviated in the data.
abbr <- c("jan", "feb", "aug", "oct", "nov", "dec")
#months that are NOT abbreviated in the data.
full <- c("mar", "april", "may", "jun", "jul")
#url_base will have "month/year/" added to the end in the loop
url_base <- "http://projects.registerguard.com/police/eugene/"
#No matter which of these I use, it doesn't work properly.
Police <- c()
Police <- data.frame(date = as.POSIXct(date(), origin = lubridate::origin),
calls = integer())
for (year in years){
for (month in months){
url <- paste(url_base, year, '/', month, '/', sep = "")
date <- get_date(url,year)
date <- convert(date,month,year)
calls <- get_num_calls(url)
new <- cbind(date, calls)
Police <- rbdf(Police, new)
}
}
Related
I have a task to construct the efficient frontier using 25 portfolios (monthly data). I tired writing a quadprog code for calculating minimum variance portfolio weights for a given expected rate of return. However, regardless of the expected return, the solver values give me the same set weights and variance, which the global minimum variance portfolio. I found the answer using an analytical solution. Attached are the codes:
basedf <- read.csv("test.csv", header = TRUE, sep = ",")
data <- basedf[,2:26]
ret <- as.data.frame(colMeans(data))
variance <- diag(var(data))
covmat <-as.matrix(var(data))
###minimum variance portfolio calculation
Q <- 2*cov(data)
A <- rbind(rep(1,25))
a <- 1
result <- solve.QP(Dmat = Q,
dvec = rep(0,25),
Amat = t(A),
bvec = a,
meq = 1)
w <-result$solution
w
var <- result$value
var
sum(w)
this is another set of codes giving the me same value::
mvp <- function(e,ep){
Dmat <- 2*cov(e)
dvec <- rep(0, ncol(e))
Amat <- cbind(rep(1, ncol(e)), colMeans(e))
bvec <- c(1, ep)
result <- solve.QP(Dmat = Dmat, dvec = dvec, Amat = Amat, bvec = bvec, meq=1)
wp <- result$solution
varP <- result$value
ret_values <- list(wp, varP)
names(ret_values) <- c("wp", "VarP")
return(ret_values)
}
z <- mvp(data, -.005)
z$wp
sum(z$wp)
z$VarP
ef <- function(e, min_e, max_e){
list_e <- seq(min_e,max_e, length=50)
loop <- sapply(list_e, function(x) mvp(e, x)$VarP)
effF <- as.data.frame(cbind(list_e,loop))
minvar <- min(effF$loop)
L <- effF$loop==minvar
minret <- effF[L,]$list_e
minpoint <- as.data.frame(cbind(minret,minvar))
minvarwp <- mvp(e, min_e)$wp
rlist <- list(effF, minpoint, minvarwp)
names(rlist) <- c( "eFF", "minPoint", "wp")
return(rlist)
}
in the efficient frontier, all the 50 portfolios have same level of variance. can anyone tell me whats wrong with solver equation??? thanks.
I tried quadprog but couldnt solve it.
In R/ggplot2 ,when I use geom_bar(stat='identity',position='fill'),
the 'sales' tip show '0.80000',how to change it to '80.0%' ?
(I know mutate a new variable use scales::percent(sales),can work in geom_point)
library(tidyverse)
library(plotly)
test_data <- data.frame(category=c('A','B','A','B'),
sub_category=c('a1','b1','a2','b2'),
sales=c(1,2,4,5))
p <- test_data %>%
ggplot(aes(x=category,y=sales,
fill=sub_category))+
geom_bar(stat='identity',position='fill')
ggplotly(p)
One option (and perhaps the easiest one) would be to compute your percentages manually instead of making use of position = "fill" and create the tooltip manually via the text aesthetic which makes it easy to style the numbers, ... as you like:
library(plotly)
test_data <- data.frame(
category = c("A", "B", "A", "B"),
sub_category = c("a1", "b1", "a2", "b2"),
sales = c(1, 2, 4, 5)
)
test_data <- test_data %>%
group_by(category) %>%
mutate(pct = sales / sum(sales))
p <- test_data %>%
ggplot(aes(x = category, y = pct, fill = sub_category)) +
geom_col(aes(text = paste0(
"category: ", category, "<br>",
"sub_category: ", sub_category, "<br>",
"sales: ", scales::percent(pct)
)))
ggplotly(p, tooltip = "text")
total_amount <- 1000000
df <- data.frame("price"= c(226,186,456,615,549),
"firms"= c("VRSN","TXN","DPZ","IDXX","ORLY"))
FUN <- function(q, price=df$price){
total <- sum(price * q)
errs <- c( (total-total_amount)^2, ( ( q[1]*price)/sum(q[1]*price+q[2]*price+q[3]*price+q[4]*price+q[5]*price) +
q[2]*price/sum(q[1]*price+q[2]*price+q[3]*price+q[4]*price+q[5]*price) +
q[3]*price/sum(q[1]*price+q[2]*price+q[3]*price+q[4]*price+q[5]*price)
+ q[4]*price/sum(q[1]*price+q[2]*price+q[3]*price+q[4]*price+q[5]*price)
+ q[5]*price/sum(q[1]*price+q[2]*price+q[3]*price+q[4]*price+q[5]*price) ))
sum(errs)
}
init_q <- rep(1,nrow(df))
res <- optim(init_q, FUN, lower=rep(0,nrow(df)), method="L-BFGS-B")
res
res$par
round(res$par)
sum(round(res$par)*df$price) - total_amount
dt <- df$firms %>% as.data.frame()
dt$lot <- round(res$par,0)
dt$price <- df$price
dt$dnm <- dt$lot*dt$price/sum(dt$lot*dt$price)
dt
I wrote a code that minimize the total amount of money that I had, however I also want to be able to assign weights to stocks and make them have shares as integer.
How could I do that ?
Using the plotly library, I made the following plot in R:
library(dplyr)
library(ggplot2)
library(plotly)
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,5,5))
df <- df %>% mutate(var3 = ifelse(var1 <= 5 & var2 <= 5, "a", ifelse(var1 <= 10 & var2 <= 10, "b", "c")))
plot = df %>%
ggplot() + geom_point(aes(x=var1, y= var2, color= var3))
ggplotly(plot)
This is a simple scatter plot - two random variables are generated, and then the colors of the points are decided by some criteria (e.g. if var1 and var2 are between certain ranges).
From here, I could also summary statistics:
df$var3 = as.factor(df$var3)
summary = df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
# A tibble: 3 x 4
var3 Mean_var1 Mean_var2 count
* <fct> <dbl> <dbl> <int>
1 a -1.70 0.946 158
2 b 4.68 4.94 260
3 c 15.8 6.49 582
My question: is it possible to add some buttons to this plot which would allow the user to color the points based on custom choices? E.g. something like this :
Now, the user can type in any range they want - and the color of the points change, and the some summary statistics are generated.
Can someone please show me how to do this in R?
I had this idea - first I would create this massive table that would create all possible range combinations of "var1" and "var2":
vec1 <- c(-20:40,1)
vec2 <- c(-20:40,1)
a <- expand.grid(vec1, vec2)
for (i in seq_along(vec1)) {
for (j in seq_along(vec2)) {
df <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & j <= 10, "b", "c")))
}
}
Then, depending on which ranges the user wants - an SQL style statement isolate the rows from this massive table corresponding to those ranges :
custom_df = df[df$var1 > -20 & df$var1 <10 & df$var1 > -20 & df$var2 <10 , ]
Then, an individual grap would be made for "custom_df" and summary statistics would also be recorded for "custom_df":
summary = custom_df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
But I am not sure how to neatly and efficiently do this in R.
Can someone please show me how to do this?
Thanks
I have built a small shiny app to perform most of your requirements. Based on your pre-defined large dataframe df, user can define the following:
Choose the minimum and maximum value for variables var1 and var2.
Choose criteria to define the variable var3, which is used to display different colors of data points. This is a range now.
Save plot as a HTML file.
Summary stats displayed as a table.
You can define further options to provide the user the option to choose color and so on. For that perhaps you should google on how to use scale_color_manual().
Update: Added user option to choose red and green color based on var1 and var2 range values.
library(shiny)
library(plotly)
library(dplyr)
library(DT)
### define a large df
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,15,15))
ui <- fluidPage(
titlePanel(p("My First Test App", style = "color:red")),
sidebarLayout(
sidebarPanel(
p("Choose Variable limits"),
# Horizontal line ----
tags$hr(),
uiOutput("var1a"), uiOutput("var1b"),
uiOutput("var2a"), uiOutput("var2b"),
uiOutput("criteria")
),
mainPanel(
DTOutput("summary"), br(),
plotlyOutput("plot"),
br(), br(), br(),
uiOutput("saveplotbtn")
)
)
)
server <- function(input, output, session){
output$var1a <- renderUI({
tagList(
numericInput("var11", "Variable 1 min",
min = min(df$var1), max = max(df$var1), value = min(df$var1))
)
})
output$var1b <- renderUI({
if (is.null(input$var11)){
low1 <- min(df$var1)
}else low1 <- max(min(df$var1),input$var11) ## cannot be lower than var 1 minimum
tagList(
numericInput("var12", "Variable 1 max", min = low1, max = max(df$var1), value = max(df$var1))
)
})
output$var2a <- renderUI({
tagList(
numericInput("var21", "Variable 2 min",
min = min(df$var2), max = max(df$var2), value = min(df$var2))
)
})
output$var2b <- renderUI({
if (is.null(input$var21)){
low2 <- min(df$var2)
}else low2 <- max(min(df$var2),input$var21) ## cannot be lower than var 2 minimum
tagList(
numericInput("var22", "Variable 2 max", min = low2, max = max(df$var2), value = max(df$var2))
)
})
output$criteria <- renderUI({
req(input$var11,input$var12,input$var21,input$var22)
tagList(
sliderInput("crit11", "Variable 1 red color range:",
min = -10, max = 0, value = c(-10,0)),
sliderInput("crit12", "Variable 2 red color range:",
min = -25, max = 0, value = c(-25,0)),
sliderInput("crit21", "Variable 1 green color range:",
min = 0.1, max = 10, value = c(0.1,10)),
sliderInput("crit22", "Variable 2 green color range:",
min = 0.1, max = 20, value = c(0.1,20))
)
})
dat <- reactive({
req(input$crit11,input$crit12,input$crit21,input$crit22)
df <- df %>% filter(between(var1, input$var11, input$var12)) %>%
filter(between(var2, input$var21, input$var22))
# df1 <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & var2 <= j , "b", "c")))
df1 <- df %>% mutate(var3 = ifelse(between(var1, input$crit11[1], input$crit11[2]) & between(var2, input$crit12[1], input$crit12[2]), "a",
ifelse(between(var1, input$crit21[1], input$crit21[2]) & between(var2, input$crit22[1], input$crit22[2]), "b", "c")))
})
summari <- reactive({
req(dat())
df1 <- dat()
df1$var3 = as.factor(df1$var3)
summary = df1 %>%
group_by(var3) %>%
dplyr::summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
})
output$summary <- renderDT(summari())
rv <- reactiveValues()
observe({
req(dat())
p <- ggplot(data=dat()) + geom_point(aes(x=var1, y= var2, color= var3))
pp <- ggplotly(p)
rv$plot <- pp
})
output$plot <- renderPlotly({
rv$plot
})
output$saveplotbtn <- renderUI({
div(style="display: block; padding: 5px 350px 5px 50px;",
downloadBttn("saveHTML",
HTML("HTML"),
style = "fill",
color = "default",
size = "lg",
block = TRUE,
no_outline = TRUE
) )
})
output$saveHTML <- downloadHandler(
filename = function() {
paste("myplot", Sys.Date(), ".html", sep = "")
},
content = function(file) {
htmlwidgets::saveWidget(as_widget(rv$plot), file, selfcontained = TRUE) ## self-contained
}
)
}
shinyApp(ui, server)
#setwd('Desktop/IE332')
install.packages("wakefield")
install.packages("RMySQL")
install.packages("randomNames")
install.packages('password')
install.packages('OpenRepGrid')
library(RMySQL)
library(password)
library(wakefield)
library(randomNames)
industriesData <- read.csv('Industries.csv')
skills <- read.csv('Skills.csv')
sp500 <- read.csv("http://www.princeton.edu/~otorres/sandp500.csv")
companies <- sample(sp500$Name, 100)
locations <- c('Northwest', 'Midwest', 'Northeast', 'South', 'Southwest', 'Southeast',
'International') # Locations
gpas <- c(4,3.5,3,2.5,2)
n <- 100
locPrefs <- numeric(n)
studentSkills <- matrix(nrow=100,ncol=10)
studentInd <- matrix(nrow=100,ncol=5)
jobSkills <- matrix(nrow=100,ncol=5)
for(j in 1:n){ # Samples random skills assigned to students
studentSkills[j,] <- sample(skills[,1],10,replace=FALSE)
studentInd[j,] <- sample(industriesData[,1],5,replace=FALSE)
jobSkills[j,] <- sample(skills[,1],5,replace=FALSE)
}
studentData <- data.frame('first names'=randomNames(n, which.names = 'first'),'last
names'=randomNames(n, which.names = 'last'),'username'=seq(1,
n),'password'=password(8,numbers=TRUE),'gpa'=gpa(n, mean = 85.356, sd = 3.2, name =
"GPA"),'visa'=sample(c("N","Y"), size = n, replace = TRUE, prob = c(.78, .22)), 'loc
pref'=sample(locations,n,replace = TRUE), 'skill'=studentSkills, 'Industry'=studentInd) # Student data
employerData <- data.frame('company names'=companies, 'pref
gpa'=sample(gpas,n,replace=TRUE), 'sponser?'=sample(c('N','Y'), size=n, replace = TRUE, prob
= c(.78, .22)), 'job id'=sample(seq(100,999),n,replace=FALSE),'pref skill'=jobSkills,
'industry'=sample(industriesData[,1],n,replace=TRUE),'location'=sample(locations,n,replace =
TRUE)) # Employer data
I am trying to send certain columns of the studentData and employerData to tables in SQL, how would i go about doing that? I have a table named students where I would like to upload the first and last names of the studentsData data frame into this SQL table.