In tidyr::pivot_wider, `values_fn = sum(.,na.rm=TRUE)` failed - tidyr

In tidyr::pivot_wider, values_fn = sum(.,na.rm=TRUE) failed ,how to handle it?
library(tidyverse)
test_data <- data.frame(
category=c('A','A','A','B','B','B'),
sub_category=c('a','b','b','a','b','b'),
amount=1:6
)
test_data %>% pivot_wider(names_from ='category',
values_from ='amount' ,
values_fn = sum(.,na.rm=TRUE))

You can make it a function to handle this:
library(tidyverse)
test_data <- data.frame(
category=c('A','A','A','B','B','B'),
sub_category=c('a','b','b','a','b','b'),
amount=1:6
)
test_data %>% pivot_wider(names_from ='category',
values_from ='amount' ,
values_fn = function(x) sum(x, na.rm = TRUE))
#> # A tibble: 2 x 3
#> sub_category A B
#> <chr> <int> <int>
#> 1 a 1 4
#> 2 b 5 11
The new syntax for making an anonymous function (\(x)) works too:
test_data %>% pivot_wider(names_from ='category',
values_from ='amount' ,
values_fn = \(x) sum(x, na.rm = TRUE))
#> # A tibble: 2 x 3
#> sub_category A B
#> <chr> <int> <int>
#> 1 a 1 4
#> 2 b 5 11
Created on 2022-03-25 by the reprex package (v2.0.1)

Related

How can plot my own data in a grid in a map sf but return vacum

I am trying to summarize some statistics in the grid that I made, however something fails when I try to do it.
This is my data
head(catk)
Simple feature collection with 6 features and 40 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 303.22 ymin: -61.43 xmax: 303.95 ymax: -60.78
Geodetic CRS: WGS 84
# A tibble: 6 × 41
X1 day month year c1_id greenweight_caught_kg obs_haul_id
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
1 1 4 12 1997 26529 7260 NA
2 2 4 12 1997 26530 7920 NA
3 3 4 12 1997 26531 4692 NA
4 4 4 12 1997 26532 5896 NA
5 5 4 12 1997 26533 88 NA
6 6 5 12 1997 26534 7040 NA
# … with 34 more variables: obs_logbook_id <lgl>, obs_haul_number <lgl>,
# haul_number <dbl>, vessel_name <chr>, vessel_nationality_code <chr>,
# fishing_purpose_code <chr>, season_ccamlr <dbl>,
# target_species <chr>, asd_code <dbl>, trawl_technique <lgl>,
# catchperiod_code <chr>, date_catchperiod_start <date>,
# datetime_set_start <dttm>, datetime_set_end <dttm>,
# datetime_haul_start <dttm>, datetime_haul_end <dttm>, …
and I did this raster
an <- getData("GADM", country = "ATA", level = 0)
an#data$NAME_0
e <- extent(-70,-40,-68,-60)
rc <- crop(an, e)
proj4string(rc) <- CRS("+init=epsg:4326")
rc3 <- st_as_sf(rc)
catk <- st_as_sf(catk, coords = c("Longitude", "Latitude"), crs = 4326) %>%
st_shift_longitude()
Grid <- rc3 %>%
st_make_grid(cellsize = c(1,0.4)) %>% # para que quede cuadrada
st_cast("MULTIPOLYGON") %>%
st_sf() %>%
mutate(cellid = row_number())
result <- Grid %>%
st_join(catk) %>%
group_by(cellid) %>%
summarise(sum_cat = sum(Catcht))
but I can not represent the data in the grid
ggplot() +
geom_sf(data = Grid, color="#d9d9d9", fill=NA) +
geom_sf(data = rc3) +
theme_bw() +
coord_sf() +
scale_alpha(guide="none")+
xlab(expression(paste(Longitude^o,~'O'))) +
ylab(expression(paste(Latitude^o,~'S')))+
guides( colour = guide_legend()) +
theme(panel.background = element_rect(fill = "#f7fbff"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position = "right")+
xlim(-69,-45)
fail plot
Please help me to find this solution!!
So I just saw that you shifted the coordinates with st_shift_longitude() and therefore your bounding box is:
Bounding box: xmin: 303.22 ymin: -61.43 xmax: 303.95 ymax: -60.78
Do you really need it? That doesn't match with your defined extent
e <- extent(-70,-40,-68,-60)
And a bbox for WGS84 is suppose to be at max c(-180, -90, 180, 90).
Also, on your plot you are not instructed ggplot2 to do anything with the values of catk. Grid and rc3 do not have anything from catk, is the result object.
Anyway, I give a try to your problem even though I don't have access to your dataset. I represent on each cell sum_cat from result
library(raster)
library(sf)
library(dplyr)
library(ggplot2)
# Mock your data
catk <- structure(list(Longitude = c(-59.0860203764828, -50.1352159580643,
-53.7671292009259, -67.9105254106185, -67.5753491797446, -51.7045571975837,
-45.6203830411619, -61.2695183762776, -51.6287384188695, -52.244074640978,
-45.4625770258213, -51.0935832496694, -45.6375681312716, -44.744215508174,
-66.3625310507564), Latitude = c(-62.0038884948778, -65.307178606448,
-65.8980199769778, -60.4475595973147, -67.7543165093134, -60.4616894158005,
-67.9720957068844, -62.2184680275876, -66.2345680342004, -64.1523442367459,
-62.5435163857161, -65.9127866479611, -66.8874734854608, -62.0859917484373,
-66.8762861503705), Catcht = c(18L, 95L, 32L, 40L, 16L, 49L,
22L, 14L, 86L, 45L, 3L, 51L, 45L, 41L, 19L)), row.names = c(NA,
-15L), class = "data.frame")
# Start the analysis
an <- getData("GADM", country = "ATA", level = 0)
e <- extent(-70,-40,-68,-60)
rc <- crop(an, e)
proj4string(rc) <- CRS("+init=epsg:4326")
rc3 <- st_as_sf(rc)
# Don't think you need st_shift_longitude, removed
catk <- st_as_sf(catk, coords = c("Longitude", "Latitude"), crs = 4326)
Grid <- rc3 %>%
st_make_grid(cellsize = c(1,0.4)) %>% # para que quede cuadrada
st_cast("MULTIPOLYGON") %>%
st_sf() %>%
mutate(cellid = row_number())
result <- Grid %>%
st_join(catk) %>%
group_by(cellid) %>%
summarise(sum_cat = sum(Catcht))
ggplot() +
geom_sf(data = Grid, color="#d9d9d9", fill=NA) +
# Add here results with my mock data by grid
geom_sf(data = result %>% filter(!is.na(sum_cat)), aes(fill=sum_cat)) +
geom_sf(data = rc3) +
theme_bw() +
coord_sf() +
scale_alpha(guide="none")+
xlab(expression(paste(Longitude^o,~'O'))) +
ylab(expression(paste(Latitude^o,~'S')))+
guides( colour = guide_legend()) +
theme(panel.background = element_rect(fill = "#f7fbff"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position = "right")+
xlim(-69,-45)
Created on 2022-03-23 by the reprex package (v2.0.1)

Transportation cost optimisation using OMPR for a large data set

I am solving a transport optimization problem given a set of constraints.
The following are the three key data sets that I have
#demand file
demand - has demand(DEMAND) across 4821(DPP) sale points(D)
head(demand)
D PP DEMAND DPP
1 ADILABAD (V) - T:11001 OPC:PACK 131.00 ADILABAD (V) - T:11001:OPC:PACK
2 ADILABAD (V) - T:13003 OPC:PACK 235.00 ADILABAD (V) - T:13003:OPC:PACK
3 ADILABAD (V) - T:2006 PPC:PACK 30.00 ADILABAD (V) - T:2006:PPC:PACK
4 ADILABAD (V) - T:4001 OPC:PACK 30.00 ADILABAD (V) - T:4001:OPC:PACK
5 ADILABAD (V) - T:7006 OPC:NPACK 34.84 ADILABAD (V) - T:7006:OPC:NPACK
6 AHMEDABAD:1001 OPC:PACK 442.10 AHMEDABAD:1001:OPC:PACK
#Capacity file
cc - has capacity constraint (MaxP, MinP) across 1823 sources(SOURCE)
head(cc,4)
SOURCE MinP MaxP
1 CHILAMKUR:P:OPC:NPACK:0:R 900 10806
2 CHILAMKUR:P:OPC:NPACK:0:W 900 10806
3 CHILAMKUR:P:OPC:PACK:0:R 5628 67536
4 CHILAMKUR:P:OPC:PACK:0:W 5628 67536
#LandingCost file
LCMat - This is a matrix with the landing cost to deliver the product across the demand location (DPP) from a given source(SOURCE). This is an 1823 x 4821 matrix. Since the landing costs to all locations do not exist from a given source, I have replace that with a huge cost (10^6) to such DPPs.
I am using the OMPR package in R to optimize shipping material to meet the demand.
This is potentially a very simple transport problem but it is taking a lot of time. I am using a 16GB ram machine
The following is the code. Could anyone guide me on what I should do better?
a = Sys.time()
grid = expand.grid(i = 1:nrow(LCMat),j = 1:ncol(LCMat))
grid_solve = grid[which(LCMat < 10^6),]
grid_notsolve = grid[which(LCMat >= 10^6),]
model <- MILPModel() %>%
add_variable(x[grid$i, grid$j],lb = 0, type = "continuous") %>%
add_constraint(x[grid_notsolve$i, grid_notsolve$j] == 0) %>%
add_constraint(sum_over(x[i,j], i = 1:nrow(LCMat)) <= demand$DEMAND[j], j = 1:ncol(LCMat)) %>%
add_constraint(sum_over(x[i,j], j = 1:ncol(LCMat)) <= cc$MaxP[i], i = 1:nrow(LCMat)) %>%
add_constraint(sum_over(x[i,j], j = 1:ncol(LCMat)) >= cc$MinP[i], i = 1:nrow(LCMat)) %>%
set_objective(sum_expr(LCMat[grid_solve$i,grid_solve$j]*x[grid_solve$i,grid_solve$j]),"min")
solution = model %>% solve_model(with_ROI(solver = "glpk", verbose = TRUE))
Sys.time() - a
Two options to potentially speed things up:
Make sure you use the latest CRAN versions of ompr and listcomp.
Try to use filter conditions to only create/use variables that are relevant to the model, instead of adding all nrow(LCMat)*ncol(LCMat) variables and then setting (potentially) a lot of them to 0. See the code below for an example. Depending on how sparse your problem is that could help as well.
The following code takes a sparse matrix (i.e. a matrix with many 0 elements or 10^6 elements in your case) and only generates x[i,j] variables that have an entry in sparse_matrix which is greater than 0. It hopefully illustrates how to use that feature and apply it to your case.
library(ompr)
sparse_matrix <- matrix(
c(
1, 0, 0, 1,
0, 1, 0, 1,
0, 0, 0, 1,
1, 0, 0, 0
), byrow = TRUE, ncol = 4
)
is_connected <- function(i, j) {
sparse_matrix[i, j] > 0
}
n <- nrow(sparse_matrix)
m <- ncol(sparse_matrix)
model <- MIPModel() |>
add_variable(x[i, j], i = 1:n, j = 1:m, is_connected(i, j)) |>
set_objective(sum_over(x[i, j], i = 1:n, j = 1:m, is_connected(i, j))) |>
add_constraint(sum_over(x[i, j], i = 1:n, is_connected(i, j)) <= 1, j = 1:m)
variable_keys(model)
#> [1] "x[1,1]" "x[1,4]" "x[2,2]" "x[2,4]" "x[3,4]" "x[4,1]"
extract_constraints(model)
#> $matrix
#> 3 x 6 sparse Matrix of class "dgCMatrix"
#>
#> [1,] 1 . . . . 1
#> [2,] . . 1 . . .
#> [3,] . 1 . 1 1 .
#>
#> $sense
#> [1] "<=" "<=" "<="
#>
#> $rhs
#> [1] 1 1 1
Created on 2022-03-12 by the reprex package (v2.0.1)
Both OMPR and GLPK are slow for large models.
You are duplicating sum_over(x[i,j], j = 1:ncol(LCMat)). That leads to more nonzero elements than needed. I usually try to prevent that (even at the expense of more variables).

Can I make a function that makes a dataframe like this using loops? (follow up question)

Thank you for your interest in this question.
I have the data as below.
a<- data.frame("Grade"=c(1, 2, 3, 4), "Prob"=c(0.01, 0.25, 0.45, 0.29))
b<- data.frame("Pot"= c(letters[1:18]))
Based on the codes below, I'd like to make a function that can loop 4 Grade numbers based on the Prob probability (replace=TRUE) and four random letters with the same probability (replace=FALSE). For instance, this loop may look like below:
3 2 3 2 d f k g
1 3 4 2 a k r b
I'd like to make a function that can compute not only the results in which the Grades result is only lower than 3, and the four alphabets that I selected appear, but the number of trials to get this result. So, if I want Pot to have "a", "b", "c", and "d" the result will look like:
Trial Grade Pot
15 3 2 1 3 a b c d
39 2 1 2 2 d b a c
2 3 3 3 3 d a b d
77 3 2 3 3 c d b a
I could learn the below code thanks to a very kind person, but I can't edit it to get the results I hope to see. Can you please help me?
samplefun <- function(a) {
c <- sample(a$Grade, size=4, prob=a$Prob, replace=TRUE)
res <- tibble(
Trial = which(c < 3)[1],
Result = c[which(c < 3)[1]]
)
nsamples <- 1000
x<-map_dfr(1:nsamples, ~ samplefun(a))
Thank you for reading this question.
Here's a solution to what I think you're after. I haven't specified a probability vector when sampling b$Pot, because you didn't give one that was 18 elements long in your question (see my comment).
library(tidyverse)
a<- data.frame(Grade =c(1, 2, 3, 4), Prob = c(0.01, 0.25, 0.45, 0.29))
b<- data.frame(Pot = letters[1:18])
chosenletters <- c("a", "b", "c", "d")
samplefun <- function(a, b, chosenletters) {
ntrials <- 0
repeat {
grades <- sample(a$Grade, size = 4, prob = a$Prob, replace = T)
chars <- sample(b$Pot, size = 4, replace = F)
ntrials <- ntrials + 1
if (all(grades < 4) & all(chars %in% chosenletters)) {break}
}
return( tibble(Trial = ntrials, Grade = list(grades), Letters = list(chars)) )
}
nsamples <- 5
res <- map_dfr(1:nsamples, ~ samplefun(a, b, chosenletters))
This dataframe res gives the correct Grades and Letters embedded in lists inside each dataframe cell, plus the trial at which the result was generated.
# A tibble: 5 x 3
Trial Grade Letters
<dbl> <list> <list>
1 20863 <dbl [4]> <fct [4]>
2 8755 <dbl [4]> <fct [4]>
3 15129 <dbl [4]> <fct [4]>
4 1033 <dbl [4]> <fct [4]>
5 5264 <dbl [4]> <fct [4]>
A better view of the nested lists:
> glimpse(res)
Rows: 5
Columns: 3
$ Trial <dbl> 20863, 8755, 15129, 1033, 5264
$ Grade <list> <3, 3, 3, 3>, <3, 2, 2, 2>, <3, 3, 2, 2>, <3, 3, 2, 3>, <3, 2, 3, 3>
$ Letters <list> <b, a, c, d>, <b, a, c, d>, <c, a, b, d>, <b, d, c, a>, <a, b, d, c>

SettingWithCopyError when scaling subset of columns with StandardScaler

I have a dataframe df with 100 columns:
index | col1 | col2 | col3 | ...
2021-04-01 | qwe | 1 | 1.1 | ...
2021-04-02 | asd | 2 | 2.2 | ...
2021-04-03 | yxc | 3 | 3.3 | ...
dtypes:
col1: category
col2: int32
col3: float64
I want to scale all columns that are not of type "category" AND return it as a dataframe, not a numpy array.
My code so far:
y_feature = "col2"
y = df[[y_feature]] # Set predictor y
X = df.drop(
[
y_feature,
],
axis=1,
)
days = (
pd.date_range(start=df.index.min(), end=df.index.max())
.to_frame(name="date")
.reset_index()
.drop("index", axis=1)
)
limit_training_days = int(len(days.index) * 0.85)
X_train_limit = days.iloc[limit_training_days, 0]
print(f"Date for training: {X_train_limit}")
X_train, y_train = (
X.query("date <= #X_train_limit").squeeze(),
y.query("date <= #X_train_limit").squeeze(),
)
X_test, y_test = (
X.query("date > #X_train_limit").squeeze(),
y.query("date > #X_train_limit").squeeze(),
)
categorical_feature = X_train.select_dtypes("category").columns.tolist()
num_cols = X.drop(categorical_feature, axis=1).columns.tolist()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test_sc[num_cols] = scaler.transform(X_test[num_cols])
After updating my packages it now throws this error for the last 2 lines of code:
SettingWithCopyError: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
See the caveats in the documentation:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
How can I scale only non-category columns (keeping category columns untouched) AND return it as a dataframe?
The problem is you trying to change X_train and X_test which are parts of a bigger dataframe. Try:
X_train, X_test = X_train.copy(), X_test.copy()
before scaling.
You can also do:
X_train, y_train = (
X.query("date <= #X_train_limit").squeeze().copy(), # here
y.query("date <= #X_train_limit").squeeze(),
)

Interactive Plots in R

Using the plotly library, I made the following plot in R:
library(dplyr)
library(ggplot2)
library(plotly)
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,5,5))
df <- df %>% mutate(var3 = ifelse(var1 <= 5 & var2 <= 5, "a", ifelse(var1 <= 10 & var2 <= 10, "b", "c")))
plot = df %>%
ggplot() + geom_point(aes(x=var1, y= var2, color= var3))
ggplotly(plot)
This is a simple scatter plot - two random variables are generated, and then the colors of the points are decided by some criteria (e.g. if var1 and var2 are between certain ranges).
From here, I could also summary statistics:
df$var3 = as.factor(df$var3)
summary = df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
# A tibble: 3 x 4
var3 Mean_var1 Mean_var2 count
* <fct> <dbl> <dbl> <int>
1 a -1.70 0.946 158
2 b 4.68 4.94 260
3 c 15.8 6.49 582
My question: is it possible to add some buttons to this plot which would allow the user to color the points based on custom choices? E.g. something like this :
Now, the user can type in any range they want - and the color of the points change, and the some summary statistics are generated.
Can someone please show me how to do this in R?
I had this idea - first I would create this massive table that would create all possible range combinations of "var1" and "var2":
vec1 <- c(-20:40,1)
vec2 <- c(-20:40,1)
a <- expand.grid(vec1, vec2)
for (i in seq_along(vec1)) {
for (j in seq_along(vec2)) {
df <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & j <= 10, "b", "c")))
}
}
Then, depending on which ranges the user wants - an SQL style statement isolate the rows from this massive table corresponding to those ranges :
custom_df = df[df$var1 > -20 & df$var1 <10 & df$var1 > -20 & df$var2 <10 , ]
Then, an individual grap would be made for "custom_df" and summary statistics would also be recorded for "custom_df":
summary = custom_df %>%
group_by(var3) %>%
summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
But I am not sure how to neatly and efficiently do this in R.
Can someone please show me how to do this?
Thanks
I have built a small shiny app to perform most of your requirements. Based on your pre-defined large dataframe df, user can define the following:
Choose the minimum and maximum value for variables var1 and var2.
Choose criteria to define the variable var3, which is used to display different colors of data points. This is a range now.
Save plot as a HTML file.
Summary stats displayed as a table.
You can define further options to provide the user the option to choose color and so on. For that perhaps you should google on how to use scale_color_manual().
Update: Added user option to choose red and green color based on var1 and var2 range values.
library(shiny)
library(plotly)
library(dplyr)
library(DT)
### define a large df
set.seed(123)
df <- data.frame(var1 = rnorm(1000,10,10),
var2 = rnorm(1000,15,15))
ui <- fluidPage(
titlePanel(p("My First Test App", style = "color:red")),
sidebarLayout(
sidebarPanel(
p("Choose Variable limits"),
# Horizontal line ----
tags$hr(),
uiOutput("var1a"), uiOutput("var1b"),
uiOutput("var2a"), uiOutput("var2b"),
uiOutput("criteria")
),
mainPanel(
DTOutput("summary"), br(),
plotlyOutput("plot"),
br(), br(), br(),
uiOutput("saveplotbtn")
)
)
)
server <- function(input, output, session){
output$var1a <- renderUI({
tagList(
numericInput("var11", "Variable 1 min",
min = min(df$var1), max = max(df$var1), value = min(df$var1))
)
})
output$var1b <- renderUI({
if (is.null(input$var11)){
low1 <- min(df$var1)
}else low1 <- max(min(df$var1),input$var11) ## cannot be lower than var 1 minimum
tagList(
numericInput("var12", "Variable 1 max", min = low1, max = max(df$var1), value = max(df$var1))
)
})
output$var2a <- renderUI({
tagList(
numericInput("var21", "Variable 2 min",
min = min(df$var2), max = max(df$var2), value = min(df$var2))
)
})
output$var2b <- renderUI({
if (is.null(input$var21)){
low2 <- min(df$var2)
}else low2 <- max(min(df$var2),input$var21) ## cannot be lower than var 2 minimum
tagList(
numericInput("var22", "Variable 2 max", min = low2, max = max(df$var2), value = max(df$var2))
)
})
output$criteria <- renderUI({
req(input$var11,input$var12,input$var21,input$var22)
tagList(
sliderInput("crit11", "Variable 1 red color range:",
min = -10, max = 0, value = c(-10,0)),
sliderInput("crit12", "Variable 2 red color range:",
min = -25, max = 0, value = c(-25,0)),
sliderInput("crit21", "Variable 1 green color range:",
min = 0.1, max = 10, value = c(0.1,10)),
sliderInput("crit22", "Variable 2 green color range:",
min = 0.1, max = 20, value = c(0.1,20))
)
})
dat <- reactive({
req(input$crit11,input$crit12,input$crit21,input$crit22)
df <- df %>% filter(between(var1, input$var11, input$var12)) %>%
filter(between(var2, input$var21, input$var22))
# df1 <- df %>% mutate(var3 = ifelse(var1 <= i & var2 <= i, "a", ifelse(var1 <= j & var2 <= j , "b", "c")))
df1 <- df %>% mutate(var3 = ifelse(between(var1, input$crit11[1], input$crit11[2]) & between(var2, input$crit12[1], input$crit12[2]), "a",
ifelse(between(var1, input$crit21[1], input$crit21[2]) & between(var2, input$crit22[1], input$crit22[2]), "b", "c")))
})
summari <- reactive({
req(dat())
df1 <- dat()
df1$var3 = as.factor(df1$var3)
summary = df1 %>%
group_by(var3) %>%
dplyr::summarize(Mean_var1 = mean(var1), Mean_var2 = mean(var2), count=n())
})
output$summary <- renderDT(summari())
rv <- reactiveValues()
observe({
req(dat())
p <- ggplot(data=dat()) + geom_point(aes(x=var1, y= var2, color= var3))
pp <- ggplotly(p)
rv$plot <- pp
})
output$plot <- renderPlotly({
rv$plot
})
output$saveplotbtn <- renderUI({
div(style="display: block; padding: 5px 350px 5px 50px;",
downloadBttn("saveHTML",
HTML("HTML"),
style = "fill",
color = "default",
size = "lg",
block = TRUE,
no_outline = TRUE
) )
})
output$saveHTML <- downloadHandler(
filename = function() {
paste("myplot", Sys.Date(), ".html", sep = "")
},
content = function(file) {
htmlwidgets::saveWidget(as_widget(rv$plot), file, selfcontained = TRUE) ## self-contained
}
)
}
shinyApp(ui, server)